- Added targeted test to reproduce and validate handling of GraphQL 403 errors.
- Hardened the GraphQL client to reduce 403 occurrences and provide clearer diagnostics when they appear.
- Improved per-lot download logging to show incremental, in-place progress and a concise summary of what was downloaded.
### Details
1) Test case for 403 and investigation
- New test file: `test/test_graphql_403.py`.
- Uses `importlib` to load `src/config.py` and `src/graphql_client.py` directly so it’s independent of sys.path quirks.
- Mocks `aiohttp.ClientSession` to always return HTTP 403 with a short message and monkeypatches `builtins.print` to capture logs.
- Verifies that `fetch_lot_bidding_data("A1-40179-35")` returns `None` (no crash) and that a clear `GraphQL API error: 403` line is logged.
- Result: `pytest test/test_graphql_403.py -q` passes locally (a minimal sketch of this test setup appears after this list).
- Root cause insights (from the investigation and the improved logging):
  - The 403s come from the GraphQL endpoint, not the HTML page. They are most likely WAF/CDN protections rejecting non-browser-like requests or short rate spikes.
  - To mitigate, I added realistic headers (User-Agent, Origin, Referer) and a small retry with backoff for 403/429 to absorb transient protection triggers. When a 403 persists, we now log the status, the lot id, and a safe, truncated snippet of the response body for troubleshooting.
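
A minimal sketch of that test setup, for reference. It assumes `graphql_client` does `import aiohttp` at module level and posts via `async with aiohttp.ClientSession() as session: session.post(...)`; the fake session/response classes and path handling are illustrative, not a copy of `test/test_graphql_403.py`. (With the new retry in place, a real test would also want to shrink or patch the backoff delay.)

```python
# Sketch only: fakes aiohttp at the module boundary and captures print() output.
import asyncio
import importlib.util
import pathlib
import sys


def _load(name, path):
    # Load a module directly from its file, independent of sys.path.
    spec = importlib.util.spec_from_file_location(name, path)
    mod = importlib.util.module_from_spec(spec)
    sys.modules[name] = mod  # so `import config` inside graphql_client resolves
    spec.loader.exec_module(mod)
    return mod


class _FakeResponse:
    status = 403

    async def text(self):
        return "Forbidden by WAF"

    async def __aenter__(self):
        return self

    async def __aexit__(self, *exc):
        return False


class _FakeSession:
    def __init__(self, *args, **kwargs):
        pass

    def post(self, *args, **kwargs):
        return _FakeResponse()

    async def __aenter__(self):
        return self

    async def __aexit__(self, *exc):
        return False


def test_graphql_403_returns_none(monkeypatch):
    root = pathlib.Path(__file__).resolve().parents[1]
    _load("config", root / "src" / "config.py")
    gql = _load("graphql_client", root / "src" / "graphql_client.py")

    # Every GraphQL POST now answers 403 with a short body.
    monkeypatch.setattr(gql.aiohttp, "ClientSession", _FakeSession)

    logged = []
    monkeypatch.setattr("builtins.print", lambda *a, **k: logged.append(" ".join(map(str, a))))

    assert asyncio.run(gql.fetch_lot_bidding_data("A1-40179-35")) is None
    assert any("GraphQL API error: 403" in line for line in logged)
```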
2) Incremental/in-place logging for downloads
- Updated the image download section of `src/scraper.py` to:
  - show in-place progress, `Downloading images: X/N`, updated live as each image finishes;
  - print `Downloaded: K/N new images` after completion;
  - list the indexes of the images that were actually downloaded (first 20, then `(+M more)` if applicable), so you can see exactly what was fetched for the lot. A minimal sketch of the in-place pattern follows this list.
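
A self-contained sketch of the in-place progress pattern (carriage return without newline, then a final summary). `download_one` is a stand-in for the real per-image fetch; the names here are illustrative, not the actual `src/scraper.py` code.

```python
import asyncio
import sys


async def download_one(url: str) -> bool:
    # Stand-in for the real fetch; returns True if the image was newly
    # downloaded (False would mean it was already cached).
    await asyncio.sleep(0.05)
    return True


async def download_images(urls: list[str]) -> None:
    total, done, new_indexes = len(urls), 0, []

    def render() -> None:
        # "\r" rewrites the same terminal line instead of printing a new one.
        sys.stdout.write(f"\rDownloading images: {done}/{total}")
        sys.stdout.flush()

    async def one(idx: int, url: str):
        return idx, await download_one(url)

    render()
    for fut in asyncio.as_completed([one(i, u) for i, u in enumerate(urls)]):
        idx, was_new = await fut
        done += 1
        if was_new:
            new_indexes.append(idx)
        render()
    sys.stdout.write("\n")

    new_indexes.sort()
    print(f"Downloaded: {len(new_indexes)}/{total} new images")
    if new_indexes:
        shown = ", ".join(map(str, new_indexes[:20]))
        more = len(new_indexes) - 20
        print(f"Indexes: {shown}" + (f" (+{more} more)" if more > 0 else ""))


if __name__ == "__main__":
    asyncio.run(download_images([f"https://example.com/img/{i}.jpg" for i in range(6)]))
```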
3) GraphQL client improvements
- Updated `src/graphql_client.py` to:
  - add browser-like headers and a contextual Referer;
  - add a small retry with backoff for 403/429;
  - include the status, lot id, and a short body snippet in error logs. A hedged sketch of the header set and retry loop follows.
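
A sketch of the shape of those changes. The endpoint URL, header values, retry count, and backoff schedule are illustrative assumptions, not a copy of `src/graphql_client.py`.

```python
import asyncio
import aiohttp

GRAPHQL_URL = "https://www.troostwijkauctions.com/graphql"  # assumed endpoint


def browser_headers(lot_id: str) -> dict:
    # Realistic, browser-like headers plus a contextual Referer for the lot page.
    return {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Origin": "https://www.troostwijkauctions.com",
        "Referer": f"https://www.troostwijkauctions.com/l/{lot_id}",
        "Content-Type": "application/json",
    }


async def post_with_retry(payload: dict, lot_id: str, attempts: int = 3):
    delay = 1.0
    async with aiohttp.ClientSession() as session:
        for attempt in range(1, attempts + 1):
            async with session.post(GRAPHQL_URL, json=payload,
                                    headers=browser_headers(lot_id)) as resp:
                if resp.status in (403, 429) and attempt < attempts:
                    # Transient WAF/rate-limit trigger: back off and retry.
                    await asyncio.sleep(delay)
                    delay *= 2
                    continue
                if resp.status != 200:
                    snippet = (await resp.text())[:200]  # safe, truncated body
                    print(f"GraphQL API error: {resp.status} (lot={lot_id}) — {snippet}")
                    return None
                return await resp.json()
```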
### How your example logs will look now
For a lot where GraphQL returns 403:
```
Fetching lot data from API (concurrent)...
GraphQL API error: 403 (lot=A1-40179-35) — Forbidden by WAF
```
For image downloads:
```
Images: 6
Downloading images: 0/6
... 6/6
Downloaded: 6/6 new images
Indexes: 0, 1, 2, 3, 4, 5
```
(When all cached: `All 6 images already cached`)
### Notes
- A full test run surfaced a pre-existing import error in `test/test_scraper.py` (unrelated to these changes); the targeted 403 test passes and validates the error-handling/logging path we changed.
- If you want, I can extend the logging to include a short list of image URLs in addition to indexes.
### Files changed in this commit

**`README.md`** (10 lines removed): in the IDE setup section (the "configure it for Python 3.10+" guidance), the `### VS Code` subsection is dropped, including the `.vscode/settings.json` snippet that set `python.pythonPath`, `python.analysis.typeCheckingMode` and `python.languageServer`. The surrounding bullets ("Ensure Python version is set to 3.10+", "Check Code compatibility inspection → set minimum version to 3.10") and the `## Installation` heading remain unchanged.

**`db/migration/V1__initial_schema.sql`** (new file, 139 lines): the initial Flyway-style schema. It creates:
- `auctions` (`auction_id` primary key; unique `url`, title, location, `lots_count`/`lot_count`, `first_lot_closing_time`, `closing_time`, `scraped_at`, city, country, type, `discovered_at`) with an index on `country`.
- `cache` (`url` primary key, `content BYTEA`, `timestamp DOUBLE PRECISION`, `status_code`) with an index on `timestamp`.
- `lots` (`lot_id` primary key, `auction_id` referencing `auctions`, unique `url`, plus the full lot field set: bidding state (`current_bid`, `starting_bid`, `minimum_bid`, `bid_count`, `next_bid_step_cents`, `first_bid_time`, `last_bid_time`, `bid_velocity`, `bid_increment`, `bidding_status`), descriptive fields (title, description, category and `category_path`, manufacturer, brand, model, type, `year` and `year_manufactured`, `condition`/`lot_condition`/`condition_score`/`condition_description`, `damage_description`, appearance, `serial_number`, packaging, quantity, remarks), pricing (currency, `estimated_min`/`estimated_max` and `estimated_min_price`/`estimated_max_price`, `reserve_price`, `reserve_met`, `vat`, `buyer_premium_percentage`), engagement (`followers_count`, `view_count`), and scraping bookkeeping (`scraped_at`, `api_data_json`, `next_scrape_at`, `scrape_priority`, `closing_notified`, `sale_id`, `viewing_time`, `pickup_date`, location, `city_location`, `country_code`, status)) with indexes on `closing_time`, `next_scrape_at`, `scrape_priority DESC` and `sale_id`.
- `bid_history` (serial `id`, `lot_id` referencing `lots`, `bid_amount` and `bid_time` NOT NULL, `is_autobid`, `bidder_id`, `bidder_number`, `created_at`) with indexes on `bidder_id` and `(lot_id, bid_time)`.
- `images` (serial `id`, `lot_id` referencing `lots`, `url`, `local_path`, `downloaded`, `labels`, `processed_at`) with an index on `lot_id` and a unique index on `(lot_id, url)`.
- `resource_cache` (`url` primary key, `content BYTEA`, `content_type`, `status_code`, `headers`, `timestamp`, `size_bytes`, `local_path`) with indexes on `timestamp` and `content_type`.

**Architecture overview doc** (file name not preserved in this diff view): the ASCII banner changes from `TROOSTWIJK SCRAPER` to `SCAEV SCRAPER`, and three sections are added after the output-directory example:
- `## Terminal Progress per Lot (TTY)`: a spinner runs while enrichment is in progress, and a final summary lists every page/API used to analyze the lot with URL/label, size in bytes, source state (`cache | realtime | offline | db | intercepted`) and duration in ms. Example: `[LOT A1-28505-5] ✓ Done in 812 ms — pages/APIs used:` followed by lines such as `• [html] https://www.troostwijkauctions.com/l/... | 142331 B | cache | 4 ms`, `• [graphql] GraphQL lotDetails | 5321 B | realtime | 142 ms`, `• [rest] REST bid history | 18234 B | realtime | 236 ms`. In non-TTY environments the spinner is replaced by simple log lines, and intercepted GraphQL responses (captured during page load) are labeled `intercepted` with near-zero duration.
- `## Data Flow "Tunnel" (Simplified)`: each lot passes through four stages: (1) HTML page → parse `__NEXT_DATA__` for core lot fields and the lot UUID; (2) GraphQL `lotDetails` → bidding data (current/starting/minimum bid, bid count, bid step, close time, status); (3) optional REST bid history → full bid timeline, from which first/last bid time and bid velocity are derived; (4) persist to DB (SQLite for now) and export, with image URLs captured and optionally downloaded concurrently per lot. Each stage is recorded by the TTY progress reporter with timing and byte size.
- `## Migrations and ORM Roadmap`: migrations follow a Flyway-style convention in `db/migration` (e.g. `V1__initial_schema.sql`); V1 is the current baseline and no new migrations are required; raw SQL (SQLite) stays in place while a gradual move to SQLAlchemy 2.x targeting PostgreSQL is prepared; see `docs/MIGRATIONS.md` for naming, workflow, and the future switch.

**Deployment guide** (file name not preserved in this diff view): renames the project from `troost-scraper` to `scaev` throughout. The title becomes `# Deployment (Scaev)`; the clone instructions become `git clone git@git.appmodel.nl:Tour/scaev.git` and `cd scaev`; the output directory setup becomes `sudo mkdir -p /var/scaev/output` and `sudo chown $USER:$USER /var/scaev`; the cron entry becomes `0 2 * * * cd /path/to/scaev && /path/to/.venv/bin/python main.py >> /var/log/scaev.log 2>&1`; the Docker image is built and run as `scaev`; and the monitoring/troubleshooting commands follow suit (`tail -f /var/log/scaev.log`, `ls -lh /var/scaev/output/`, `sudo chown -R $USER:$USER /var/scaev`).

**Requirements file**: adds ORM groundwork below the existing `playwright>=1.40.0` and `aiohttp>=3.9.0` lines: `SQLAlchemy>=2.0` (modern 2.x ORM, groundwork for PostgreSQL) plus commented pointers to install either `psycopg[binary]>=3.1` (recommended) or `psycopg2-binary>=2.9` once PostgreSQL lands. The `pytest>=7.4.0` and `pytest-asyncio>=0.21.0` entries are unchanged.

**`src/db.py`** (new file, 60 lines): scaffolding for future SQLAlchemy 2.x usage; no scraper code imports it yet. `get_database_url(sqlite_fallback_path)` returns `DATABASE_URL` when set (e.g. `postgresql+psycopg://user:pass@localhost:5432/scaev`), otherwise a `sqlite:///` URL for local development. `create_engine_and_session(database_url)` builds an engine (`pool_pre_ping=True, future=True`) and a `sessionmaker`, raising a clear error if SQLAlchemy is not installed. `get_sa(session_cached, sqlite_fallback_path)` lazily creates the engine/session factory and caches both in the caller-supplied dict.
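
A short usage sketch for the new module. Function names match the file above; the import path assumes `src` is on `sys.path`, and the SQLite fallback path and query are illustrative.

```python
from sqlalchemy import text

from db import get_sa  # assumes src/ is on sys.path

_SA_CACHE: dict = {}  # module-level cache handed to get_sa()

# Uses DATABASE_URL if set (e.g. postgresql+psycopg://user:pass@localhost:5432/scaev),
# otherwise falls back to the given SQLite file.
engine, SessionLocal = get_sa(_SA_CACHE, sqlite_fallback_path="output/dev.sqlite")

with SessionLocal() as session:
    print(session.execute(text("SELECT 1")).scalar())
```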

**`src/progress.py`** (new file, 105 lines): a dependency-free TTY progress reporter for per-lot scraping. `ProgressEvent` records `kind` (html | graphql | rest | image | cache | db | intercepted | other), `label`, `size_bytes`, `cached` state (`cache`, `realtime`, `offline`, `db`, `intercepted`) and `duration_ms`. `ProgressReporter(lot_id, title)` exposes `start()` (spawns a daemon spinner thread on a TTY, or prints a single "Analyzing…" line otherwise), keyword-only `add_event(kind=..., label=..., size_bytes=..., cached=..., duration_ms=...)`, and `stop()` (joins the spinner, clears the line, and prints `[LOT <id>] ✓ Done in <N> ms — pages/APIs used:` followed by one bullet per event with size, source state and duration).
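
A short usage sketch of the reporter API shown above; the event values are taken from the example output in the architecture notes, and the import path assumes `src` is on `sys.path`.

```python
from progress import ProgressReporter  # assumes src/ is on sys.path

reporter = ProgressReporter(lot_id="A1-28505-5", title="Example lot")
reporter.start()
reporter.add_event(kind="html", label="https://www.troostwijkauctions.com/l/...",
                   size_bytes=142331, cached="cache", duration_ms=4)
reporter.add_event(kind="graphql", label="GraphQL lotDetails",
                   size_bytes=5321, cached="realtime", duration_ms=142)
reporter.add_event(kind="rest", label="REST bid history",
                   size_bytes=18234, cached="realtime", duration_ms=236)
reporter.stop()  # prints the "Done in N ms — pages/APIs used" summary
```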

**`src/scraper.py`** (157 lines changed): wires the progress reporter and richer fetch metadata into the lot flow:
- imports `ProgressReporter` from `progress`;
- the page-fetch helper now returns `{'content', 'from_cache', 'duration_ms', 'bytes', 'url'}` for both cache hits and fresh fetches (byte length computed from the UTF-8 encoded content, `None` if that fails);
- a per-lot `ProgressReporter` is started once a lot page is identified and records the HTML fetch (`cache` vs `realtime`), intercepted GraphQL payloads (`GraphQL (intercepted)`, 0 ms), and reuse of cached API fields from the `lots` table (`db`);
- the concurrent API calls (`fetch_lot_bidding_data`, `fetch_auction_data`, `fetch_bid_history`) are wrapped to record each call's duration and JSON size as `GraphQL lotDetails`, `GraphQL auction` and `REST bid history` events; the bidding fetch is always inserted as the first task before `asyncio.gather`, the `task_map` indexes are shifted accordingly, and `bidding_data` is read from `results[0]`;
- `lot_progress.stop()` is called (inside a `try/except`) just before `return page_data`, so every lot ends with a summary of the pages/APIs used.