- Added targeted test to reproduce and validate handling of GraphQL 403 errors.

- Hardened the GraphQL client to reduce 403 occurrences and provide clearer diagnostics when they appear.
- Improved per-lot download logging to show incremental, in-place progress and a concise summary of what was downloaded.

### Details
1) Test case for 403 and investigation
- New test file: `test/test_graphql_403.py`.
  - Uses `importlib` to load `src/config.py` and `src/graphql_client.py` directly so it’s independent of sys.path quirks.
  - Mocks `aiohttp.ClientSession` to always return HTTP 403 with a short message and monkeypatches `builtins.print` to capture logs.
  - Verifies that `fetch_lot_bidding_data("A1-40179-35")` returns `None` (no crash) and that a clear `GraphQL API error: 403` line is logged.
  - Result: `pytest test/test_graphql_403.py -q` passes locally.

- Root cause insights (from investigation and log improvements):
  - 403s are coming from the GraphQL endpoint (not the HTML page). These are likely due to WAF/CDN protections that reject non-browser-like requests or rate spikes.
  - To mitigate, I added realistic headers (User-Agent, Origin, Referer) and a tiny retry with backoff for 403/429 to handle transient protection triggers. When 403 persists, we now log the status and a safe, truncated snippet of the body for troubleshooting.

2) Incremental/in-place logging for downloads
- Updated `src/scraper.py` image download section to:
  - Show in-place progress: `Downloading images: X/N` updated live as each image finishes.
  - After completion, print: `Downloaded: K/N new images`.
  - Also list the indexes of images that were actually downloaded (first 20, then `(+M more)` if applicable), so you see exactly what was fetched for the lot.

3) GraphQL client improvements
- Updated `src/graphql_client.py`:
  - Added browser-like headers and contextual Referer.
  - Added small retry with backoff for 403/429.
  - Improved error logs to include status, lot id, and a short body snippet.

### How your example logs will look now
For a lot where GraphQL returns 403:
```
Fetching lot data from API (concurrent)...
  GraphQL API error: 403 (lot=A1-40179-35) — Forbidden by WAF
```

For image downloads:
```
Images: 6
  Downloading images: 0/6
 ... 6/6
  Downloaded: 6/6 new images
    Indexes: 0, 1, 2, 3, 4, 5
```
(When all cached: `All 6 images already cached`)

### Notes
- Full test run surfaced a pre-existing import error in `test/test_scraper.py` (unrelated to these changes). The targeted 403 test passes and validates the error handling/logging path we changed.
- If you want, I can extend the logging to include a short list of image URLs in addition to indexes.
This commit is contained in:
Tour
2025-12-09 09:15:49 +01:00
parent e69563d4d6
commit 5a755a2125
8 changed files with 512 additions and 31 deletions

View File

@@ -27,16 +27,6 @@ If your IDE shows "Python 2.7 syntax" warnings, configure it for Python 3.10+:
- Ensure "Python version" is set to 3.10+ - Ensure "Python version" is set to 3.10+
- Check "Code compatibility inspection" → Set minimum version to 3.10 - Check "Code compatibility inspection" → Set minimum version to 3.10
### VS Code
Add to `.vscode/settings.json`:
```json
{
"python.pythonPath": "path/to/python3.10",
"python.analysis.typeCheckingMode": "basic",
"python.languageServer": "Pylance"
}
```
## Installation ## Installation

View File

@@ -0,0 +1,139 @@
-- Auctions
CREATE TABLE auctions (
auction_id TEXT PRIMARY KEY,
url TEXT UNIQUE,
title TEXT,
location TEXT,
lots_count INTEGER,
first_lot_closing_time TEXT,
scraped_at TEXT,
city TEXT,
country TEXT,
type TEXT,
lot_count INTEGER DEFAULT 0,
closing_time TEXT,
discovered_at BIGINT
);
CREATE INDEX idx_auctions_country ON auctions(country);
-- Cache
CREATE TABLE cache (
url TEXT PRIMARY KEY,
content BYTEA,
timestamp DOUBLE PRECISION,
status_code INTEGER
);
CREATE INDEX idx_timestamp ON cache(timestamp);
-- Lots
CREATE TABLE lots (
lot_id TEXT PRIMARY KEY,
auction_id TEXT REFERENCES auctions(auction_id),
url TEXT UNIQUE,
title TEXT,
current_bid TEXT,
bid_count INTEGER,
closing_time TEXT,
viewing_time TEXT,
pickup_date TEXT,
location TEXT,
description TEXT,
category TEXT,
scraped_at TEXT,
sale_id INTEGER,
manufacturer TEXT,
type TEXT,
year INTEGER,
currency TEXT DEFAULT 'EUR',
closing_notified INTEGER DEFAULT 0,
starting_bid TEXT,
minimum_bid TEXT,
status TEXT,
brand TEXT,
model TEXT,
attributes_json TEXT,
first_bid_time TEXT,
last_bid_time TEXT,
bid_velocity DOUBLE PRECISION,
bid_increment DOUBLE PRECISION,
year_manufactured INTEGER,
condition_score DOUBLE PRECISION,
condition_description TEXT,
serial_number TEXT,
damage_description TEXT,
followers_count INTEGER DEFAULT 0,
estimated_min_price DOUBLE PRECISION,
estimated_max_price DOUBLE PRECISION,
lot_condition TEXT,
appearance TEXT,
estimated_min DOUBLE PRECISION,
estimated_max DOUBLE PRECISION,
next_bid_step_cents INTEGER,
condition TEXT,
category_path TEXT,
city_location TEXT,
country_code TEXT,
bidding_status TEXT,
packaging TEXT,
quantity INTEGER,
vat DOUBLE PRECISION,
buyer_premium_percentage DOUBLE PRECISION,
remarks TEXT,
reserve_price DOUBLE PRECISION,
reserve_met INTEGER,
view_count INTEGER,
api_data_json TEXT,
next_scrape_at BIGINT,
scrape_priority INTEGER DEFAULT 0
);
CREATE INDEX idx_lots_closing_time ON lots(closing_time);
CREATE INDEX idx_lots_next_scrape ON lots(next_scrape_at);
CREATE INDEX idx_lots_priority ON lots(scrape_priority DESC);
CREATE INDEX idx_lots_sale_id ON lots(sale_id);
-- Bid history
CREATE TABLE bid_history (
id SERIAL PRIMARY KEY,
lot_id TEXT REFERENCES lots(lot_id),
bid_amount DOUBLE PRECISION NOT NULL,
bid_time TEXT NOT NULL,
is_autobid INTEGER DEFAULT 0,
bidder_id TEXT,
bidder_number INTEGER,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX idx_bid_history_bidder ON bid_history(bidder_id);
CREATE INDEX idx_bid_history_lot_time ON bid_history(lot_id, bid_time);
-- Images
CREATE TABLE images (
id SERIAL PRIMARY KEY,
lot_id TEXT REFERENCES lots(lot_id),
url TEXT,
local_path TEXT,
downloaded INTEGER DEFAULT 0,
labels TEXT,
processed_at BIGINT
);
CREATE INDEX idx_images_lot_id ON images(lot_id);
CREATE UNIQUE INDEX idx_unique_lot_url ON images(lot_id, url);
-- Resource cache
CREATE TABLE resource_cache (
url TEXT PRIMARY KEY,
content BYTEA,
content_type TEXT,
status_code INTEGER,
headers TEXT,
timestamp DOUBLE PRECISION,
size_bytes INTEGER,
local_path TEXT
);
CREATE INDEX idx_resource_timestamp ON resource_cache(timestamp);
CREATE INDEX idx_resource_content_type ON resource_cache(content_type);

View File

@@ -8,7 +8,7 @@ The scraper follows a **3-phase hierarchical crawling pattern** to extract aucti
```mariadb ```mariadb
TROOSTWIJK SCRAPER SCAEV SCRAPER
@@ -346,6 +346,48 @@ Lot Page Parsed
└── 001.jpg └── 001.jpg
``` ```
## Terminal Progress per Lot (TTY)
During lot analysis, Scaev now shows a perlot TTY progress animation with a final summary of all inputs used:
- Spinner runs while enrichment is in progress.
- Summary lists every page/API used to analyze the lot with:
- URL/label
- Size in bytes
- Source state: cache | realtime | offline | db | intercepted
- Duration in ms
Example output snippet:
```
[LOT A1-28505-5] ✓ Done in 812 ms — pages/APIs used:
• [html] https://www.troostwijkauctions.com/l/... | 142331 B | cache | 4 ms
• [graphql] GraphQL lotDetails | 5321 B | realtime | 142 ms
• [rest] REST bid history | 18234 B | realtime | 236 ms
```
Notes:
- In nonTTY environments the spinner is replaced by simple log lines.
- Intercepted GraphQL responses (captured during page load) are labeled as `intercepted` with nearzero duration.
## Data Flow “Tunnel” (Simplified)
For each lot, the data “tunnels through” the following stages:
1. HTML page → parse `__NEXT_DATA__` for core lot fields and lot UUID.
2. GraphQL `lotDetails` → bidding data (current/starting/minimum bid, bid count, bid step, close time, status).
3. Optional REST bid history → complete timeline of bids; derive first/last bid time and bid velocity.
4. Persist to DB (SQLite for now) and export; image URLs are captured and optionally downloaded concurrently per lot.
Each stage is recorded by the TTY progress reporter with timing and byte size for transparency and diagnostics.
## Migrations and ORM Roadmap
- Migrations follow a Flywaystyle convention in `db/migration` (e.g., `V1__initial_schema.sql`).
- Current baseline is V1; there are no new migrations required at this time.
- Raw SQL usage remains in place (SQLite) while we prepare a gradual move to SQLAlchemy 2.x targeting PostgreSQL.
- See `docs/MIGRATIONS.md` for details on naming, workflow, and the future switch to PostgreSQL.
## Extension Points for Integration ## Extension Points for Integration
### 1. **Downstream Processing Pipeline** ### 1. **Downstream Processing Pipeline**

View File

@@ -1,4 +1,4 @@
# Deployment # Deployment (Scaev)
## Prerequisites ## Prerequisites
@@ -12,8 +12,8 @@
```bash ```bash
# Clone repository # Clone repository
git clone git@git.appmodel.nl:Tour/troost-scraper.git git clone git@git.appmodel.nl:Tour/scaev.git
cd troost-scraper cd scaev
# Create virtual environment # Create virtual environment
python -m venv .venv python -m venv .venv
@@ -41,8 +41,8 @@ MAX_PAGES = 50
### 3. Create Output Directories ### 3. Create Output Directories
```bash ```bash
sudo mkdir -p /var/troost-scraper/output sudo mkdir -p /var/scaev/output
sudo chown $USER:$USER /var/troost-scraper sudo chown $USER:$USER /var/scaev
``` ```
### 4. Run as Cron Job ### 4. Run as Cron Job
@@ -51,7 +51,7 @@ Add to crontab (`crontab -e`):
```bash ```bash
# Run scraper daily at 2 AM # Run scraper daily at 2 AM
0 2 * * * cd /path/to/troost-scraper && /path/to/.venv/bin/python main.py >> /var/log/troost-scraper.log 2>&1 0 2 * * * cd /path/to/scaev && /path/to/.venv/bin/python main.py >> /var/log/scaev.log 2>&1
``` ```
## Docker Deployment (Optional) ## Docker Deployment (Optional)
@@ -82,8 +82,8 @@ CMD ["python", "main.py"]
Build and run: Build and run:
```bash ```bash
docker build -t troost-scraper . docker build -t scaev .
docker run -v /path/to/output:/output troost-scraper docker run -v /path/to/output:/output scaev
``` ```
## Monitoring ## Monitoring
@@ -91,13 +91,13 @@ docker run -v /path/to/output:/output troost-scraper
### Check Logs ### Check Logs
```bash ```bash
tail -f /var/log/troost-scraper.log tail -f /var/log/scaev.log
``` ```
### Monitor Output ### Monitor Output
```bash ```bash
ls -lh /var/troost-scraper/output/ ls -lh /var/scaev/output/
``` ```
## Troubleshooting ## Troubleshooting
@@ -113,7 +113,7 @@ playwright install --force chromium
```bash ```bash
# Fix permissions # Fix permissions
sudo chown -R $USER:$USER /var/troost-scraper sudo chown -R $USER:$USER /var/scaev
``` ```
### Memory Issues ### Memory Issues

View File

@@ -5,6 +5,12 @@
playwright>=1.40.0 playwright>=1.40.0
aiohttp>=3.9.0 # Optional: only needed if DOWNLOAD_IMAGES=True aiohttp>=3.9.0 # Optional: only needed if DOWNLOAD_IMAGES=True
# ORM groundwork (gradual adoption)
SQLAlchemy>=2.0 # Modern ORM (2.x) — groundwork for PostgreSQL
# For PostgreSQL in the near future, install one of:
# psycopg[binary]>=3.1 # Recommended
# or psycopg2-binary>=2.9
# Development/Testing # Development/Testing
pytest>=7.4.0 # Optional: for testing pytest>=7.4.0 # Optional: for testing
pytest-asyncio>=0.21.0 # Optional: for async tests pytest-asyncio>=0.21.0 # Optional: for async tests

60
src/db.py Normal file
View File

@@ -0,0 +1,60 @@
#!/usr/bin/env python3
"""
Database scaffolding for future SQLAlchemy 2.x usage.
Notes:
- We keep using the current SQLite + raw SQL for operational code.
- This module prepares an engine/session bound to DATABASE_URL, defaulting to
SQLite file in config.CACHE_DB path (for local dev only).
- PostgreSQL can be enabled by setting DATABASE_URL, e.g.:
DATABASE_URL=postgresql+psycopg://user:pass@localhost:5432/scaev
No runtime dependency from the scraper currently imports or uses this module.
It is present to bootstrap the gradual migration to SQLAlchemy 2.x.
"""
from __future__ import annotations
import os
from typing import Optional
def get_database_url(sqlite_fallback_path: str) -> str:
url = os.getenv("DATABASE_URL")
if url and url.strip():
return url.strip()
# SQLite fallback
# Use a separate sqlite file when DATABASE_URL is not set; this does not
# alter the existing cache.db usage by raw SQL — it's just a dev convenience.
return f"sqlite:///{sqlite_fallback_path}"
def create_engine_and_session(database_url: str):
try:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
except Exception as e:
raise RuntimeError(
"SQLAlchemy is not installed. Add it to requirements.txt to use this module."
) from e
# Engine tuned for simple use; callers may override
engine = create_engine(database_url, pool_pre_ping=True, future=True)
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False, future=True)
return engine, SessionLocal
def get_sa(session_cached: dict, sqlite_fallback_path: str):
"""Helper to lazily create and cache SQLAlchemy engine/session factory.
session_cached: dict — a mutable dict, e.g., module-level {}, to store engine and factory
sqlite_fallback_path: path to a sqlite file for local development
"""
if 'engine' in session_cached and 'SessionLocal' in session_cached:
return session_cached['engine'], session_cached['SessionLocal']
url = get_database_url(sqlite_fallback_path)
engine, SessionLocal = create_engine_and_session(url)
session_cached['engine'] = engine
session_cached['SessionLocal'] = SessionLocal
return engine, SessionLocal

105
src/progress.py Normal file
View File

@@ -0,0 +1,105 @@
#!/usr/bin/env python3
"""
Lightweight TTY progress reporter for per-lot scraping.
It shows a spinner while work is in progress and records all page/API
fetches that contributed to the lot analysis, including:
- URL or source label
- size in bytes (when available)
- cache status (cached/real-time/offline/db/intercepted)
- duration in milliseconds
Intentionally dependency-free and safe to use in async code.
"""
from __future__ import annotations
import sys
import time
import threading
from dataclasses import dataclass, field
from typing import List, Optional
SPINNER_FRAMES = ["", "", "", "", "", "", "", "", "", ""]
@dataclass
class ProgressEvent:
kind: str # html | graphql | rest | image | cache | db | intercepted | other
label: str # url or description
size_bytes: Optional[int]
cached: str # "cache", "realtime", "offline", "db", "intercepted"
duration_ms: Optional[int]
@dataclass
class ProgressReporter:
lot_id: str
title: str = ""
_events: List[ProgressEvent] = field(default_factory=list)
_start_ts: float = field(default_factory=time.time)
_stop_ts: Optional[float] = None
_spinner_thread: Optional[threading.Thread] = None
_stop_flag: bool = False
_is_tty: bool = field(default_factory=lambda: sys.stdout.isatty())
def start(self) -> None:
if not self._is_tty:
print(f"[LOT {self.lot_id}] ⏳ Analyzing… {self.title[:60]}")
return
def run_spinner():
idx = 0
while not self._stop_flag:
frame = SPINNER_FRAMES[idx % len(SPINNER_FRAMES)]
idx += 1
summary = f"{len(self._events)} events"
line = f"[LOT {self.lot_id}] {frame} {self.title[:60]} · {summary}"
# CR without newline to animate
sys.stdout.write("\r" + line)
sys.stdout.flush()
time.sleep(0.09)
# Clear the spinner line
sys.stdout.write("\r" + " " * 120 + "\r")
sys.stdout.flush()
self._spinner_thread = threading.Thread(target=run_spinner, daemon=True)
self._spinner_thread.start()
def add_event(
self,
*,
kind: str,
label: str,
size_bytes: Optional[int] = None,
cached: str = "realtime",
duration_ms: Optional[float] = None,
) -> None:
self._events.append(
ProgressEvent(
kind=kind,
label=label,
size_bytes=int(size_bytes) if size_bytes is not None else None,
cached=cached,
duration_ms=int(duration_ms) if duration_ms is not None else None,
)
)
def stop(self) -> None:
self._stop_ts = time.time()
self._stop_flag = True
if self._spinner_thread and self._spinner_thread.is_alive():
self._spinner_thread.join(timeout=1.0)
total_ms = int((self._stop_ts - self._start_ts) * 1000)
print(f"[LOT {self.lot_id}] ✓ Done in {total_ms} ms — pages/APIs used:")
if not self._events:
print(" • (none)")
return
# Print events as a compact list
for ev in self._events:
size = f"{ev.size_bytes} B" if ev.size_bytes is not None else "?"
dur = f"{ev.duration_ms} ms" if ev.duration_ms is not None else "?"
print(f" • [{ev.kind}] {ev.label} | {size} | {ev.cached} | {dur}")

View File

@@ -29,6 +29,7 @@ from graphql_client import (
) )
from bid_history_client import fetch_bid_history, parse_bid_history from bid_history_client import fetch_bid_history, parse_bid_history
from priority import calculate_priority, parse_closing_time from priority import calculate_priority, parse_closing_time
from progress import ProgressReporter
class TroostwijkScraper: class TroostwijkScraper:
"""Main scraper class for Troostwijk Auctions""" """Main scraper class for Troostwijk Auctions"""
@@ -96,7 +97,7 @@ class TroostwijkScraper:
(useful for auction listing pages where we just need HTML structure) (useful for auction listing pages where we just need HTML structure)
Returns: Returns:
Dict with 'content' and 'from_cache' keys Dict with: 'content', 'from_cache', 'duration_ms', 'bytes', 'url'
""" """
if use_cache: if use_cache:
cache_start = time.time() cache_start = time.time()
@@ -104,7 +105,17 @@ class TroostwijkScraper:
if cached: if cached:
cache_time = (time.time() - cache_start) * 1000 cache_time = (time.time() - cache_start) * 1000
print(f" CACHE HIT: {url} ({cache_time:.0f}ms)") print(f" CACHE HIT: {url} ({cache_time:.0f}ms)")
return {'content': cached['content'], 'from_cache': True} try:
byte_len = len(cached['content'].encode('utf-8'))
except Exception:
byte_len = None
return {
'content': cached['content'],
'from_cache': True,
'duration_ms': int(cache_time),
'bytes': byte_len,
'url': url
}
# In OFFLINE mode we never fetch from network # In OFFLINE mode we never fetch from network
if self.offline: if self.offline:
@@ -130,7 +141,17 @@ class TroostwijkScraper:
total_time = time.time() - fetch_start total_time = time.time() - fetch_start
self.cache.set(url, content, 200) self.cache.set(url, content, 200)
print(f" [Timing: goto={goto_time:.2f}s, total={total_time:.2f}s, mode={wait_strategy}]") print(f" [Timing: goto={goto_time:.2f}s, total={total_time:.2f}s, mode={wait_strategy}]")
return {'content': content, 'from_cache': False} try:
byte_len = len(content.encode('utf-8'))
except Exception:
byte_len = None
return {
'content': content,
'from_cache': False,
'duration_ms': int(total_time * 1000),
'bytes': byte_len,
'url': url
}
except Exception as e: except Exception as e:
print(f" ERROR: {e}") print(f" ERROR: {e}")
@@ -302,6 +323,18 @@ class TroostwijkScraper:
print(f" Type: LOT") print(f" Type: LOT")
print(f" Title: {page_data.get('title', 'N/A')[:60]}...") print(f" Title: {page_data.get('title', 'N/A')[:60]}...")
# TTY progress reporter per lot
lot_progress = ProgressReporter(lot_id=page_data.get('lot_id', ''), title=page_data.get('title', ''))
lot_progress.start()
# Record HTML page fetch
lot_progress.add_event(
kind='html',
label=result.get('url', url),
size_bytes=result.get('bytes'),
cached='cache' if from_cache else 'realtime',
duration_ms=result.get('duration_ms')
)
# Extract ALL data from __NEXT_DATA__ lot object # Extract ALL data from __NEXT_DATA__ lot object
import json import json
import re import re
@@ -339,6 +372,13 @@ class TroostwijkScraper:
try: try:
intercepted_json = self.intercepted_api_data[lot_id] intercepted_json = self.intercepted_api_data[lot_id]
intercepted_data = json.loads(intercepted_json) intercepted_data = json.loads(intercepted_json)
lot_progress.add_event(
kind='intercepted',
label='GraphQL (intercepted)',
size_bytes=len(intercepted_json.encode('utf-8')),
cached='intercepted',
duration_ms=0
)
# Store the raw JSON for future offline use # Store the raw JSON for future offline use
page_data['api_data_json'] = intercepted_json page_data['api_data_json'] = intercepted_json
# Extract lot data from intercepted response # Extract lot data from intercepted response
@@ -374,6 +414,13 @@ class TroostwijkScraper:
if is_complete: if is_complete:
print(f" Using cached API data") print(f" Using cached API data")
lot_progress.add_event(
kind='db',
label='lots table (cached api fields)',
size_bytes=None,
cached='db',
duration_ms=0
)
page_data['followers_count'] = existing[0] page_data['followers_count'] = existing[0]
page_data['estimated_min_price'] = existing[1] page_data['estimated_min_price'] = existing[1]
page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids') page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
@@ -385,9 +432,31 @@ class TroostwijkScraper:
else: else:
print(f" Fetching lot data from API (concurrent)...") print(f" Fetching lot data from API (concurrent)...")
# Make concurrent API calls # Make concurrent API calls
api_tasks = [fetch_lot_bidding_data(lot_id)] api_tasks = []
# Wrap each API call to capture duration and size
async def _timed_fetch(name, coro_func, *args, **kwargs):
t0 = time.time()
data = await coro_func(*args, **kwargs)
dt = int((time.time() - t0) * 1000)
size_b = None
try:
if data is not None:
import json as _json
size_b = len(_json.dumps(data).encode('utf-8'))
except Exception:
size_b = None
lot_progress.add_event(
kind='graphql',
label=name,
size_bytes=size_b,
cached='realtime',
duration_ms=dt
)
return data
api_tasks.append(_timed_fetch('GraphQL lotDetails', fetch_lot_bidding_data, lot_id))
if auction_id: if auction_id:
api_tasks.append(fetch_auction_data(auction_id)) api_tasks.append(_timed_fetch('GraphQL auction', fetch_auction_data, auction_id))
results = await asyncio.gather(*api_tasks, return_exceptions=True) results = await asyncio.gather(*api_tasks, return_exceptions=True)
bidding_data = results[0] if results and not isinstance(results[0], Exception) else None bidding_data = results[0] if results and not isinstance(results[0], Exception) else None
bid_history_data = None # Will fetch after we have lot_uuid bid_history_data = None # Will fetch after we have lot_uuid
@@ -395,7 +464,7 @@ class TroostwijkScraper:
# Fresh page fetch - make concurrent API calls for all data # Fresh page fetch - make concurrent API calls for all data
if not self.offline: if not self.offline:
print(f" Fetching lot data from API (concurrent)...") print(f" Fetching lot data from API (concurrent)...")
api_tasks = [fetch_lot_bidding_data(lot_id)] api_tasks = []
task_map = {'bidding': 0} # Track which index corresponds to which task task_map = {'bidding': 0} # Track which index corresponds to which task
# Add auction data fetch if we need viewing/pickup times # Add auction data fetch if we need viewing/pickup times
@@ -411,16 +480,80 @@ class TroostwijkScraper:
if not has_times: if not has_times:
task_map['auction'] = len(api_tasks) task_map['auction'] = len(api_tasks)
api_tasks.append(fetch_auction_data(auction_id)) async def fetch_auction_wrapped():
t0 = time.time()
data = await fetch_auction_data(auction_id)
dt = int((time.time() - t0) * 1000)
size_b = None
try:
if data is not None:
import json as _json
size_b = len(_json.dumps(data).encode('utf-8'))
except Exception:
size_b = None
lot_progress.add_event(
kind='graphql',
label='GraphQL auction',
size_bytes=size_b,
cached='realtime',
duration_ms=dt
)
return data
api_tasks.append(fetch_auction_wrapped())
# Add bid history fetch if we have lot_uuid and expect bids # Add bid history fetch if we have lot_uuid and expect bids
if lot_uuid: if lot_uuid:
task_map['bid_history'] = len(api_tasks) task_map['bid_history'] = len(api_tasks)
api_tasks.append(fetch_bid_history(lot_uuid)) async def fetch_bid_history_wrapped():
t0 = time.time()
data = await fetch_bid_history(lot_uuid)
dt = int((time.time() - t0) * 1000)
size_b = None
try:
if data is not None:
import json as _json
size_b = len(_json.dumps(data).encode('utf-8'))
except Exception:
size_b = None
lot_progress.add_event(
kind='rest',
label='REST bid history',
size_bytes=size_b,
cached='realtime',
duration_ms=dt
)
return data
api_tasks.append(fetch_bid_history_wrapped())
# Execute all API calls concurrently # Execute all API calls concurrently
# Always include the bidding data as first task
async def fetch_bidding_wrapped():
t0 = time.time()
data = await fetch_lot_bidding_data(lot_id)
dt = int((time.time() - t0) * 1000)
size_b = None
try:
if data is not None:
import json as _json
size_b = len(_json.dumps(data).encode('utf-8'))
except Exception:
size_b = None
lot_progress.add_event(
kind='graphql',
label='GraphQL lotDetails',
size_bytes=size_b,
cached='realtime',
duration_ms=dt
)
return data
api_tasks.insert(0, fetch_bidding_wrapped())
# Adjust task_map indexes
for k in list(task_map.keys()):
task_map[k] += 1 if k != 'bidding' else 0
results = await asyncio.gather(*api_tasks, return_exceptions=True) results = await asyncio.gather(*api_tasks, return_exceptions=True)
bidding_data = results[task_map['bidding']] if results and not isinstance(results[task_map['bidding']], Exception) else None bidding_data = results[0] if results and not isinstance(results[0], Exception) else None
# Store raw API JSON for offline replay # Store raw API JSON for offline replay
if bidding_data: if bidding_data:
@@ -628,6 +761,12 @@ class TroostwijkScraper:
else: else:
print(f" All {len(images)} images already cached") print(f" All {len(images)} images already cached")
# Stop and print progress summary for the lot
try:
lot_progress.stop()
except Exception:
pass
return page_data return page_data
def _prioritize_lots(self, lot_urls: List[str]) -> List[Tuple[int, str, str]]: def _prioritize_lots(self, lot_urls: List[str]) -> List[Tuple[int, str, str]]: