Compare commits
2 Commits
e69563d4d6
...
570fd3870e
| Author | SHA1 | Date |
|---|---|---|
|  | 570fd3870e |  |
|  | 5a755a2125 |  |

12 README.md
@@ -27,23 +27,13 @@ If your IDE shows "Python 2.7 syntax" warnings, configure it for Python 3.10+:
- Ensure "Python version" is set to 3.10+
- Check "Code compatibility inspection" → Set minimum version to 3.10

### VS Code

Add to `.vscode/settings.json`:

```json
{
    "python.pythonPath": "path/to/python3.10",
    "python.analysis.typeCheckingMode": "basic",
    "python.languageServer": "Pylance"
}
```

## Installation

```bash
# Check Python version
python --version  # Should be 3.10+

~\venvs\scaev\Scripts\Activate.ps1

# Install dependencies
pip install -r requirements.txt

139 db/migration/V1__initial_schema.sql (new file)
@@ -0,0 +1,139 @@
-- Auctions
CREATE TABLE auctions (
    auction_id TEXT PRIMARY KEY,
    url TEXT UNIQUE,
    title TEXT,
    location TEXT,
    lots_count INTEGER,
    first_lot_closing_time TEXT,
    scraped_at TEXT,
    city TEXT,
    country TEXT,
    type TEXT,
    lot_count INTEGER DEFAULT 0,
    closing_time TEXT,
    discovered_at BIGINT
);

CREATE INDEX idx_auctions_country ON auctions(country);

-- Cache
CREATE TABLE cache (
    url TEXT PRIMARY KEY,
    content BYTEA,
    timestamp DOUBLE PRECISION,
    status_code INTEGER
);

CREATE INDEX idx_timestamp ON cache(timestamp);

-- Lots
CREATE TABLE lots (
    lot_id TEXT PRIMARY KEY,
    auction_id TEXT REFERENCES auctions(auction_id),
    url TEXT UNIQUE,
    title TEXT,
    current_bid TEXT,
    bid_count INTEGER,
    closing_time TEXT,
    viewing_time TEXT,
    pickup_date TEXT,
    location TEXT,
    description TEXT,
    category TEXT,
    scraped_at TEXT,
    sale_id INTEGER,
    manufacturer TEXT,
    type TEXT,
    year INTEGER,
    currency TEXT DEFAULT 'EUR',
    closing_notified INTEGER DEFAULT 0,
    starting_bid TEXT,
    minimum_bid TEXT,
    status TEXT,
    brand TEXT,
    model TEXT,
    attributes_json TEXT,
    first_bid_time TEXT,
    last_bid_time TEXT,
    bid_velocity DOUBLE PRECISION,
    bid_increment DOUBLE PRECISION,
    year_manufactured INTEGER,
    condition_score DOUBLE PRECISION,
    condition_description TEXT,
    serial_number TEXT,
    damage_description TEXT,
    followers_count INTEGER DEFAULT 0,
    estimated_min_price DOUBLE PRECISION,
    estimated_max_price DOUBLE PRECISION,
    lot_condition TEXT,
    appearance TEXT,
    estimated_min DOUBLE PRECISION,
    estimated_max DOUBLE PRECISION,
    next_bid_step_cents INTEGER,
    condition TEXT,
    category_path TEXT,
    city_location TEXT,
    country_code TEXT,
    bidding_status TEXT,
    packaging TEXT,
    quantity INTEGER,
    vat DOUBLE PRECISION,
    buyer_premium_percentage DOUBLE PRECISION,
    remarks TEXT,
    reserve_price DOUBLE PRECISION,
    reserve_met INTEGER,
    view_count INTEGER,
    api_data_json TEXT,
    next_scrape_at BIGINT,
    scrape_priority INTEGER DEFAULT 0
);

CREATE INDEX idx_lots_closing_time ON lots(closing_time);
CREATE INDEX idx_lots_next_scrape ON lots(next_scrape_at);
CREATE INDEX idx_lots_priority ON lots(scrape_priority DESC);
CREATE INDEX idx_lots_sale_id ON lots(sale_id);

-- Bid history
CREATE TABLE bid_history (
    id SERIAL PRIMARY KEY,
    lot_id TEXT REFERENCES lots(lot_id),
    bid_amount DOUBLE PRECISION NOT NULL,
    bid_time TEXT NOT NULL,
    is_autobid INTEGER DEFAULT 0,
    bidder_id TEXT,
    bidder_number INTEGER,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX idx_bid_history_bidder ON bid_history(bidder_id);
CREATE INDEX idx_bid_history_lot_time ON bid_history(lot_id, bid_time);

-- Images
CREATE TABLE images (
    id SERIAL PRIMARY KEY,
    lot_id TEXT REFERENCES lots(lot_id),
    url TEXT,
    local_path TEXT,
    downloaded INTEGER DEFAULT 0,
    labels TEXT,
    processed_at BIGINT
);

CREATE INDEX idx_images_lot_id ON images(lot_id);
CREATE UNIQUE INDEX idx_unique_lot_url ON images(lot_id, url);

-- Resource cache
CREATE TABLE resource_cache (
    url TEXT PRIMARY KEY,
    content BYTEA,
    content_type TEXT,
    status_code INTEGER,
    headers TEXT,
    timestamp DOUBLE PRECISION,
    size_bytes INTEGER,
    local_path TEXT
);

CREATE INDEX idx_resource_timestamp ON resource_cache(timestamp);
CREATE INDEX idx_resource_content_type ON resource_cache(content_type);
@@ -8,7 +8,7 @@ The scraper follows a **3-phase hierarchical crawling pattern** to extract aucti

```mariadb
┌─────────────────────────────────────────────────────────────────┐
│ TROOSTWIJK SCRAPER │
│ SCAEV SCRAPER │
└─────────────────────────────────────────────────────────────────┘

┌─────────────────────────────────────────────────────────────────┐
@@ -346,6 +346,48 @@ Lot Page Parsed
    └── 001.jpg
```

## Terminal Progress per Lot (TTY)

During lot analysis, Scaev now shows a per‑lot TTY progress animation with a final summary of all inputs used:

- Spinner runs while enrichment is in progress.
- Summary lists every page/API used to analyze the lot with:
  - URL/label
  - Size in bytes
  - Source state: cache | realtime | offline | db | intercepted
  - Duration in ms

Example output snippet:

```
[LOT A1-28505-5] ✓ Done in 812 ms — pages/APIs used:
   • [html] https://www.troostwijkauctions.com/l/... | 142331 B | cache | 4 ms
   • [graphql] GraphQL lotDetails | 5321 B | realtime | 142 ms
   • [rest] REST bid history | 18234 B | realtime | 236 ms
```

Notes:
- In non‑TTY environments the spinner is replaced by simple log lines.
- Intercepted GraphQL responses (captured during page load) are labeled as `intercepted` with near‑zero duration.
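
For orientation, the output above is produced by the `ProgressReporter` class added in `src/progress.py` (shown later in this diff). A minimal usage sketch, with made-up lot id, URL and numbers, could look like this:

```python
# Illustrative only: the lot id, URL and byte/millisecond values are invented.
from progress import ProgressReporter

reporter = ProgressReporter(lot_id="A1-28505-5", title="Example lot")
reporter.start()  # spinner on a TTY, a plain log line otherwise

# Record one event per page/API that contributed to the lot analysis
reporter.add_event(
    kind="html",
    label="https://www.troostwijkauctions.com/l/...",
    size_bytes=142331,
    cached="cache",
    duration_ms=4,
)
reporter.add_event(kind="graphql", label="GraphQL lotDetails",
                   size_bytes=5321, cached="realtime", duration_ms=142)

reporter.stop()  # clears the spinner and prints the summary shown above
```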

## Data Flow “Tunnel” (Simplified)

For each lot, the data “tunnels through” the following stages:

1. HTML page → parse `__NEXT_DATA__` for core lot fields and lot UUID.
2. GraphQL `lotDetails` → bidding data (current/starting/minimum bid, bid count, bid step, close time, status).
3. Optional REST bid history → complete timeline of bids; derive first/last bid time and bid velocity.
4. Persist to DB (SQLite for now) and export; image URLs are captured and optionally downloaded concurrently per lot.

Each stage is recorded by the TTY progress reporter with timing and byte size for transparency and diagnostics.
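
As a rough, self-contained illustration of this tunnel (not the actual implementation in `src/scraper.py`), the stages could be sketched as below; `parse_next_data` and `analyze_lot` are hypothetical helpers, and the two fetch callables stand in for the GraphQL and REST clients:

```python
# Hedged sketch of the per-lot pipeline; names are illustrative, not the real code.
import json
import re
from typing import Any, Awaitable, Callable, Dict, Optional


def parse_next_data(html: str) -> Optional[Dict[str, Any]]:
    """Stage 1: extract the Next.js __NEXT_DATA__ JSON blob from the lot page."""
    m = re.search(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        html,
        re.DOTALL,
    )
    return json.loads(m.group(1)) if m else None


async def analyze_lot(
    html: str,
    fetch_bidding: Callable[[], Awaitable[Optional[dict]]],
    fetch_history: Callable[[], Awaitable[Optional[dict]]],
) -> Dict[str, Any]:
    lot: Dict[str, Any] = {"next_data": parse_next_data(html)}  # stage 1: HTML page
    lot["bidding"] = await fetch_bidding()       # stage 2: GraphQL lotDetails
    lot["bid_history"] = await fetch_history()   # stage 3: optional REST bid history
    # Stage 4 (persist to DB and export) would happen here.
    return lot
```
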
## Migrations and ORM Roadmap

- Migrations follow a Flyway‑style convention in `db/migration` (e.g., `V1__initial_schema.sql`); a sketch of applying such files follows this list.
- Current baseline is V1; there are no new migrations required at this time.
- Raw SQL usage remains in place (SQLite) while we prepare a gradual move to SQLAlchemy 2.x targeting PostgreSQL.
- See `docs/MIGRATIONS.md` for details on naming, workflow, and the future switch to PostgreSQL.
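
For illustration of this naming convention only (no runner ships in this commit, and the V1 schema targets PostgreSQL types, so a real setup would rather use Flyway, Alembic or the SQLAlchemy engine), applying `V<N>__<name>.sql` files in version order could look roughly like this:

```python
# Hypothetical sketch: apply db/migration/V<N>__<name>.sql files in version order.
# Bookkeeping of already-applied versions is omitted; Flyway/Alembic handle that.
import re
import sqlite3
from pathlib import Path


def apply_migrations(db_path: str, migration_dir: str = "db/migration") -> None:
    def version(path: Path) -> int:
        m = re.match(r"V(\d+)__", path.name)
        return int(m.group(1)) if m else 0

    with sqlite3.connect(db_path) as conn:
        for sql_file in sorted(Path(migration_dir).glob("V*__*.sql"), key=version):
            conn.executescript(sql_file.read_text(encoding="utf-8"))
```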

## Extension Points for Integration

### 1. **Downstream Processing Pipeline**

@@ -1,4 +1,4 @@
# Deployment
# Deployment (Scaev)

## Prerequisites

@@ -12,8 +12,8 @@

```bash
# Clone repository
git clone git@git.appmodel.nl:Tour/troost-scraper.git
cd troost-scraper
git clone git@git.appmodel.nl:Tour/scaev.git
cd scaev

# Create virtual environment
python -m venv .venv
@@ -41,8 +41,8 @@ MAX_PAGES = 50
### 3. Create Output Directories

```bash
sudo mkdir -p /var/troost-scraper/output
sudo chown $USER:$USER /var/troost-scraper
sudo mkdir -p /var/scaev/output
sudo chown $USER:$USER /var/scaev
```

### 4. Run as Cron Job
@@ -51,7 +51,7 @@ Add to crontab (`crontab -e`):

```bash
# Run scraper daily at 2 AM
0 2 * * * cd /path/to/troost-scraper && /path/to/.venv/bin/python main.py >> /var/log/troost-scraper.log 2>&1
0 2 * * * cd /path/to/scaev && /path/to/.venv/bin/python main.py >> /var/log/scaev.log 2>&1
```

## Docker Deployment (Optional)
@@ -82,8 +82,8 @@ CMD ["python", "main.py"]
Build and run:

```bash
docker build -t troost-scraper .
docker run -v /path/to/output:/output troost-scraper
docker build -t scaev .
docker run -v /path/to/output:/output scaev
```

## Monitoring
@@ -91,13 +91,13 @@ docker run -v /path/to/output:/output troost-scraper
### Check Logs

```bash
tail -f /var/log/troost-scraper.log
tail -f /var/log/scaev.log
```

### Monitor Output

```bash
ls -lh /var/troost-scraper/output/
ls -lh /var/scaev/output/
```

## Troubleshooting
@@ -113,7 +113,7 @@ playwright install --force chromium

```bash
# Fix permissions
sudo chown -R $USER:$USER /var/troost-scraper
sudo chown -R $USER:$USER /var/scaev
```

### Memory Issues

requirements.txt
@@ -5,6 +5,12 @@
playwright>=1.40.0
aiohttp>=3.9.0  # Optional: only needed if DOWNLOAD_IMAGES=True

# ORM groundwork (gradual adoption)
SQLAlchemy>=2.0  # Modern ORM (2.x) — groundwork for PostgreSQL
# For PostgreSQL in the near future, install one of:
#   psycopg[binary]>=3.1  # Recommended
#   or psycopg2-binary>=2.9

# Development/Testing
pytest>=7.4.0  # Optional: for testing
pytest-asyncio>=0.21.0  # Optional: for async tests

60 src/db.py (new file)
@@ -0,0 +1,60 @@
#!/usr/bin/env python3
"""
Database scaffolding for future SQLAlchemy 2.x usage.

Notes:
- We keep using the current SQLite + raw SQL for operational code.
- This module prepares an engine/session bound to DATABASE_URL, defaulting to
  SQLite file in config.CACHE_DB path (for local dev only).
- PostgreSQL can be enabled by setting DATABASE_URL, e.g.:
    DATABASE_URL=postgresql+psycopg://user:pass@localhost:5432/scaev

No runtime dependency from the scraper currently imports or uses this module.
It is present to bootstrap the gradual migration to SQLAlchemy 2.x.
"""

from __future__ import annotations

import os
from typing import Optional


def get_database_url(sqlite_fallback_path: str) -> str:
    url = os.getenv("DATABASE_URL")
    if url and url.strip():
        return url.strip()
    # SQLite fallback
    # Use a separate sqlite file when DATABASE_URL is not set; this does not
    # alter the existing cache.db usage by raw SQL — it's just a dev convenience.
    return f"sqlite:///{sqlite_fallback_path}"


def create_engine_and_session(database_url: str):
    try:
        from sqlalchemy import create_engine
        from sqlalchemy.orm import sessionmaker
    except Exception as e:
        raise RuntimeError(
            "SQLAlchemy is not installed. Add it to requirements.txt to use this module."
        ) from e

    # Engine tuned for simple use; callers may override
    engine = create_engine(database_url, pool_pre_ping=True, future=True)
    SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False, future=True)
    return engine, SessionLocal


def get_sa(session_cached: dict, sqlite_fallback_path: str):
    """Helper to lazily create and cache SQLAlchemy engine/session factory.

    session_cached: dict — a mutable dict, e.g., module-level {}, to store engine and factory
    sqlite_fallback_path: path to a sqlite file for local development
    """
    if 'engine' in session_cached and 'SessionLocal' in session_cached:
        return session_cached['engine'], session_cached['SessionLocal']

    url = get_database_url(sqlite_fallback_path)
    engine, SessionLocal = create_engine_and_session(url)
    session_cached['engine'] = engine
    session_cached['SessionLocal'] = SessionLocal
    return engine, SessionLocal
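
A possible usage sketch of this scaffolding follows; nothing in the scraper calls it yet, and the cache dict name and fallback path below are illustrative, not part of the module:

```python
# Illustrative only: _SA_CACHE and "output/dev.db" are assumptions, not repo code.
from sqlalchemy import text

from db import get_sa

_SA_CACHE: dict = {}  # module-level cache for the engine/session factory


def count_lots() -> int:
    # Uses DATABASE_URL if set (e.g. postgresql+psycopg://...), else a local SQLite file
    engine, SessionLocal = get_sa(_SA_CACHE, "output/dev.db")
    with SessionLocal() as session:
        # Assumes the V1 schema from db/migration has been applied
        return session.execute(text("SELECT COUNT(*) FROM lots")).scalar_one()
```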

105 src/progress.py (new file)
@@ -0,0 +1,105 @@
#!/usr/bin/env python3
"""
Lightweight TTY progress reporter for per-lot scraping.

It shows a spinner while work is in progress and records all page/API
fetches that contributed to the lot analysis, including:
- URL or source label
- size in bytes (when available)
- cache status (cached/real-time/offline/db/intercepted)
- duration in milliseconds

Intentionally dependency-free and safe to use in async code.
"""

from __future__ import annotations

import sys
import time
import threading
from dataclasses import dataclass, field
from typing import List, Optional


SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]


@dataclass
class ProgressEvent:
    kind: str        # html | graphql | rest | image | cache | db | intercepted | other
    label: str       # url or description
    size_bytes: Optional[int]
    cached: str      # "cache", "realtime", "offline", "db", "intercepted"
    duration_ms: Optional[int]


@dataclass
class ProgressReporter:
    lot_id: str
    title: str = ""
    _events: List[ProgressEvent] = field(default_factory=list)
    _start_ts: float = field(default_factory=time.time)
    _stop_ts: Optional[float] = None
    _spinner_thread: Optional[threading.Thread] = None
    _stop_flag: bool = False
    _is_tty: bool = field(default_factory=lambda: sys.stdout.isatty())

    def start(self) -> None:
        if not self._is_tty:
            print(f"[LOT {self.lot_id}] ⏳ Analyzing… {self.title[:60]}")
            return

        def run_spinner():
            idx = 0
            while not self._stop_flag:
                frame = SPINNER_FRAMES[idx % len(SPINNER_FRAMES)]
                idx += 1
                summary = f"{len(self._events)} events"
                line = f"[LOT {self.lot_id}] {frame} {self.title[:60]} · {summary}"
                # CR without newline to animate
                sys.stdout.write("\r" + line)
                sys.stdout.flush()
                time.sleep(0.09)
            # Clear the spinner line
            sys.stdout.write("\r" + " " * 120 + "\r")
            sys.stdout.flush()

        self._spinner_thread = threading.Thread(target=run_spinner, daemon=True)
        self._spinner_thread.start()

    def add_event(
        self,
        *,
        kind: str,
        label: str,
        size_bytes: Optional[int] = None,
        cached: str = "realtime",
        duration_ms: Optional[float] = None,
    ) -> None:
        self._events.append(
            ProgressEvent(
                kind=kind,
                label=label,
                size_bytes=int(size_bytes) if size_bytes is not None else None,
                cached=cached,
                duration_ms=int(duration_ms) if duration_ms is not None else None,
            )
        )

    def stop(self) -> None:
        self._stop_ts = time.time()
        self._stop_flag = True
        if self._spinner_thread and self._spinner_thread.is_alive():
            self._spinner_thread.join(timeout=1.0)

        total_ms = int((self._stop_ts - self._start_ts) * 1000)
        print(f"[LOT {self.lot_id}] ✓ Done in {total_ms} ms — pages/APIs used:")
        if not self._events:
            print(" • (none)")
            return

        # Print events as a compact list
        for ev in self._events:
            size = f"{ev.size_bytes} B" if ev.size_bytes is not None else "?"
            dur = f"{ev.duration_ms} ms" if ev.duration_ms is not None else "?"
            print(f" • [{ev.kind}] {ev.label} | {size} | {ev.cached} | {dur}")

157 src/scraper.py
@@ -29,6 +29,7 @@ from graphql_client import (
)
from bid_history_client import fetch_bid_history, parse_bid_history
from priority import calculate_priority, parse_closing_time
from progress import ProgressReporter

class TroostwijkScraper:
    """Main scraper class for Troostwijk Auctions"""
@@ -96,7 +97,7 @@ class TroostwijkScraper:
        (useful for auction listing pages where we just need HTML structure)

        Returns:
            Dict with 'content' and 'from_cache' keys
            Dict with: 'content', 'from_cache', 'duration_ms', 'bytes', 'url'
        """
        if use_cache:
            cache_start = time.time()
@@ -104,7 +105,17 @@ class TroostwijkScraper:
            if cached:
                cache_time = (time.time() - cache_start) * 1000
                print(f" CACHE HIT: {url} ({cache_time:.0f}ms)")
                return {'content': cached['content'], 'from_cache': True}
                try:
                    byte_len = len(cached['content'].encode('utf-8'))
                except Exception:
                    byte_len = None
                return {
                    'content': cached['content'],
                    'from_cache': True,
                    'duration_ms': int(cache_time),
                    'bytes': byte_len,
                    'url': url
                }

        # In OFFLINE mode we never fetch from network
        if self.offline:
@@ -130,7 +141,17 @@ class TroostwijkScraper:
            total_time = time.time() - fetch_start
            self.cache.set(url, content, 200)
            print(f" [Timing: goto={goto_time:.2f}s, total={total_time:.2f}s, mode={wait_strategy}]")
            return {'content': content, 'from_cache': False}
            try:
                byte_len = len(content.encode('utf-8'))
            except Exception:
                byte_len = None
            return {
                'content': content,
                'from_cache': False,
                'duration_ms': int(total_time * 1000),
                'bytes': byte_len,
                'url': url
            }

        except Exception as e:
            print(f" ERROR: {e}")
@@ -302,6 +323,18 @@ class TroostwijkScraper:
        print(f" Type: LOT")
        print(f" Title: {page_data.get('title', 'N/A')[:60]}...")

        # TTY progress reporter per lot
        lot_progress = ProgressReporter(lot_id=page_data.get('lot_id', ''), title=page_data.get('title', ''))
        lot_progress.start()
        # Record HTML page fetch
        lot_progress.add_event(
            kind='html',
            label=result.get('url', url),
            size_bytes=result.get('bytes'),
            cached='cache' if from_cache else 'realtime',
            duration_ms=result.get('duration_ms')
        )

        # Extract ALL data from __NEXT_DATA__ lot object
        import json
        import re
@@ -339,6 +372,13 @@ class TroostwijkScraper:
            try:
                intercepted_json = self.intercepted_api_data[lot_id]
                intercepted_data = json.loads(intercepted_json)
                lot_progress.add_event(
                    kind='intercepted',
                    label='GraphQL (intercepted)',
                    size_bytes=len(intercepted_json.encode('utf-8')),
                    cached='intercepted',
                    duration_ms=0
                )
                # Store the raw JSON for future offline use
                page_data['api_data_json'] = intercepted_json
                # Extract lot data from intercepted response
@@ -374,6 +414,13 @@ class TroostwijkScraper:

            if is_complete:
                print(f" Using cached API data")
                lot_progress.add_event(
                    kind='db',
                    label='lots table (cached api fields)',
                    size_bytes=None,
                    cached='db',
                    duration_ms=0
                )
                page_data['followers_count'] = existing[0]
                page_data['estimated_min_price'] = existing[1]
                page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
@@ -385,9 +432,31 @@ class TroostwijkScraper:
            else:
                print(f" Fetching lot data from API (concurrent)...")
                # Make concurrent API calls
                api_tasks = [fetch_lot_bidding_data(lot_id)]
                api_tasks = []
                # Wrap each API call to capture duration and size
                async def _timed_fetch(name, coro_func, *args, **kwargs):
                    t0 = time.time()
                    data = await coro_func(*args, **kwargs)
                    dt = int((time.time() - t0) * 1000)
                    size_b = None
                    try:
                        if data is not None:
                            import json as _json
                            size_b = len(_json.dumps(data).encode('utf-8'))
                    except Exception:
                        size_b = None
                    lot_progress.add_event(
                        kind='graphql',
                        label=name,
                        size_bytes=size_b,
                        cached='realtime',
                        duration_ms=dt
                    )
                    return data

                api_tasks.append(_timed_fetch('GraphQL lotDetails', fetch_lot_bidding_data, lot_id))
                if auction_id:
                    api_tasks.append(fetch_auction_data(auction_id))
                    api_tasks.append(_timed_fetch('GraphQL auction', fetch_auction_data, auction_id))
                results = await asyncio.gather(*api_tasks, return_exceptions=True)
                bidding_data = results[0] if results and not isinstance(results[0], Exception) else None
                bid_history_data = None  # Will fetch after we have lot_uuid
@@ -395,7 +464,7 @@ class TroostwijkScraper:
        # Fresh page fetch - make concurrent API calls for all data
        if not self.offline:
            print(f" Fetching lot data from API (concurrent)...")
            api_tasks = [fetch_lot_bidding_data(lot_id)]
            api_tasks = []
            task_map = {'bidding': 0}  # Track which index corresponds to which task

            # Add auction data fetch if we need viewing/pickup times
@@ -411,16 +480,80 @@ class TroostwijkScraper:

            if not has_times:
                task_map['auction'] = len(api_tasks)
                api_tasks.append(fetch_auction_data(auction_id))
                async def fetch_auction_wrapped():
                    t0 = time.time()
                    data = await fetch_auction_data(auction_id)
                    dt = int((time.time() - t0) * 1000)
                    size_b = None
                    try:
                        if data is not None:
                            import json as _json
                            size_b = len(_json.dumps(data).encode('utf-8'))
                    except Exception:
                        size_b = None
                    lot_progress.add_event(
                        kind='graphql',
                        label='GraphQL auction',
                        size_bytes=size_b,
                        cached='realtime',
                        duration_ms=dt
                    )
                    return data
                api_tasks.append(fetch_auction_wrapped())

            # Add bid history fetch if we have lot_uuid and expect bids
            if lot_uuid:
                task_map['bid_history'] = len(api_tasks)
                api_tasks.append(fetch_bid_history(lot_uuid))
                async def fetch_bid_history_wrapped():
                    t0 = time.time()
                    data = await fetch_bid_history(lot_uuid)
                    dt = int((time.time() - t0) * 1000)
                    size_b = None
                    try:
                        if data is not None:
                            import json as _json
                            size_b = len(_json.dumps(data).encode('utf-8'))
                    except Exception:
                        size_b = None
                    lot_progress.add_event(
                        kind='rest',
                        label='REST bid history',
                        size_bytes=size_b,
                        cached='realtime',
                        duration_ms=dt
                    )
                    return data
                api_tasks.append(fetch_bid_history_wrapped())

            # Execute all API calls concurrently
            # Always include the bidding data as first task
            async def fetch_bidding_wrapped():
                t0 = time.time()
                data = await fetch_lot_bidding_data(lot_id)
                dt = int((time.time() - t0) * 1000)
                size_b = None
                try:
                    if data is not None:
                        import json as _json
                        size_b = len(_json.dumps(data).encode('utf-8'))
                except Exception:
                    size_b = None
                lot_progress.add_event(
                    kind='graphql',
                    label='GraphQL lotDetails',
                    size_bytes=size_b,
                    cached='realtime',
                    duration_ms=dt
                )
                return data

            api_tasks.insert(0, fetch_bidding_wrapped())
            # Adjust task_map indexes
            for k in list(task_map.keys()):
                task_map[k] += 1 if k != 'bidding' else 0

            results = await asyncio.gather(*api_tasks, return_exceptions=True)
            bidding_data = results[task_map['bidding']] if results and not isinstance(results[task_map['bidding']], Exception) else None
            bidding_data = results[0] if results and not isinstance(results[0], Exception) else None

            # Store raw API JSON for offline replay
            if bidding_data:
@@ -628,6 +761,12 @@ class TroostwijkScraper:
        else:
            print(f" All {len(images)} images already cached")

        # Stop and print progress summary for the lot
        try:
            lot_progress.stop()
        except Exception:
            pass

        return page_data

    def _prioritize_lots(self, lot_urls: List[str]) -> List[Tuple[int, str, str]]: