enrich data

This commit is contained in: Tour
2025-12-09 01:19:55 +01:00
parent 999c5609b6
commit 83d0fc1329
7 changed files with 212 additions and 13 deletions

View File

@@ -2,9 +2,9 @@
"""
Client for fetching bid history from Troostwijk REST API
"""
import aiohttp
from typing import Dict, List, Optional
from datetime import datetime
import config
BID_HISTORY_ENDPOINT = "https://shared-api.tbauctions.com/bidmanagement/lots/{lot_uuid}/bidding-history"
@@ -20,6 +20,13 @@ async def fetch_bid_history(lot_uuid: str, page_size: int = 100) -> Optional[Lis
Returns:
List of bid dictionaries or None if request fails
"""
if config.OFFLINE:
# Offline mode: do not perform any network requests
print(" OFFLINE: skipping bid history fetch")
return None
import aiohttp
all_bids = []
page_number = 1
has_more = True
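In OFFLINE mode the function now short-circuits to None, the same value it returns on a failed request, so callers need only one code path. A minimal caller sketch under that assumption (the enrich_lot helper and the bid_history module name are illustrative, not part of this commit):

import config
# Module name assumed from this diff; adjust to wherever fetch_bid_history lives.
from bid_history import fetch_bid_history

async def enrich_lot(lot_uuid: str) -> list:
    # None covers both "request failed" and "OFFLINE mode": normalize to an empty list.
    bids = await fetch_bid_history(lot_uuid)
    if bids is None and config.OFFLINE:
        print("  OFFLINE: no bid history available for", lot_uuid)
    return bids or []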

View File

@@ -112,6 +112,7 @@ class CacheManager:
reserve_price REAL,
reserve_met INTEGER,
view_count INTEGER,
api_data_json TEXT,
FOREIGN KEY (auction_id) REFERENCES auctions(auction_id)
)
""")
@@ -270,8 +271,8 @@ class CacheManager:
year_manufactured, condition_score, condition_description,
serial_number, manufacturer, damage_description,
followers_count, estimated_min_price, estimated_max_price, lot_condition, appearance,
scraped_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
scraped_at, api_data_json)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
lot_data['lot_id'],
lot_data.get('auction_id', ''),
@@ -306,7 +307,8 @@ class CacheManager:
lot_data.get('estimated_max_price'),
lot_data.get('lot_condition', ''),
lot_data.get('appearance', ''),
lot_data['scraped_at']
lot_data['scraped_at'],
lot_data.get('api_data_json')
))
conn.commit()
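The new api_data_json column keeps the raw GraphQL payload alongside the parsed fields, which is what makes later offline replay possible. A minimal sketch of reading it back (load_cached_api_data is a hypothetical helper, not part of CacheManager):

import json
import sqlite3
import config

def load_cached_api_data(lot_id: str):
    # Return the stored GraphQL payload for a lot, or None if nothing was saved.
    conn = sqlite3.connect(config.CACHE_DB)
    try:
        row = conn.execute(
            "SELECT api_data_json FROM lots WHERE lot_id = ?", (lot_id,)
        ).fetchone()
    finally:
        conn.close()
    return json.loads(row[0]) if row and row[0] else None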

View File

@@ -4,6 +4,7 @@ Configuration module for Scaev Auctions Scraper
"""
import sys
import os
from pathlib import Path
# Require Python 3.10+
@@ -19,7 +20,12 @@ OUTPUT_DIR = "/mnt/okcomputer/output"
IMAGES_DIR = "/mnt/okcomputer/output/images"
RATE_LIMIT_SECONDS = 0.5 # EXACTLY 0.5 seconds between requests
MAX_PAGES = 50 # Number of listing pages to crawl
DOWNLOAD_IMAGES = True # Set to True to download images
# OFFLINE mode: when enabled, no network calls are performed; only DB/cache are used
OFFLINE = os.getenv("SCAEV_OFFLINE", "0").strip().lower() in {"1", "true", "yes", "on"}
# Image downloading can be disabled explicitly; in OFFLINE mode it is always disabled
DOWNLOAD_IMAGES = False if OFFLINE else True
# Setup directories
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
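Because the flag is evaluated at import time, SCAEV_OFFLINE has to be set before config is first imported. A minimal sketch:

import os

# Hypothetical illustration: enable offline mode before importing config.
os.environ["SCAEV_OFFLINE"] = "1"

import config

assert config.OFFLINE is True
assert config.DOWNLOAD_IMAGES is False  # image downloads are forced off in OFFLINE mode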

View File

@@ -2,8 +2,8 @@
"""
GraphQL client for fetching lot bidding data from Troostwijk API
"""
import aiohttp
from typing import Dict, Optional
import config
GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"
@@ -86,6 +86,13 @@ async def fetch_auction_data(auction_id: str) -> Optional[Dict]:
Returns:
Dict with auction data or None if request fails
"""
if config.OFFLINE:
# Offline mode: do not perform any network requests
print(" OFFLINE: skipping GraphQL auction fetch")
return None
import aiohttp
variables = {
"auctionId": auction_id,
"locale": "nl",
@@ -122,6 +129,13 @@ async def fetch_lot_bidding_data(lot_display_id: str) -> Optional[Dict]:
Returns:
Dict with bidding data or None if request fails
"""
if config.OFFLINE:
# Offline mode: do not perform any network requests
print(" OFFLINE: skipping GraphQL lot bidding fetch")
return None
import aiohttp
variables = {
"lotDisplayId": lot_display_id,
"locale": "nl",

View File

@@ -31,6 +31,8 @@ def main():
print("Scaev Auctions Scraper")
print("=" * 60)
if config.OFFLINE:
print("OFFLINE MODE ENABLED — only database and cache will be used (no network)")
print(f"Rate limit: {config.RATE_LIMIT_SECONDS} seconds BETWEEN EVERY REQUEST")
print(f"Cache database: {config.CACHE_DB}")
print(f"Output directory: {config.OUTPUT_DIR}")

View File

@@ -103,6 +103,8 @@ class AuctionMonitor:
print("="*60)
print("AUCTION MONITOR STARTED")
print("="*60)
if config.OFFLINE:
print("OFFLINE MODE ENABLED — only database and cache will be used (no network)")
print(f"Poll interval: {self.poll_interval / 60:.0f} minutes")
print(f"Cache database: {config.CACHE_DB}")
print(f"Rate limit: {config.RATE_LIMIT_SECONDS}s between requests")

View File

@@ -16,7 +16,7 @@ from urllib.parse import urljoin
from playwright.async_api import async_playwright, Page
from config import (
BASE_URL, RATE_LIMIT_SECONDS, MAX_PAGES, DOWNLOAD_IMAGES, IMAGES_DIR
BASE_URL, RATE_LIMIT_SECONDS, MAX_PAGES, DOWNLOAD_IMAGES, IMAGES_DIR, OFFLINE
)
from cache import CacheManager
from parse import DataParser
@@ -38,6 +38,8 @@ class TroostwijkScraper:
self.visited_lots: Set[str] = set()
self.last_request_time = 0
self.download_images = DOWNLOAD_IMAGES
self.intercepted_api_data: Dict[str, str] = {} # Store intercepted GraphQL responses by lot_id
self.offline = OFFLINE
async def _download_image(self, session: 'aiohttp.ClientSession', url: str, lot_id: str, index: int) -> Optional[str]:
"""Download an image and save it locally (without rate limiting - concurrent within lot)"""
@@ -102,6 +104,11 @@ class TroostwijkScraper:
print(f" CACHE HIT: {url} ({cache_time:.0f}ms)")
return {'content': cached['content'], 'from_cache': True}
# In OFFLINE mode we never fetch from the network
if self.offline:
print(f" OFFLINE: cache miss for {url} — skipping fetch")
return None
await self._rate_limit()
try:
@@ -205,6 +212,73 @@ class TroostwijkScraper:
result = await self._get_page(page, url)
if not result:
# OFFLINE fallback: try to construct page data directly from DB
if self.offline:
import sqlite3
conn = sqlite3.connect(self.cache.db_path)
cur = conn.cursor()
# Try lot first
cur.execute("SELECT * FROM lots WHERE url = ?", (url,))
lot_row = cur.fetchone()
if lot_row:
# Build a dict using column names
col_names = [d[0] for d in cur.description]
lot_dict = dict(zip(col_names, lot_row))
conn.close()
page_data = {
'type': 'lot',
'lot_id': lot_dict.get('lot_id'),
'auction_id': lot_dict.get('auction_id'),
'url': lot_dict.get('url') or url,
'title': lot_dict.get('title') or '',
'current_bid': lot_dict.get('current_bid') or '',
'bid_count': lot_dict.get('bid_count') or 0,
'closing_time': lot_dict.get('closing_time') or '',
'viewing_time': lot_dict.get('viewing_time') or '',
'pickup_date': lot_dict.get('pickup_date') or '',
'location': lot_dict.get('location') or '',
'description': lot_dict.get('description') or '',
'category': lot_dict.get('category') or '',
'status': lot_dict.get('status') or '',
'brand': lot_dict.get('brand') or '',
'model': lot_dict.get('model') or '',
'attributes_json': lot_dict.get('attributes_json') or '',
'first_bid_time': lot_dict.get('first_bid_time'),
'last_bid_time': lot_dict.get('last_bid_time'),
'bid_velocity': lot_dict.get('bid_velocity'),
'followers_count': lot_dict.get('followers_count') or 0,
'estimated_min_price': lot_dict.get('estimated_min_price'),
'estimated_max_price': lot_dict.get('estimated_max_price'),
'lot_condition': lot_dict.get('lot_condition') or '',
'appearance': lot_dict.get('appearance') or '',
'scraped_at': lot_dict.get('scraped_at') or '',
}
print(" OFFLINE: using DB record for lot")
self.visited_lots.add(url)
return page_data
# Try auction by URL
cur.execute("SELECT * FROM auctions WHERE url = ?", (url,))
auc_row = cur.fetchone()
if auc_row:
col_names = [d[0] for d in cur.description]
auc_dict = dict(zip(col_names, auc_row))
conn.close()
page_data = {
'type': 'auction',
'auction_id': auc_dict.get('auction_id'),
'url': auc_dict.get('url') or url,
'title': auc_dict.get('title') or '',
'location': auc_dict.get('location') or '',
'lots_count': auc_dict.get('lots_count') or 0,
'first_lot_closing_time': auc_dict.get('first_lot_closing_time') or '',
'scraped_at': auc_dict.get('scraped_at') or '',
}
print(" OFFLINE: using DB record for auction")
self.visited_lots.add(url)
return page_data
conn.close()
return None
content = result['content']
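The fallback above builds its dicts by zipping cursor.description with the row; the same lookup can also be written with sqlite3.Row, sketched here as a hypothetical standalone helper rather than part of this commit:

import sqlite3

def load_record_by_url(db_path: str, table: str, url: str):
    # table is only ever 'lots' or 'auctions' here, so the f-string is safe.
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row  # rows behave like mappings keyed by column name
    try:
        row = conn.execute(f"SELECT * FROM {table} WHERE url = ?", (url,)).fetchone()
        return dict(row) if row else None
    finally:
        conn.close()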
@@ -251,24 +325,52 @@ class TroostwijkScraper:
except:
pass
# Fetch all API data concurrently (or use cache if HTML was cached)
# Fetch all API data concurrently (or use intercepted/cached data)
lot_id = page_data.get('lot_id')
auction_id = page_data.get('auction_id')
import sqlite3
if from_cache:
# Step 1: Check if we intercepted API data during page load
intercepted_data = None
if lot_id in self.intercepted_api_data:
print(f" Using intercepted API data (free!)")
try:
intercepted_json = self.intercepted_api_data[lot_id]
intercepted_data = json.loads(intercepted_json)
# Store the raw JSON for future offline use
page_data['api_data_json'] = intercepted_json
# Extract lot data from intercepted response
if 'data' in intercepted_data and 'lot' in intercepted_data['data']:
lot_api_data = intercepted_data['data']['lot']
# Format it as if it came from our fetch_lot_bidding_data
bidding_data = {'lot': lot_api_data}
from_cache = False # We have fresh data
except Exception as e:
print(f" Error parsing intercepted data: {e}")
intercepted_data = None
if intercepted_data:
# We got free API data from interception - skip the fetch logic
pass
elif from_cache:
# Check if we have cached API data in database
conn = sqlite3.connect(self.cache.db_path)
cursor = conn.cursor()
cursor.execute("""
SELECT followers_count, estimated_min_price, current_bid, bid_count
SELECT followers_count, estimated_min_price, current_bid, bid_count, closing_time
FROM lots WHERE lot_id = ?
""", (lot_id,))
existing = cursor.fetchone()
conn.close()
# Use cached API data if available and not null
if existing and existing[0] is not None:
# Data quality check: Must have followers_count AND closing_time to be considered "complete"
# This prevents using stale records like old "0 bids" entries
is_complete = (existing and
existing[0] is not None and # followers_count exists
existing[4] is not None and # closing_time exists
existing[4] != '') # closing_time is not empty
if is_complete:
print(f" Using cached API data")
page_data['followers_count'] = existing[0]
page_data['estimated_min_price'] = existing[1]
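The completeness rule above can be read as a small predicate over the selected columns; a hedged sketch (hypothetical helper, mirroring the tuple order of the SELECT above):

def is_cached_row_complete(row) -> bool:
    # Column order follows the SELECT above:
    # (followers_count, estimated_min_price, current_bid, bid_count, closing_time)
    # A row only counts as complete with a followers_count and a non-empty
    # closing_time, which filters out stale "0 bids" placeholder records.
    return bool(row) and row[0] is not None and row[4] not in (None, '')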
@@ -287,6 +389,7 @@ class TroostwijkScraper:
bid_history_data = None # Will fetch after we have lot_uuid
else:
# Fresh page fetch - make concurrent API calls for all data
if not self.offline:
print(f" Fetching lot data from API (concurrent)...")
api_tasks = [fetch_lot_bidding_data(lot_id)]
task_map = {'bidding': 0} # Track which index corresponds to which task
@@ -315,6 +418,10 @@ class TroostwijkScraper:
results = await asyncio.gather(*api_tasks, return_exceptions=True)
bidding_data = results[task_map['bidding']] if results and not isinstance(results[task_map['bidding']], Exception) else None
# Store raw API JSON for offline replay
if bidding_data:
page_data['api_data_json'] = json.dumps(bidding_data)
# Process auction data if it was fetched
if 'auction' in task_map and len(results) > task_map['auction']:
auction_data = results[task_map['auction']]
@@ -470,6 +577,39 @@ class TroostwijkScraper:
async def crawl_auctions(self, max_pages: int = MAX_PAGES) -> List[Dict]:
"""Main crawl function"""
if self.offline:
print("Launching OFFLINE crawl (no network requests)")
# Gather URLs from database
import sqlite3
conn = sqlite3.connect(self.cache.db_path)
cur = conn.cursor()
cur.execute("SELECT DISTINCT url FROM auctions")
auction_urls = [r[0] for r in cur.fetchall() if r and r[0]]
cur.execute("SELECT DISTINCT url FROM lots")
lot_urls = [r[0] for r in cur.fetchall() if r and r[0]]
conn.close()
print(f" OFFLINE: {len(auction_urls)} auctions and {len(lot_urls)} lots in DB")
results: List[Dict] = []
# Optionally process auctions (parse cached HTML if exists or DB fallback)
for i, auc_url in enumerate(auction_urls):
print(f"\n[AUC {i+1:>3}/{len(auction_urls)}] ", end="")
page_data = await self.crawl_page(page=None, url=auc_url)
if page_data:
results.append(page_data)
print("\n" + "="*60)
print("PHASE OFFLINE: PROCESSING LOT PAGES FROM DB/CACHE")
print("="*60)
for i, lot_url in enumerate(lot_urls):
print(f"\n[LOT {i+1:>3}/{len(lot_urls)}] ", end="")
page_data = await self.crawl_page(page=None, url=lot_url)
if page_data:
results.append(page_data)
return results
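Putting the pieces together, an offline re-run could look like the sketch below; the scraper module name and the no-argument constructor are assumptions, not confirmed by this diff:

import asyncio
from scraper import TroostwijkScraper  # module name assumed

async def main():
    # With SCAEV_OFFLINE=1 in the environment, crawl_auctions takes the DB/cache path above.
    scraper = TroostwijkScraper()  # assumed to take no required arguments
    results = await scraper.crawl_auctions()
    print(f"Re-processed {len(results)} pages from DB/cache")

asyncio.run(main())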
async with async_playwright() as p:
print("Launching browser...")
browser = await p.chromium.launch(
@@ -491,6 +631,32 @@ class TroostwijkScraper:
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
})
# Set up GraphQL API interception
async def handle_response(response):
"""Intercept GraphQL API responses"""
if 'graphql' in response.url and response.status == 200:
try:
body = await response.body()
body_text = body.decode('utf-8')
# Key intercepted responses by a lot identifier taken from the response body;
# the request URL is always .../storefront/graphql, so it carries no per-lot id.
# We store by the lot's urlSlug extracted from the response data
data = json.loads(body_text)
# Check if this is a lot details query
if 'data' in data and 'lot' in data.get('data', {}):
lot_data = data['data']['lot']
lot_slug = lot_data.get('urlSlug', '')
if lot_slug:
self.intercepted_api_data[lot_slug] = body_text
print(f" >> Intercepted API data for: {lot_slug}")
except Exception as e:
# Silent fail - interception is opportunistic
pass
page.on('response', handle_response)
all_auction_urls = []
all_lot_urls = []