enrich data
@@ -2,9 +2,9 @@
 """
 Client for fetching bid history from Troostwijk REST API
 """
-import aiohttp
 from typing import Dict, List, Optional
 from datetime import datetime
+import config

 BID_HISTORY_ENDPOINT = "https://shared-api.tbauctions.com/bidmanagement/lots/{lot_uuid}/bidding-history"

@@ -20,6 +20,13 @@ async def fetch_bid_history(lot_uuid: str, page_size: int = 100) -> Optional[Lis
     Returns:
         List of bid dictionaries or None if request fails
     """
+    if config.OFFLINE:
+        # Offline mode: do not perform any network requests
+        print(" OFFLINE: skipping bid history fetch")
+        return None
+
+    import aiohttp
+
     all_bids = []
     page_number = 1
     has_more = True
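A minimal usage sketch (not part of the commit). Deferring import aiohttp into the function body means the module can be imported, and the offline path exercised, without aiohttp even being needed. The module name bid_history and the placeholder UUID are assumptions for illustration only.

import asyncio
import os

os.environ.setdefault("SCAEV_OFFLINE", "1")  # must be set before config is first imported

from bid_history import fetch_bid_history  # module name assumed for this sketch


async def demo() -> None:
    lot_uuid = "00000000-0000-0000-0000-000000000000"  # placeholder UUID
    bids = await fetch_bid_history(lot_uuid)
    if bids is None:
        print("no bid history (offline mode or request failed)")
    else:
        print(f"fetched {len(bids)} bids")


if __name__ == "__main__":
    asyncio.run(demo())
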
@@ -112,6 +112,7 @@ class CacheManager:
                 reserve_price REAL,
                 reserve_met INTEGER,
                 view_count INTEGER,
+                api_data_json TEXT,
                 FOREIGN KEY (auction_id) REFERENCES auctions(auction_id)
             )
         """)
@@ -270,8 +271,8 @@ class CacheManager:
                 year_manufactured, condition_score, condition_description,
                 serial_number, manufacturer, damage_description,
                 followers_count, estimated_min_price, estimated_max_price, lot_condition, appearance,
-                scraped_at)
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                scraped_at, api_data_json)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
             """, (
                 lot_data['lot_id'],
                 lot_data.get('auction_id', ''),
@@ -306,7 +307,8 @@ class CacheManager:
                 lot_data.get('estimated_max_price'),
                 lot_data.get('lot_condition', ''),
                 lot_data.get('appearance', ''),
-                lot_data['scraped_at']
+                lot_data['scraped_at'],
+                lot_data.get('api_data_json')
             ))
             conn.commit()

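The new api_data_json column keeps the raw GraphQL payload next to the parsed fields, so a later offline run can replay it. A minimal read-back sketch, assuming only the lots table and api_data_json column created above (the function name and db_path parameter are illustrative):

import json
import sqlite3


def load_cached_api_data(db_path: str, lot_id: str):
    """Return the parsed GraphQL payload cached for a lot, or None if absent."""
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT api_data_json FROM lots WHERE lot_id = ?", (lot_id,)
        ).fetchone()
    finally:
        conn.close()
    return json.loads(row[0]) if row and row[0] else None
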
@@ -4,6 +4,7 @@ Configuration module for Scaev Auctions Scraper
 """

 import sys
+import os
 from pathlib import Path

 # Require Python 3.10+
@@ -19,7 +20,12 @@ OUTPUT_DIR = "/mnt/okcomputer/output"
 IMAGES_DIR = "/mnt/okcomputer/output/images"
 RATE_LIMIT_SECONDS = 0.5  # EXACTLY 0.5 seconds between requests
 MAX_PAGES = 50  # Number of listing pages to crawl
-DOWNLOAD_IMAGES = True  # Set to True to download images
+# OFFLINE mode: when enabled, no network calls are performed; only DB/cache are used
+OFFLINE = os.getenv("SCAEV_OFFLINE", "0").strip().lower() in {"1", "true", "yes", "on"}
+
+# Image downloading can be disabled explicitly; in OFFLINE it's always disabled
+DOWNLOAD_IMAGES = False if OFFLINE else True
+
 # Setup directories
 Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
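For reference, a small sketch of which SCAEV_OFFLINE values the expression above accepts (the loop and sample values are illustrative, not part of config.py). Note that OFFLINE is evaluated once at import time, so the variable must be set before config is first imported:

import os

for value in ("1", "true", "YES", " on ", "0", "off", ""):
    os.environ["SCAEV_OFFLINE"] = value
    offline = os.getenv("SCAEV_OFFLINE", "0").strip().lower() in {"1", "true", "yes", "on"}
    print(f"SCAEV_OFFLINE={value!r} -> OFFLINE={offline}")
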
@@ -2,8 +2,8 @@
 """
 GraphQL client for fetching lot bidding data from Troostwijk API
 """
-import aiohttp
 from typing import Dict, Optional
+import config

 GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"

@@ -86,6 +86,13 @@ async def fetch_auction_data(auction_id: str) -> Optional[Dict]:
     Returns:
         Dict with auction data or None if request fails
     """
+    if config.OFFLINE:
+        # Offline mode: do not perform any network requests
+        print(" OFFLINE: skipping GraphQL auction fetch")
+        return None
+
+    import aiohttp
+
     variables = {
         "auctionId": auction_id,
         "locale": "nl",
@@ -122,6 +129,13 @@ async def fetch_lot_bidding_data(lot_display_id: str) -> Optional[Dict]:
     Returns:
         Dict with bidding data or None if request fails
     """
+    if config.OFFLINE:
+        # Offline mode: do not perform any network requests
+        print(" OFFLINE: skipping GraphQL lot bidding fetch")
+        return None
+
+    import aiohttp
+
     variables = {
         "lotDisplayId": lot_display_id,
         "locale": "nl",
@@ -31,6 +31,8 @@ def main():

     print("Scaev Auctions Scraper")
     print("=" * 60)
+    if config.OFFLINE:
+        print("OFFLINE MODE ENABLED — only database and cache will be used (no network)")
     print(f"Rate limit: {config.RATE_LIMIT_SECONDS} seconds BETWEEN EVERY REQUEST")
     print(f"Cache database: {config.CACHE_DB}")
     print(f"Output directory: {config.OUTPUT_DIR}")
@@ -103,6 +103,8 @@ class AuctionMonitor:
         print("="*60)
         print("AUCTION MONITOR STARTED")
         print("="*60)
+        if config.OFFLINE:
+            print("OFFLINE MODE ENABLED — only database and cache will be used (no network)")
         print(f"Poll interval: {self.poll_interval / 60:.0f} minutes")
         print(f"Cache database: {config.CACHE_DB}")
         print(f"Rate limit: {config.RATE_LIMIT_SECONDS}s between requests")
src/scraper.py
@@ -16,7 +16,7 @@ from urllib.parse import urljoin
 from playwright.async_api import async_playwright, Page

 from config import (
-    BASE_URL, RATE_LIMIT_SECONDS, MAX_PAGES, DOWNLOAD_IMAGES, IMAGES_DIR
+    BASE_URL, RATE_LIMIT_SECONDS, MAX_PAGES, DOWNLOAD_IMAGES, IMAGES_DIR, OFFLINE
 )
 from cache import CacheManager
 from parse import DataParser
@@ -38,6 +38,8 @@ class TroostwijkScraper:
         self.visited_lots: Set[str] = set()
         self.last_request_time = 0
         self.download_images = DOWNLOAD_IMAGES
+        self.intercepted_api_data: Dict[str, str] = {}  # Store intercepted GraphQL responses by lot_id
+        self.offline = OFFLINE

     async def _download_image(self, session: 'aiohttp.ClientSession', url: str, lot_id: str, index: int) -> Optional[str]:
         """Download an image and save it locally (without rate limiting - concurrent within lot)"""
@@ -102,6 +104,11 @@ class TroostwijkScraper:
             print(f" CACHE HIT: {url} ({cache_time:.0f}ms)")
             return {'content': cached['content'], 'from_cache': True}

+        # In OFFLINE mode we never fetch from network
+        if self.offline:
+            print(f" OFFLINE: cache miss for {url} — skipping fetch")
+            return None
+
         await self._rate_limit()

         try:
@@ -205,6 +212,73 @@ class TroostwijkScraper:

         result = await self._get_page(page, url)
         if not result:
+            # OFFLINE fallback: try to construct page data directly from DB
+            if self.offline:
+                import sqlite3
+                conn = sqlite3.connect(self.cache.db_path)
+                cur = conn.cursor()
+                # Try lot first
+                cur.execute("SELECT * FROM lots WHERE url = ?", (url,))
+                lot_row = cur.fetchone()
+                if lot_row:
+                    # Build a dict using column names
+                    col_names = [d[0] for d in cur.description]
+                    lot_dict = dict(zip(col_names, lot_row))
+                    conn.close()
+                    page_data = {
+                        'type': 'lot',
+                        'lot_id': lot_dict.get('lot_id'),
+                        'auction_id': lot_dict.get('auction_id'),
+                        'url': lot_dict.get('url') or url,
+                        'title': lot_dict.get('title') or '',
+                        'current_bid': lot_dict.get('current_bid') or '',
+                        'bid_count': lot_dict.get('bid_count') or 0,
+                        'closing_time': lot_dict.get('closing_time') or '',
+                        'viewing_time': lot_dict.get('viewing_time') or '',
+                        'pickup_date': lot_dict.get('pickup_date') or '',
+                        'location': lot_dict.get('location') or '',
+                        'description': lot_dict.get('description') or '',
+                        'category': lot_dict.get('category') or '',
+                        'status': lot_dict.get('status') or '',
+                        'brand': lot_dict.get('brand') or '',
+                        'model': lot_dict.get('model') or '',
+                        'attributes_json': lot_dict.get('attributes_json') or '',
+                        'first_bid_time': lot_dict.get('first_bid_time'),
+                        'last_bid_time': lot_dict.get('last_bid_time'),
+                        'bid_velocity': lot_dict.get('bid_velocity'),
+                        'followers_count': lot_dict.get('followers_count') or 0,
+                        'estimated_min_price': lot_dict.get('estimated_min_price'),
+                        'estimated_max_price': lot_dict.get('estimated_max_price'),
+                        'lot_condition': lot_dict.get('lot_condition') or '',
+                        'appearance': lot_dict.get('appearance') or '',
+                        'scraped_at': lot_dict.get('scraped_at') or '',
+                    }
+                    print(" OFFLINE: using DB record for lot")
+                    self.visited_lots.add(url)
+                    return page_data
+
+                # Try auction by URL
+                cur.execute("SELECT * FROM auctions WHERE url = ?", (url,))
+                auc_row = cur.fetchone()
+                if auc_row:
+                    col_names = [d[0] for d in cur.description]
+                    auc_dict = dict(zip(col_names, auc_row))
+                    conn.close()
+                    page_data = {
+                        'type': 'auction',
+                        'auction_id': auc_dict.get('auction_id'),
+                        'url': auc_dict.get('url') or url,
+                        'title': auc_dict.get('title') or '',
+                        'location': auc_dict.get('location') or '',
+                        'lots_count': auc_dict.get('lots_count') or 0,
+                        'first_lot_closing_time': auc_dict.get('first_lot_closing_time') or '',
+                        'scraped_at': auc_dict.get('scraped_at') or '',
+                    }
+                    print(" OFFLINE: using DB record for auction")
+                    self.visited_lots.add(url)
+                    return page_data
+
+                conn.close()
             return None

         content = result['content']
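The fallback above rebuilds a dict by zipping cursor.description with the fetched row. An equivalent sketch using sqlite3.Row, shown only as a design alternative (function name and parameters are illustrative):

import sqlite3


def load_lot_by_url(db_path: str, url: str):
    """Fetch one lot row as a plain dict, mirroring the zip-based fallback above."""
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row  # rows become mapping-like objects
    try:
        row = conn.execute("SELECT * FROM lots WHERE url = ?", (url,)).fetchone()
        return dict(row) if row else None
    finally:
        conn.close()
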
@@ -251,24 +325,52 @@ class TroostwijkScraper:
                 except:
                     pass

-        # Fetch all API data concurrently (or use cache if HTML was cached)
+        # Fetch all API data concurrently (or use intercepted/cached data)
         lot_id = page_data.get('lot_id')
         auction_id = page_data.get('auction_id')
         import sqlite3

-        if from_cache:
+        # Step 1: Check if we intercepted API data during page load
+        intercepted_data = None
+        if lot_id in self.intercepted_api_data:
+            print(f" Using intercepted API data (free!)")
+            try:
+                intercepted_json = self.intercepted_api_data[lot_id]
+                intercepted_data = json.loads(intercepted_json)
+                # Store the raw JSON for future offline use
+                page_data['api_data_json'] = intercepted_json
+                # Extract lot data from intercepted response
+                if 'data' in intercepted_data and 'lot' in intercepted_data['data']:
+                    lot_api_data = intercepted_data['data']['lot']
+                    # Format it as if it came from our fetch_lot_bidding_data
+                    bidding_data = {'lot': lot_api_data}
+                    from_cache = False  # We have fresh data
+            except Exception as e:
+                print(f" Error parsing intercepted data: {e}")
+                intercepted_data = None
+
+        if intercepted_data:
+            # We got free API data from interception - skip the fetch logic
+            pass
+        elif from_cache:
             # Check if we have cached API data in database
             conn = sqlite3.connect(self.cache.db_path)
             cursor = conn.cursor()
             cursor.execute("""
-                SELECT followers_count, estimated_min_price, current_bid, bid_count
+                SELECT followers_count, estimated_min_price, current_bid, bid_count, closing_time
                 FROM lots WHERE lot_id = ?
             """, (lot_id,))
             existing = cursor.fetchone()
             conn.close()

-            # Use cached API data if available and not null
-            if existing and existing[0] is not None:
+            # Data quality check: Must have followers_count AND closing_time to be considered "complete"
+            # This prevents using stale records like old "0 bids" entries
+            is_complete = (existing and
+                           existing[0] is not None and  # followers_count exists
+                           existing[4] is not None and  # closing_time exists
+                           existing[4] != '')           # closing_time is not empty
+
+            if is_complete:
                 print(f" Using cached API data")
                 page_data['followers_count'] = existing[0]
                 page_data['estimated_min_price'] = existing[1]
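A small illustration of the completeness rule above; the sample tuples are made up, and the column order matches the SELECT (followers_count, estimated_min_price, current_bid, bid_count, closing_time):

rows = [
    (12, 150.0, "200", 5, "2024-06-01T18:00:00Z"),  # complete: reuse cached API data
    (None, None, "0", 0, None),                      # stale "0 bids" record: refetch
    (3, 80.0, "50", 1, ""),                          # missing closing_time: refetch
]

for existing in rows:
    is_complete = (existing and
                   existing[0] is not None and  # followers_count exists
                   existing[4] is not None and  # closing_time exists
                   existing[4] != '')           # closing_time is not empty
    print(existing, "->", "use cached API data" if is_complete else "refetch")
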
@@ -287,7 +389,8 @@ class TroostwijkScraper:
                 bid_history_data = None  # Will fetch after we have lot_uuid
         else:
             # Fresh page fetch - make concurrent API calls for all data
-            print(f" Fetching lot data from API (concurrent)...")
+            if not self.offline:
+                print(f" Fetching lot data from API (concurrent)...")
             api_tasks = [fetch_lot_bidding_data(lot_id)]
             task_map = {'bidding': 0}  # Track which index corresponds to which task

@@ -315,6 +418,10 @@ class TroostwijkScraper:
             results = await asyncio.gather(*api_tasks, return_exceptions=True)
             bidding_data = results[task_map['bidding']] if results and not isinstance(results[task_map['bidding']], Exception) else None

+            # Store raw API JSON for offline replay
+            if bidding_data:
+                page_data['api_data_json'] = json.dumps(bidding_data)
+
             # Process auction data if it was fetched
             if 'auction' in task_map and len(results) > task_map['auction']:
                 auction_data = results[task_map['auction']]
@@ -470,6 +577,39 @@ class TroostwijkScraper:

     async def crawl_auctions(self, max_pages: int = MAX_PAGES) -> List[Dict]:
         """Main crawl function"""
+        if self.offline:
+            print("Launching OFFLINE crawl (no network requests)")
+            # Gather URLs from database
+            import sqlite3
+            conn = sqlite3.connect(self.cache.db_path)
+            cur = conn.cursor()
+            cur.execute("SELECT DISTINCT url FROM auctions")
+            auction_urls = [r[0] for r in cur.fetchall() if r and r[0]]
+            cur.execute("SELECT DISTINCT url FROM lots")
+            lot_urls = [r[0] for r in cur.fetchall() if r and r[0]]
+            conn.close()
+
+            print(f" OFFLINE: {len(auction_urls)} auctions and {len(lot_urls)} lots in DB")
+
+            results: List[Dict] = []
+            # Optionally process auctions (parse cached HTML if exists or DB fallback)
+            for i, auc_url in enumerate(auction_urls):
+                print(f"\n[AUC {i+1:>3}/{len(auction_urls)}] ", end="")
+                page_data = await self.crawl_page(page=None, url=auc_url)
+                if page_data:
+                    results.append(page_data)
+
+            print("\n" + "="*60)
+            print("PHASE OFFLINE: PROCESSING LOT PAGES FROM DB/CACHE")
+            print("="*60)
+            for i, lot_url in enumerate(lot_urls):
+                print(f"\n[LOT {i+1:>3}/{len(lot_urls)}] ", end="")
+                page_data = await self.crawl_page(page=None, url=lot_url)
+                if page_data:
+                    results.append(page_data)
+
+            return results
+
         async with async_playwright() as p:
             print("Launching browser...")
             browser = await p.chromium.launch(
@@ -491,6 +631,32 @@ class TroostwijkScraper:
                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
             })

+            # Set up GraphQL API interception
+            async def handle_response(response):
+                """Intercept GraphQL API responses"""
+                if 'graphql' in response.url and response.status == 200:
+                    try:
+                        body = await response.body()
+                        body_text = body.decode('utf-8')
+
+                        # Try to extract lot_id from the request to key our cache
+                        # The URL pattern is typically: .../storefront/graphql
+                        # We'll store by lot_id which we extract from the response data
+                        data = json.loads(body_text)
+
+                        # Check if this is a lot details query
+                        if 'data' in data and 'lot' in data.get('data', {}):
+                            lot_data = data['data']['lot']
+                            lot_slug = lot_data.get('urlSlug', '')
+                            if lot_slug:
+                                self.intercepted_api_data[lot_slug] = body_text
+                                print(f" >> Intercepted API data for: {lot_slug}")
+                    except Exception as e:
+                        # Silent fail - interception is opportunistic
+                        pass
+
+            page.on('response', handle_response)
+
             all_auction_urls = []
             all_lot_urls = []

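A self-contained sketch of the same interception pattern outside the scraper class, assuming only the handler logic committed above; the start URL is illustrative:

import asyncio
import json

from playwright.async_api import async_playwright


async def main() -> None:
    captured: dict[str, str] = {}

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()

        async def handle_response(response) -> None:
            # Mirror of the committed handler: opportunistically keep GraphQL lot payloads
            if 'graphql' in response.url and response.status == 200:
                try:
                    body_text = (await response.body()).decode('utf-8')
                    data = json.loads(body_text)
                    lot = data.get('data', {}).get('lot')
                    if lot and lot.get('urlSlug'):
                        captured[lot['urlSlug']] = body_text
                except Exception:
                    pass  # interception is best-effort; ignore unrelated responses

        page.on('response', handle_response)
        await page.goto("https://www.troostwijkauctions.com/")  # illustrative start URL
        await browser.close()

    print(f"captured {len(captured)} GraphQL lot payloads")


if __name__ == "__main__":
    asyncio.run(main())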