remove-field-scraped_at_timestamp

commit 207916c1fe (parent d67cb15748)
Author: Tour
Date:   2025-12-08 13:07:45 +01:00


@@ -84,12 +84,15 @@ class TroostwijkScraper:
         self.last_request_time = time.time()
 
-    async def _get_page(self, page: Page, url: str, use_cache: bool = True, fast_mode: bool = False) -> Optional[str]:
+    async def _get_page(self, page: Page, url: str, use_cache: bool = True, fast_mode: bool = False) -> Optional[Dict]:
         """Get page content with caching and strict rate limiting
 
         Args:
             fast_mode: If True, use 'domcontentloaded' instead of 'networkidle' for faster loading
                        (useful for auction listing pages where we just need HTML structure)
+
+        Returns:
+            Dict with 'content' and 'from_cache' keys
         """
         if use_cache:
             cache_start = time.time()
@@ -97,7 +100,7 @@ class TroostwijkScraper:
             if cached:
                 cache_time = (time.time() - cache_start) * 1000
                 print(f" CACHE HIT: {url} ({cache_time:.0f}ms)")
-                return cached['content']
+                return {'content': cached['content'], 'from_cache': True}
 
         await self._rate_limit()
@@ -118,7 +121,7 @@ class TroostwijkScraper:
                 total_time = time.time() - fetch_start
                 self.cache.set(url, content, 200)
                 print(f" [Timing: goto={goto_time:.2f}s, total={total_time:.2f}s, mode={wait_strategy}]")
-                return content
+                return {'content': content, 'from_cache': False}
 
             except Exception as e:
                 print(f" ERROR: {e}")
@@ -158,21 +161,22 @@ class TroostwijkScraper:
         print(f"{'='*60}")
 
         # Use fast mode - we only need HTML structure for link extraction
-        content = await self._get_page(page, url, fast_mode=True)
-        if not content:
+        result = await self._get_page(page, url, fast_mode=True)
+        if not result:
             return []
 
-        auction_urls = self._extract_auction_urls_from_listing(content)
+        auction_urls = self._extract_auction_urls_from_listing(result['content'])
         print(f"→ Found {len(auction_urls)} auction URLs")
         return auction_urls
 
     async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]:
         """Crawl an auction page and extract lot URLs"""
         # Use fast mode for auction pages - we only need the HTML structure, not all assets
-        content = await self._get_page(page, auction_url, fast_mode=True)
-        if not content:
+        result = await self._get_page(page, auction_url, fast_mode=True)
+        if not result:
             return []
+        content = result['content']
 
         parse_start = time.time()
         page_data = self.parser.parse_page(content, auction_url)
         parse_time = (time.time() - parse_start) * 1000
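Every caller now unpacks the dict by key. The commit uses a plain dict; a TypedDict would give the same shape a checkable name so a type checker can catch callers that still treat the result as a string. An editorial sketch, not part of the commit:

from typing import Optional, TypedDict

class PageResult(TypedDict):
    content: str
    from_cache: bool

def extract_content(result: Optional[PageResult]) -> Optional[str]:
    # Mirrors the `if not result: return []` guards added in the hunk above.
    if not result:
        return None
    return result['content']

print(extract_content({'content': '<html></html>', 'from_cache': False}))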
@@ -199,10 +203,12 @@ class TroostwijkScraper:
         page_id = self.parser.extract_lot_id(url)
         print(f"\n[PAGE {page_id}]")
 
-        content = await self._get_page(page, url)
-        if not content:
+        result = await self._get_page(page, url)
+        if not result:
             return None
+        content = result['content']
+        from_cache = result['from_cache']
 
         page_data = self.parser.parse_page(content, url)
         if not page_data:
             return None
@@ -245,10 +251,12 @@ class TroostwijkScraper:
                 except:
                     pass
 
-        # Fetch bidding data from GraphQL API (skip if already complete in DB)
+        # Fetch all API data concurrently (or use cache if HTML was cached)
         lot_id = page_data.get('lot_id')
+        auction_id = page_data.get('auction_id')
 
-        # Check if lot already has complete API data in database
-        import sqlite3
-        conn = sqlite3.connect(self.cache.db_path)
-        cursor = conn.cursor()
+        if from_cache:
+            # Check if we have cached API data in database
+            import sqlite3
+            conn = sqlite3.connect(self.cache.db_path)
+            cursor = conn.cursor()
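The from_cache branch re-probes SQLite before trusting cached API data. The SELECT itself falls between the hunks, but based on the columns consumed a few lines later (followers_count, estimated_min_price, current_bid, bid_count) it plausibly looks like the sketch below, where a non-NULL followers_count marks a lot whose API data was already fetched. Table and column names are inferred from the surrounding hunks, not confirmed by the diff:

import sqlite3
from typing import Optional, Tuple

def cached_api_row(db_path: str, lot_id: str) -> Optional[Tuple]:
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.execute(
            """SELECT followers_count, estimated_min_price, current_bid, bid_count
               FROM lots WHERE lot_id = ?""",
            (lot_id,),
        )
        return cur.fetchone()  # None, or a row whose [0] is NULL if the API never ran
    finally:
        conn.close()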
@@ -259,20 +267,67 @@ class TroostwijkScraper:
-        existing = cursor.fetchone()
-        conn.close()
+            existing = cursor.fetchone()
+            conn.close()
 
-        # Skip API if we have complete data (followers_count indicates API was called)
-        skip_api = existing and existing[0] is not None
-
-        if skip_api:
-            print(f" Using cached API data")
-            bidding_data = None
-            # Load cached data for display
-            page_data['followers_count'] = existing[0]
-            page_data['estimated_min_price'] = existing[1]
-            page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
-            page_data['bid_count'] = existing[3] or 0
-        else:
-            print(f" Fetching bidding data from API...")
-            bidding_data = await fetch_lot_bidding_data(lot_id)
+            # Use cached API data if available and not null
+            if existing and existing[0] is not None:
+                print(f" Using cached API data")
+                page_data['followers_count'] = existing[0]
+                page_data['estimated_min_price'] = existing[1]
+                page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
+                page_data['bid_count'] = existing[3] or 0
+                bidding_data = None
+                bid_history_data = None
+            else:
+                print(f" Fetching lot data from API (concurrent)...")
+                # Make concurrent API calls
+                api_tasks = [fetch_lot_bidding_data(lot_id)]
+                if auction_id:
+                    api_tasks.append(fetch_auction_data(auction_id))
+                results = await asyncio.gather(*api_tasks, return_exceptions=True)
+                bidding_data = results[0] if results and not isinstance(results[0], Exception) else None
+                bid_history_data = None  # Will fetch after we have lot_uuid
+        else:
+            # Fresh page fetch - make concurrent API calls for all data
+            print(f" Fetching lot data from API (concurrent)...")
+            api_tasks = [fetch_lot_bidding_data(lot_id)]
+            task_map = {'bidding': 0}  # Track which index corresponds to which task
+
+            # Add auction data fetch if we need viewing/pickup times
+            if auction_id:
+                conn = sqlite3.connect(self.cache.db_path)
+                cursor = conn.cursor()
+                cursor.execute("""
+                    SELECT viewing_time, pickup_date FROM lots WHERE lot_id = ?
+                """, (lot_id,))
+                times = cursor.fetchone()
+                conn.close()
+
+                has_times = times and (times[0] or times[1])
+                if not has_times:
+                    task_map['auction'] = len(api_tasks)
+                    api_tasks.append(fetch_auction_data(auction_id))
+
+            # Add bid history fetch if we have lot_uuid and expect bids
+            if lot_uuid:
+                task_map['bid_history'] = len(api_tasks)
+                api_tasks.append(fetch_bid_history(lot_uuid))
+
+            # Execute all API calls concurrently
+            results = await asyncio.gather(*api_tasks, return_exceptions=True)
+            bidding_data = results[task_map['bidding']] if results and not isinstance(results[task_map['bidding']], Exception) else None
+
+            # Process auction data if it was fetched
+            if 'auction' in task_map and len(results) > task_map['auction']:
+                auction_data = results[task_map['auction']]
+                if not isinstance(auction_data, Exception) and auction_data:
+                    auction_times = format_auction_data(auction_data)
+                    page_data.update(auction_times)
+
+            # Process bid history if it was fetched
+            bid_history_data = None
+            if 'bid_history' in task_map and len(results) > task_map['bid_history']:
+                bid_history_data = results[task_map['bid_history']]
+                if isinstance(bid_history_data, Exception):
+                    bid_history_data = None
 
         if bidding_data:
             formatted_data = format_bid_data(bidding_data)
@@ -342,18 +397,15 @@ class TroostwijkScraper:
             if not lot_uuid:
                 lot_uuid = lot_details_lot.get('id')
 
-            # Fetch bid history for intelligence (skip if already in DB)
-            if lot_uuid and page_data.get('bid_count', 0) > 0:
-                # Check if bid history already exists
-                conn = sqlite3.connect(self.cache.db_path)
-                cursor = conn.cursor()
-                cursor.execute("""
-                    SELECT COUNT(*) FROM bid_history WHERE lot_id = ?
-                """, (lot_id,))
-                has_history = cursor.fetchone()[0] > 0
-                conn.close()
-
-                if not has_history:
-                    print(f" Fetching bid history...")
-                    bid_history = await fetch_bid_history(lot_uuid)
-                    if bid_history:
+            # Process bid history if we fetched it concurrently
+            if not from_cache and 'bid_history_data' in locals() and bid_history_data and page_data.get('bid_count', 0) > 0:
+                bid_data = parse_bid_history(bid_history_data, lot_id)
+                page_data.update(bid_data)
+                print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
+                self.cache.save_bid_history(lot_id, bid_data['bid_records'])
+            elif not from_cache and lot_uuid and page_data.get('bid_count', 0) > 0:
+                # Fallback: fetch bid history if we didn't get it in the concurrent batch
+                # (This happens when lot_uuid wasn't available before the first API call)
+                print(f" Fetching bid history...")
+                bid_history = await fetch_bid_history(lot_uuid)
+                if bid_history:
@@ -361,28 +413,17 @@ class TroostwijkScraper:
-                        page_data.update(bid_data)
-                        print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
-                        self.cache.save_bid_history(lot_id, bid_data['bid_records'])
-                else:
-                    print(f" Bid history cached")
-
-            # Fetch auction data for viewing/pickup times (skip if already in DB)
-            auction_id = page_data.get('auction_id')
-            if auction_id:
-                # Check if lot already has viewing/pickup times
-                conn = sqlite3.connect(self.cache.db_path)
-                cursor = conn.cursor()
-                cursor.execute("""
-                    SELECT viewing_time, pickup_date FROM lots WHERE lot_id = ?
-                """, (lot_id,))
-                times = cursor.fetchone()
-                conn.close()
-
-                has_times = times and (times[0] or times[1])
-                if not has_times:
-                    auction_data = await fetch_auction_data(auction_id)
-                    if auction_data:
-                        auction_times = format_auction_data(auction_data)
-                        page_data.update(auction_times)
+                    page_data.update(bid_data)
+                    print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
+                    self.cache.save_bid_history(lot_id, bid_data['bid_records'])
+            elif from_cache and page_data.get('bid_count', 0) > 0:
+                # Check if cached bid history exists
+                conn = sqlite3.connect(self.cache.db_path)
+                cursor = conn.cursor()
+                cursor.execute("""
+                    SELECT COUNT(*) FROM bid_history WHERE lot_id = ?
+                """, (lot_id,))
+                has_history = cursor.fetchone()[0] > 0
+                conn.close()
+                if has_history:
+                    print(f" Bid history cached")
         else:
             print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")