diff --git a/src/scraper.py b/src/scraper.py
index b353212..cdca099 100644
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -84,12 +84,15 @@ class TroostwijkScraper:
         self.last_request_time = time.time()

-    async def _get_page(self, page: Page, url: str, use_cache: bool = True, fast_mode: bool = False) -> Optional[str]:
+    async def _get_page(self, page: Page, url: str, use_cache: bool = True, fast_mode: bool = False) -> Optional[Dict]:
         """Get page content with caching and strict rate limiting

         Args:
             fast_mode: If True, use 'domcontentloaded' instead of 'networkidle'
                        for faster loading (useful for auction listing pages
                        where we just need HTML structure)
+
+        Returns:
+            Dict with 'content' and 'from_cache' keys
         """
         if use_cache:
             cache_start = time.time()
@@ -97,7 +100,7 @@ class TroostwijkScraper:
             if cached:
                 cache_time = (time.time() - cache_start) * 1000
                 print(f" CACHE HIT: {url} ({cache_time:.0f}ms)")
-                return cached['content']
+                return {'content': cached['content'], 'from_cache': True}

         await self._rate_limit()

@@ -118,7 +121,7 @@ class TroostwijkScraper:
             total_time = time.time() - fetch_start
             self.cache.set(url, content, 200)
             print(f" [Timing: goto={goto_time:.2f}s, total={total_time:.2f}s, mode={wait_strategy}]")
-            return content
+            return {'content': content, 'from_cache': False}

         except Exception as e:
             print(f" ERROR: {e}")
@@ -158,21 +161,22 @@ class TroostwijkScraper:
         print(f"{'='*60}")

         # Use fast mode - we only need HTML structure for link extraction
-        content = await self._get_page(page, url, fast_mode=True)
-        if not content:
+        result = await self._get_page(page, url, fast_mode=True)
+        if not result:
             return []

-        auction_urls = self._extract_auction_urls_from_listing(content)
+        auction_urls = self._extract_auction_urls_from_listing(result['content'])
         print(f"→ Found {len(auction_urls)} auction URLs")
         return auction_urls

     async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]:
         """Crawl an auction page and extract lot URLs"""
         # Use fast mode for auction pages - we only need the HTML structure, not all assets
-        content = await self._get_page(page, auction_url, fast_mode=True)
-        if not content:
+        result = await self._get_page(page, auction_url, fast_mode=True)
+        if not result:
             return []

+        content = result['content']
         parse_start = time.time()
         page_data = self.parser.parse_page(content, auction_url)
         parse_time = (time.time() - parse_start) * 1000
@@ -199,10 +203,12 @@ class TroostwijkScraper:
         page_id = self.parser.extract_lot_id(url)
         print(f"\n[PAGE {page_id}]")

-        content = await self._get_page(page, url)
-        if not content:
+        result = await self._get_page(page, url)
+        if not result:
             return None

+        content = result['content']
+        from_cache = result['from_cache']
         page_data = self.parser.parse_page(content, url)
         if not page_data:
             return None
@@ -245,34 +251,85 @@ class TroostwijkScraper:
         except:
             pass

-        # Fetch bidding data from GraphQL API (skip if already complete in DB)
+        # Fetch all API data concurrently (or use cache if HTML was cached)
         lot_id = page_data.get('lot_id')
+        auction_id = page_data.get('auction_id')

-        # Check if lot already has complete API data in database
-        import sqlite3
-        conn = sqlite3.connect(self.cache.db_path)
-        cursor = conn.cursor()
-        cursor.execute("""
-            SELECT followers_count, estimated_min_price, current_bid, bid_count
-            FROM lots WHERE lot_id = ?
- """, (lot_id,)) - existing = cursor.fetchone() - conn.close() + if from_cache: + # Check if we have cached API data in database + import sqlite3 + conn = sqlite3.connect(self.cache.db_path) + cursor = conn.cursor() + cursor.execute(""" + SELECT followers_count, estimated_min_price, current_bid, bid_count + FROM lots WHERE lot_id = ? + """, (lot_id,)) + existing = cursor.fetchone() + conn.close() - # Skip API if we have complete data (followers_count indicates API was called) - skip_api = existing and existing[0] is not None - - if skip_api: - print(f" Using cached API data") - bidding_data = None - # Load cached data for display - page_data['followers_count'] = existing[0] - page_data['estimated_min_price'] = existing[1] - page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids') - page_data['bid_count'] = existing[3] or 0 + # Use cached API data if available and not null + if existing and existing[0] is not None: + print(f" Using cached API data") + page_data['followers_count'] = existing[0] + page_data['estimated_min_price'] = existing[1] + page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids') + page_data['bid_count'] = existing[3] or 0 + bidding_data = None + bid_history_data = None + else: + print(f" Fetching lot data from API (concurrent)...") + # Make concurrent API calls + api_tasks = [fetch_lot_bidding_data(lot_id)] + if auction_id: + api_tasks.append(fetch_auction_data(auction_id)) + results = await asyncio.gather(*api_tasks, return_exceptions=True) + bidding_data = results[0] if results and not isinstance(results[0], Exception) else None + bid_history_data = None # Will fetch after we have lot_uuid else: - print(f" Fetching bidding data from API...") - bidding_data = await fetch_lot_bidding_data(lot_id) + # Fresh page fetch - make concurrent API calls for all data + print(f" Fetching lot data from API (concurrent)...") + api_tasks = [fetch_lot_bidding_data(lot_id)] + task_map = {'bidding': 0} # Track which index corresponds to which task + + # Add auction data fetch if we need viewing/pickup times + if auction_id: + conn = sqlite3.connect(self.cache.db_path) + cursor = conn.cursor() + cursor.execute(""" + SELECT viewing_time, pickup_date FROM lots WHERE lot_id = ? 
+ """, (lot_id,)) + times = cursor.fetchone() + conn.close() + has_times = times and (times[0] or times[1]) + + if not has_times: + task_map['auction'] = len(api_tasks) + api_tasks.append(fetch_auction_data(auction_id)) + + # Add bid history fetch if we have lot_uuid and expect bids + if lot_uuid: + task_map['bid_history'] = len(api_tasks) + api_tasks.append(fetch_bid_history(lot_uuid)) + + # Execute all API calls concurrently + results = await asyncio.gather(*api_tasks, return_exceptions=True) + bidding_data = results[task_map['bidding']] if results and not isinstance(results[task_map['bidding']], Exception) else None + + # Process auction data if it was fetched + if 'auction' in task_map and len(results) > task_map['auction']: + auction_data = results[task_map['auction']] + if not isinstance(auction_data, Exception) and auction_data: + auction_times = format_auction_data(auction_data) + page_data.update(auction_times) + + # Process bid history if it was fetched + bid_history_data = None + if 'bid_history' in task_map and len(results) > task_map['bid_history']: + bid_history_data = results[task_map['bid_history']] + if isinstance(bid_history_data, Exception): + bid_history_data = None if bidding_data: formatted_data = format_bid_data(bidding_data) @@ -342,9 +397,24 @@ class TroostwijkScraper: if not lot_uuid: lot_uuid = lot_details_lot.get('id') - # Fetch bid history for intelligence (skip if already in DB) - if lot_uuid and page_data.get('bid_count', 0) > 0: - # Check if bid history already exists + # Process bid history if we fetched it concurrently + if not from_cache and 'bid_history_data' in locals() and bid_history_data and page_data.get('bid_count', 0) > 0: + bid_data = parse_bid_history(bid_history_data, lot_id) + page_data.update(bid_data) + print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour") + self.cache.save_bid_history(lot_id, bid_data['bid_records']) + elif not from_cache and lot_uuid and page_data.get('bid_count', 0) > 0: + # Fallback: fetch bid history if we didn't get it in the concurrent batch + # (This happens when lot_uuid wasn't available before the first API call) + print(f" Fetching bid history...") + bid_history = await fetch_bid_history(lot_uuid) + if bid_history: + bid_data = parse_bid_history(bid_history, lot_id) + page_data.update(bid_data) + print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour") + self.cache.save_bid_history(lot_id, bid_data['bid_records']) + elif from_cache and page_data.get('bid_count', 0) > 0: + # Check if cached bid history exists conn = sqlite3.connect(self.cache.db_path) cursor = conn.cursor() cursor.execute(""" @@ -352,37 +422,8 @@ class TroostwijkScraper: """, (lot_id,)) has_history = cursor.fetchone()[0] > 0 conn.close() - - if not has_history: - print(f" Fetching bid history...") - bid_history = await fetch_bid_history(lot_uuid) - if bid_history: - bid_data = parse_bid_history(bid_history, lot_id) - page_data.update(bid_data) - print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour") - self.cache.save_bid_history(lot_id, bid_data['bid_records']) - else: + if has_history: print(f" Bid history cached") - - # Fetch auction data for viewing/pickup times (skip if already in DB) - auction_id = page_data.get('auction_id') - if auction_id: - # Check if lot already has viewing/pickup times - conn = sqlite3.connect(self.cache.db_path) - cursor = conn.cursor() - cursor.execute(""" - SELECT viewing_time, pickup_date FROM lots WHERE lot_id = ? 
- """, (lot_id,)) - times = cursor.fetchone() - conn.close() - - has_times = times and (times[0] or times[1]) - - if not has_times: - auction_data = await fetch_auction_data(auction_id) - if auction_data: - auction_times = format_auction_data(auction_data) - page_data.update(auction_times) else: print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")