remove-field-scraped_at_timestamp

commit 207916c1fe (parent d67cb15748)
Author: Tour
Date:   2025-12-08 13:07:45 +01:00


@@ -84,12 +84,15 @@ class TroostwijkScraper:
         self.last_request_time = time.time()
 
-    async def _get_page(self, page: Page, url: str, use_cache: bool = True, fast_mode: bool = False) -> Optional[str]:
+    async def _get_page(self, page: Page, url: str, use_cache: bool = True, fast_mode: bool = False) -> Optional[Dict]:
         """Get page content with caching and strict rate limiting
 
         Args:
             fast_mode: If True, use 'domcontentloaded' instead of 'networkidle' for faster loading
                        (useful for auction listing pages where we just need HTML structure)
+
+        Returns:
+            Dict with 'content' and 'from_cache' keys
         """
         if use_cache:
             cache_start = time.time()
@@ -97,7 +100,7 @@ class TroostwijkScraper:
             if cached:
                 cache_time = (time.time() - cache_start) * 1000
                 print(f" CACHE HIT: {url} ({cache_time:.0f}ms)")
-                return cached['content']
+                return {'content': cached['content'], 'from_cache': True}
 
         await self._rate_limit()
@@ -118,7 +121,7 @@ class TroostwijkScraper:
                 total_time = time.time() - fetch_start
                 self.cache.set(url, content, 200)
                 print(f" [Timing: goto={goto_time:.2f}s, total={total_time:.2f}s, mode={wait_strategy}]")
-                return content
+                return {'content': content, 'from_cache': False}
 
             except Exception as e:
                 print(f" ERROR: {e}")
@@ -158,21 +161,22 @@ class TroostwijkScraper:
         print(f"{'='*60}")
 
         # Use fast mode - we only need HTML structure for link extraction
-        content = await self._get_page(page, url, fast_mode=True)
-        if not content:
+        result = await self._get_page(page, url, fast_mode=True)
+        if not result:
             return []
 
-        auction_urls = self._extract_auction_urls_from_listing(content)
+        auction_urls = self._extract_auction_urls_from_listing(result['content'])
         print(f"→ Found {len(auction_urls)} auction URLs")
         return auction_urls
 
     async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]:
         """Crawl an auction page and extract lot URLs"""
         # Use fast mode for auction pages - we only need the HTML structure, not all assets
-        content = await self._get_page(page, auction_url, fast_mode=True)
-        if not content:
+        result = await self._get_page(page, auction_url, fast_mode=True)
+        if not result:
             return []
+        content = result['content']
 
         parse_start = time.time()
         page_data = self.parser.parse_page(content, auction_url)
         parse_time = (time.time() - parse_start) * 1000
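Every caller now unpacks the dict by key. The commit uses a plain dict; a TypedDict would give the same shape a checkable name so a type checker can catch callers that still treat the result as a string. An editorial sketch, not part of the commit:

from typing import Optional, TypedDict

class PageResult(TypedDict):
    content: str
    from_cache: bool

def extract_content(result: Optional[PageResult]) -> Optional[str]:
    # Mirrors the `if not result: return []` guards added in the hunk above.
    if not result:
        return None
    return result['content']

print(extract_content({'content': '<html></html>', 'from_cache': False}))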
@@ -199,10 +203,12 @@ class TroostwijkScraper:
         page_id = self.parser.extract_lot_id(url)
         print(f"\n[PAGE {page_id}]")
 
-        content = await self._get_page(page, url)
-        if not content:
+        result = await self._get_page(page, url)
+        if not result:
             return None
+        content = result['content']
+        from_cache = result['from_cache']
 
         page_data = self.parser.parse_page(content, url)
         if not page_data:
             return None
@@ -245,10 +251,12 @@ class TroostwijkScraper:
                 except:
                     pass
 
-        # Fetch bidding data from GraphQL API (skip if already complete in DB)
+        # Fetch all API data concurrently (or use cache if HTML was cached)
         lot_id = page_data.get('lot_id')
+        auction_id = page_data.get('auction_id')
 
-        # Check if lot already has complete API data in database
-        import sqlite3
-        conn = sqlite3.connect(self.cache.db_path)
-        cursor = conn.cursor()
+        if from_cache:
+            # Check if we have cached API data in database
+            import sqlite3
+            conn = sqlite3.connect(self.cache.db_path)
+            cursor = conn.cursor()
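The from_cache branch re-probes SQLite before trusting cached API data. The SELECT itself falls between the hunks, but based on the columns consumed a few lines later (followers_count, estimated_min_price, current_bid, bid_count) it plausibly looks like the sketch below, where a non-NULL followers_count marks a lot whose API data was already fetched. Table and column names are inferred from the surrounding hunks, not confirmed by the diff:

import sqlite3
from typing import Optional, Tuple

def cached_api_row(db_path: str, lot_id: str) -> Optional[Tuple]:
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.execute(
            """SELECT followers_count, estimated_min_price, current_bid, bid_count
               FROM lots WHERE lot_id = ?""",
            (lot_id,),
        )
        return cur.fetchone()  # None, or a row whose [0] is NULL if the API never ran
    finally:
        conn.close()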
@@ -259,20 +267,67 @@ class TroostwijkScraper:
-        existing = cursor.fetchone()
-        conn.close()
+            existing = cursor.fetchone()
+            conn.close()
 
-        # Skip API if we have complete data (followers_count indicates API was called)
-        skip_api = existing and existing[0] is not None
-
-        if skip_api:
-            print(f" Using cached API data")
-            bidding_data = None
-            # Load cached data for display
-            page_data['followers_count'] = existing[0]
-            page_data['estimated_min_price'] = existing[1]
-            page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
-            page_data['bid_count'] = existing[3] or 0
-        else:
-            print(f" Fetching bidding data from API...")
-            bidding_data = await fetch_lot_bidding_data(lot_id)
+            # Use cached API data if available and not null
+            if existing and existing[0] is not None:
+                print(f" Using cached API data")
+                page_data['followers_count'] = existing[0]
+                page_data['estimated_min_price'] = existing[1]
+                page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
+                page_data['bid_count'] = existing[3] or 0
+                bidding_data = None
+                bid_history_data = None
+            else:
+                print(f" Fetching lot data from API (concurrent)...")
+                # Make concurrent API calls
+                api_tasks = [fetch_lot_bidding_data(lot_id)]
+                if auction_id:
+                    api_tasks.append(fetch_auction_data(auction_id))
+                results = await asyncio.gather(*api_tasks, return_exceptions=True)
+                bidding_data = results[0] if results and not isinstance(results[0], Exception) else None
+                bid_history_data = None  # Will fetch after we have lot_uuid
+        else:
+            # Fresh page fetch - make concurrent API calls for all data
+            print(f" Fetching lot data from API (concurrent)...")
+            api_tasks = [fetch_lot_bidding_data(lot_id)]
+            task_map = {'bidding': 0}  # Track which index corresponds to which task
+
+            # Add auction data fetch if we need viewing/pickup times
+            if auction_id:
+                conn = sqlite3.connect(self.cache.db_path)
+                cursor = conn.cursor()
+                cursor.execute("""
+                    SELECT viewing_time, pickup_date FROM lots WHERE lot_id = ?
+                """, (lot_id,))
+                times = cursor.fetchone()
+                conn.close()
+
+                has_times = times and (times[0] or times[1])
+                if not has_times:
+                    task_map['auction'] = len(api_tasks)
+                    api_tasks.append(fetch_auction_data(auction_id))
+
+            # Add bid history fetch if we have lot_uuid and expect bids
+            if lot_uuid:
+                task_map['bid_history'] = len(api_tasks)
+                api_tasks.append(fetch_bid_history(lot_uuid))
+
+            # Execute all API calls concurrently
+            results = await asyncio.gather(*api_tasks, return_exceptions=True)
+            bidding_data = results[task_map['bidding']] if results and not isinstance(results[task_map['bidding']], Exception) else None
+
+            # Process auction data if it was fetched
+            if 'auction' in task_map and len(results) > task_map['auction']:
+                auction_data = results[task_map['auction']]
+                if not isinstance(auction_data, Exception) and auction_data:
+                    auction_times = format_auction_data(auction_data)
+                    page_data.update(auction_times)
+
+            # Process bid history if it was fetched
+            bid_history_data = None
+            if 'bid_history' in task_map and len(results) > task_map['bid_history']:
+                bid_history_data = results[task_map['bid_history']]
+                if isinstance(bid_history_data, Exception):
+                    bid_history_data = None
 
         if bidding_data:
             formatted_data = format_bid_data(bidding_data)
@@ -342,18 +397,15 @@ class TroostwijkScraper:
             if not lot_uuid:
                 lot_uuid = lot_details_lot.get('id')
 
-            # Fetch bid history for intelligence (skip if already in DB)
-            if lot_uuid and page_data.get('bid_count', 0) > 0:
-                # Check if bid history already exists
-                conn = sqlite3.connect(self.cache.db_path)
-                cursor = conn.cursor()
-                cursor.execute("""
-                    SELECT COUNT(*) FROM bid_history WHERE lot_id = ?
-                """, (lot_id,))
-                has_history = cursor.fetchone()[0] > 0
-                conn.close()
-
-                if not has_history:
-                    print(f" Fetching bid history...")
-                    bid_history = await fetch_bid_history(lot_uuid)
-                    if bid_history:
+            # Process bid history if we fetched it concurrently
+            if not from_cache and 'bid_history_data' in locals() and bid_history_data and page_data.get('bid_count', 0) > 0:
+                bid_data = parse_bid_history(bid_history_data, lot_id)
+                page_data.update(bid_data)
+                print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
+                self.cache.save_bid_history(lot_id, bid_data['bid_records'])
+            elif not from_cache and lot_uuid and page_data.get('bid_count', 0) > 0:
+                # Fallback: fetch bid history if we didn't get it in the concurrent batch
+                # (This happens when lot_uuid wasn't available before the first API call)
+                print(f" Fetching bid history...")
+                bid_history = await fetch_bid_history(lot_uuid)
+                if bid_history:
@@ -361,28 +413,17 @@ class TroostwijkScraper:
-                        page_data.update(bid_data)
-                        print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
-                        self.cache.save_bid_history(lot_id, bid_data['bid_records'])
-                else:
-                    print(f" Bid history cached")
-
-            # Fetch auction data for viewing/pickup times (skip if already in DB)
-            auction_id = page_data.get('auction_id')
-            if auction_id:
-                # Check if lot already has viewing/pickup times
-                conn = sqlite3.connect(self.cache.db_path)
-                cursor = conn.cursor()
-                cursor.execute("""
-                    SELECT viewing_time, pickup_date FROM lots WHERE lot_id = ?
-                """, (lot_id,))
-                times = cursor.fetchone()
-                conn.close()
-
-                has_times = times and (times[0] or times[1])
-                if not has_times:
-                    auction_data = await fetch_auction_data(auction_id)
-                    if auction_data:
-                        auction_times = format_auction_data(auction_data)
-                        page_data.update(auction_times)
+                    page_data.update(bid_data)
+                    print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
+                    self.cache.save_bid_history(lot_id, bid_data['bid_records'])
+            elif from_cache and page_data.get('bid_count', 0) > 0:
+                # Check if cached bid history exists
+                conn = sqlite3.connect(self.cache.db_path)
+                cursor = conn.cursor()
+                cursor.execute("""
+                    SELECT COUNT(*) FROM bid_history WHERE lot_id = ?
+                """, (lot_id,))
+                has_history = cursor.fetchone()[0] > 0
+                conn.close()
+                if has_history:
+                    print(f" Bid history cached")
         else:
             print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")