remove-field-scraped_at_timestamp
src/scraper.py
@@ -84,12 +84,15 @@ class TroostwijkScraper:
             self.last_request_time = time.time()
 
-    async def _get_page(self, page: Page, url: str, use_cache: bool = True, fast_mode: bool = False) -> Optional[str]:
+    async def _get_page(self, page: Page, url: str, use_cache: bool = True, fast_mode: bool = False) -> Optional[Dict]:
         """Get page content with caching and strict rate limiting
 
         Args:
             fast_mode: If True, use 'domcontentloaded' instead of 'networkidle' for faster loading
                        (useful for auction listing pages where we just need HTML structure)
 
+        Returns:
+            Dict with 'content' and 'from_cache' keys
+
         """
         if use_cache:
             cache_start = time.time()
@@ -97,7 +100,7 @@ class TroostwijkScraper:
             if cached:
                 cache_time = (time.time() - cache_start) * 1000
                 print(f" CACHE HIT: {url} ({cache_time:.0f}ms)")
-                return cached['content']
+                return {'content': cached['content'], 'from_cache': True}
 
         await self._rate_limit()
 
@@ -118,7 +121,7 @@ class TroostwijkScraper:
             total_time = time.time() - fetch_start
             self.cache.set(url, content, 200)
             print(f" [Timing: goto={goto_time:.2f}s, total={total_time:.2f}s, mode={wait_strategy}]")
-            return content
+            return {'content': content, 'from_cache': False}
 
         except Exception as e:
             print(f" ERROR: {e}")
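Taken together, the hunks above change the `_get_page` contract from returning a bare HTML string to returning a small result dict. A minimal sketch of how a caller consumes the new shape (the `scraper`, `page`, and `url` objects here are placeholders, not code from this commit):

# Sketch only - assumes the new return shape {'content': str, 'from_cache': bool}
# and a hypothetical `scraper` carrying the patched _get_page().
async def load_lot_html(scraper, page, url):
    result = await scraper._get_page(page, url, fast_mode=True)
    if not result:
        return None
    if result['from_cache']:
        print("served from HTML cache")   # new flag lets later steps reuse cached API data
    return result['content']              # same HTML string callers parsed before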
@@ -158,21 +161,22 @@ class TroostwijkScraper:
         print(f"{'='*60}")
 
         # Use fast mode - we only need HTML structure for link extraction
-        content = await self._get_page(page, url, fast_mode=True)
-        if not content:
+        result = await self._get_page(page, url, fast_mode=True)
+        if not result:
             return []
 
-        auction_urls = self._extract_auction_urls_from_listing(content)
+        auction_urls = self._extract_auction_urls_from_listing(result['content'])
         print(f"→ Found {len(auction_urls)} auction URLs")
         return auction_urls
 
     async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]:
         """Crawl an auction page and extract lot URLs"""
         # Use fast mode for auction pages - we only need the HTML structure, not all assets
-        content = await self._get_page(page, auction_url, fast_mode=True)
-        if not content:
+        result = await self._get_page(page, auction_url, fast_mode=True)
+        if not result:
             return []
 
+        content = result['content']
         parse_start = time.time()
         page_data = self.parser.parse_page(content, auction_url)
         parse_time = (time.time() - parse_start) * 1000
@@ -199,10 +203,12 @@ class TroostwijkScraper:
         page_id = self.parser.extract_lot_id(url)
         print(f"\n[PAGE {page_id}]")
 
-        content = await self._get_page(page, url)
-        if not content:
+        result = await self._get_page(page, url)
+        if not result:
             return None
 
+        content = result['content']
+        from_cache = result['from_cache']
         page_data = self.parser.parse_page(content, url)
         if not page_data:
             return None
@@ -245,34 +251,83 @@ class TroostwijkScraper:
         except:
             pass
 
-        # Fetch bidding data from GraphQL API (skip if already complete in DB)
+        # Fetch all API data concurrently (or use cache if HTML was cached)
         lot_id = page_data.get('lot_id')
+        auction_id = page_data.get('auction_id')
 
-        # Check if lot already has complete API data in database
-        import sqlite3
-        conn = sqlite3.connect(self.cache.db_path)
-        cursor = conn.cursor()
-        cursor.execute("""
-            SELECT followers_count, estimated_min_price, current_bid, bid_count
-            FROM lots WHERE lot_id = ?
-        """, (lot_id,))
-        existing = cursor.fetchone()
-        conn.close()
+        if from_cache:
+            # Check if we have cached API data in database
+            import sqlite3
+            conn = sqlite3.connect(self.cache.db_path)
+            cursor = conn.cursor()
+            cursor.execute("""
+                SELECT followers_count, estimated_min_price, current_bid, bid_count
+                FROM lots WHERE lot_id = ?
+            """, (lot_id,))
+            existing = cursor.fetchone()
+            conn.close()
 
-        # Skip API if we have complete data (followers_count indicates API was called)
-        skip_api = existing and existing[0] is not None
-
-        if skip_api:
-            print(f" Using cached API data")
-            bidding_data = None
-            # Load cached data for display
-            page_data['followers_count'] = existing[0]
-            page_data['estimated_min_price'] = existing[1]
-            page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
-            page_data['bid_count'] = existing[3] or 0
+            # Use cached API data if available and not null
+            if existing and existing[0] is not None:
+                print(f" Using cached API data")
+                page_data['followers_count'] = existing[0]
+                page_data['estimated_min_price'] = existing[1]
+                page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
+                page_data['bid_count'] = existing[3] or 0
+                bidding_data = None
+                bid_history_data = None
+            else:
+                print(f" Fetching lot data from API (concurrent)...")
+                # Make concurrent API calls
+                api_tasks = [fetch_lot_bidding_data(lot_id)]
+                if auction_id:
+                    api_tasks.append(fetch_auction_data(auction_id))
+                results = await asyncio.gather(*api_tasks, return_exceptions=True)
+                bidding_data = results[0] if results and not isinstance(results[0], Exception) else None
+                bid_history_data = None  # Will fetch after we have lot_uuid
         else:
-            print(f" Fetching bidding data from API...")
-            bidding_data = await fetch_lot_bidding_data(lot_id)
+            # Fresh page fetch - make concurrent API calls for all data
+            print(f" Fetching lot data from API (concurrent)...")
+            api_tasks = [fetch_lot_bidding_data(lot_id)]
+            task_map = {'bidding': 0}  # Track which index corresponds to which task
+
+            # Add auction data fetch if we need viewing/pickup times
+            if auction_id:
+                conn = sqlite3.connect(self.cache.db_path)
+                cursor = conn.cursor()
+                cursor.execute("""
+                    SELECT viewing_time, pickup_date FROM lots WHERE lot_id = ?
+                """, (lot_id,))
+                times = cursor.fetchone()
+                conn.close()
+                has_times = times and (times[0] or times[1])
+
+                if not has_times:
+                    task_map['auction'] = len(api_tasks)
+                    api_tasks.append(fetch_auction_data(auction_id))
+
+            # Add bid history fetch if we have lot_uuid and expect bids
+            if lot_uuid:
+                task_map['bid_history'] = len(api_tasks)
+                api_tasks.append(fetch_bid_history(lot_uuid))
+
+            # Execute all API calls concurrently
+            results = await asyncio.gather(*api_tasks, return_exceptions=True)
+            bidding_data = results[task_map['bidding']] if results and not isinstance(results[task_map['bidding']], Exception) else None
+
+            # Process auction data if it was fetched
+            if 'auction' in task_map and len(results) > task_map['auction']:
+                auction_data = results[task_map['auction']]
+                if not isinstance(auction_data, Exception) and auction_data:
+                    auction_times = format_auction_data(auction_data)
+                    page_data.update(auction_times)
+
+            # Process bid history if it was fetched
+            bid_history_data = None
+            if 'bid_history' in task_map and len(results) > task_map['bid_history']:
+                bid_history_data = results[task_map['bid_history']]
+                if isinstance(bid_history_data, Exception):
+                    bid_history_data = None
 
         if bidding_data:
             formatted_data = format_bid_data(bidding_data)
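The fresh-fetch branch above builds a variable-length list of coroutines, records each one's index in `task_map`, and awaits them with a single `asyncio.gather(..., return_exceptions=True)` call. A stripped-down, runnable sketch of that pattern with placeholder coroutines (not the project's real fetchers):

import asyncio

async def fetch_bidding():      # placeholder for fetch_lot_bidding_data(lot_id)
    return {"current_bid": 120}

async def fetch_auction():      # placeholder for fetch_auction_data(auction_id)
    return {"viewing_time": "Mon 10:00"}

async def main():
    tasks = [fetch_bidding()]
    task_map = {'bidding': 0}            # remember which index holds which result
    task_map['auction'] = len(tasks)     # only appended when the data is actually needed
    tasks.append(fetch_auction())

    results = await asyncio.gather(*tasks, return_exceptions=True)

    bidding = results[task_map['bidding']]
    if isinstance(bidding, Exception):   # return_exceptions=True turns failures into values
        bidding = None
    auction = results[task_map['auction']] if 'auction' in task_map else None
    print(bidding, auction)

asyncio.run(main())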
@@ -342,9 +397,24 @@ class TroostwijkScraper:
             if not lot_uuid:
                 lot_uuid = lot_details_lot.get('id')
 
-            # Fetch bid history for intelligence (skip if already in DB)
-            if lot_uuid and page_data.get('bid_count', 0) > 0:
-                # Check if bid history already exists
+            # Process bid history if we fetched it concurrently
+            if not from_cache and 'bid_history_data' in locals() and bid_history_data and page_data.get('bid_count', 0) > 0:
+                bid_data = parse_bid_history(bid_history_data, lot_id)
+                page_data.update(bid_data)
+                print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
+                self.cache.save_bid_history(lot_id, bid_data['bid_records'])
+            elif not from_cache and lot_uuid and page_data.get('bid_count', 0) > 0:
+                # Fallback: fetch bid history if we didn't get it in the concurrent batch
+                # (This happens when lot_uuid wasn't available before the first API call)
+                print(f" Fetching bid history...")
+                bid_history = await fetch_bid_history(lot_uuid)
+                if bid_history:
+                    bid_data = parse_bid_history(bid_history, lot_id)
+                    page_data.update(bid_data)
+                    print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
+                    self.cache.save_bid_history(lot_id, bid_data['bid_records'])
+            elif from_cache and page_data.get('bid_count', 0) > 0:
+                # Check if cached bid history exists
                 conn = sqlite3.connect(self.cache.db_path)
                 cursor = conn.cursor()
                 cursor.execute("""
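`parse_bid_history` condenses the raw bid records into metrics such as `bid_velocity`, printed above in bids/hour. Its implementation is not part of this diff; a hypothetical sketch of how such a velocity figure can be derived from timestamped bids:

from datetime import datetime

def bid_velocity(bid_times: list[str]) -> float:
    # Hypothetical helper, not the project's parse_bid_history:
    # bids per hour over the span between the first and last bid.
    if len(bid_times) < 2:
        return float(len(bid_times))
    stamps = sorted(datetime.fromisoformat(t) for t in bid_times)
    hours = (stamps[-1] - stamps[0]).total_seconds() / 3600
    return len(stamps) / hours if hours > 0 else float(len(stamps))

times = ['2024-05-01T10:00', '2024-05-01T12:00', '2024-05-01T13:00']
print(f" >> Bid velocity: {bid_velocity(times):.1f} bids/hour")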
@@ -352,37 +422,8 @@ class TroostwijkScraper:
                 """, (lot_id,))
                 has_history = cursor.fetchone()[0] > 0
                 conn.close()
-                if not has_history:
-                    print(f" Fetching bid history...")
-                    bid_history = await fetch_bid_history(lot_uuid)
-                    if bid_history:
-                        bid_data = parse_bid_history(bid_history, lot_id)
-                        page_data.update(bid_data)
-                        print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
-                        self.cache.save_bid_history(lot_id, bid_data['bid_records'])
-                else:
+                if has_history:
                     print(f" Bid history cached")
 
-            # Fetch auction data for viewing/pickup times (skip if already in DB)
-            auction_id = page_data.get('auction_id')
-            if auction_id:
-                # Check if lot already has viewing/pickup times
-                conn = sqlite3.connect(self.cache.db_path)
-                cursor = conn.cursor()
-                cursor.execute("""
-                    SELECT viewing_time, pickup_date FROM lots WHERE lot_id = ?
-                """, (lot_id,))
-                times = cursor.fetchone()
-                conn.close()
-
-                has_times = times and (times[0] or times[1])
-
-                if not has_times:
-                    auction_data = await fetch_auction_data(auction_id)
-                    if auction_data:
-                        auction_times = format_auction_data(auction_data)
-                        page_data.update(auction_times)
         else:
             print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")
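Both from_cache branches decide whether to hit the API by probing previously stored rows in the lots table. A minimal standalone sketch of that check, assuming the schema implied by the queries above (with followers_count as the sentinel for "API data already saved"):

import sqlite3

def has_cached_api_data(db_path: str, lot_id: str) -> bool:
    # Returns True when a lot row already carries API-derived fields,
    # mirroring the followers_count-is-not-NULL check in the diff.
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT followers_count FROM lots WHERE lot_id = ?", (lot_id,)
        ).fetchone()
    finally:
        conn.close()
    return row is not None and row[0] is not None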