performance
 src/scraper.py | 118
@@ -214,10 +214,34 @@ class TroostwijkScraper:
             except:
                 pass
 
-            # Fetch bidding data from GraphQL API
+            # Fetch bidding data from GraphQL API (skip if already complete in DB)
             lot_id = page_data.get('lot_id')
-            print(f" Fetching bidding data from API...")
-            bidding_data = await fetch_lot_bidding_data(lot_id)
+
+            # Check if lot already has complete API data in database
+            import sqlite3
+            conn = sqlite3.connect(self.cache.db_path)
+            cursor = conn.cursor()
+            cursor.execute("""
+                SELECT followers_count, estimated_min_price, current_bid, bid_count
+                FROM lots WHERE lot_id = ?
+            """, (lot_id,))
+            existing = cursor.fetchone()
+            conn.close()
+
+            # Skip API if we have complete data (followers_count indicates API was called)
+            skip_api = existing and existing[0] is not None
+
+            if skip_api:
+                print(f" Using cached API data")
+                bidding_data = None
+                # Load cached data for display
+                page_data['followers_count'] = existing[0]
+                page_data['estimated_min_price'] = existing[1]
+                page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
+                page_data['bid_count'] = existing[3] or 0
+            else:
+                print(f" Fetching bidding data from API...")
+                bidding_data = await fetch_lot_bidding_data(lot_id)
 
             if bidding_data:
                 formatted_data = format_bid_data(bidding_data)
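Note: the skip check above treats a non-NULL followers_count as the marker that the API was already called, and it opens a throwaway SQLite connection inline. The same connect/query/close round trip recurs in the two hunks below; a minimal sketch of factoring it into one helper (the name query_one is hypothetical, not part of this commit):

    import sqlite3

    def query_one(db_path, sql, params=()):
        # Open a short-lived connection, run one query, return the first row.
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            cursor.execute(sql, params)
            return cursor.fetchone()
        finally:
            conn.close()

With such a helper, the lookup above reduces to one call: query_one(self.cache.db_path, "SELECT followers_count, estimated_min_price, current_bid, bid_count FROM lots WHERE lot_id = ?", (lot_id,)).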
@@ -287,25 +311,47 @@ class TroostwijkScraper:
             if not lot_uuid:
                 lot_uuid = lot_details_lot.get('id')
 
-            # Fetch bid history for intelligence
+            # Fetch bid history for intelligence (skip if already in DB)
             if lot_uuid and page_data.get('bid_count', 0) > 0:
-                print(f" Fetching bid history...")
-                bid_history = await fetch_bid_history(lot_uuid)
-                if bid_history:
-                    bid_data = parse_bid_history(bid_history, lot_id)
-                    page_data.update(bid_data)
-                    print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
-
-                    # Save bid history to database
-                    self.cache.save_bid_history(lot_id, bid_data['bid_records'])
+                # Check if bid history already exists
+                conn = sqlite3.connect(self.cache.db_path)
+                cursor = conn.cursor()
+                cursor.execute("""
+                    SELECT COUNT(*) FROM bid_history WHERE lot_id = ?
+                """, (lot_id,))
+                has_history = cursor.fetchone()[0] > 0
+                conn.close()
+
+                if not has_history:
+                    print(f" Fetching bid history...")
+                    bid_history = await fetch_bid_history(lot_uuid)
+                    if bid_history:
+                        bid_data = parse_bid_history(bid_history, lot_id)
+                        page_data.update(bid_data)
+                        print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
+                        self.cache.save_bid_history(lot_id, bid_data['bid_records'])
+                else:
+                    print(f" Bid history cached")
 
-            # Fetch auction data for viewing/pickup times if we have auction_id
+            # Fetch auction data for viewing/pickup times (skip if already in DB)
             auction_id = page_data.get('auction_id')
             if auction_id:
-                auction_data = await fetch_auction_data(auction_id)
-                if auction_data:
-                    auction_times = format_auction_data(auction_data)
-                    page_data.update(auction_times)
+                # Check if lot already has viewing/pickup times
+                conn = sqlite3.connect(self.cache.db_path)
+                cursor = conn.cursor()
+                cursor.execute("""
+                    SELECT viewing_time, pickup_date FROM lots WHERE lot_id = ?
+                """, (lot_id,))
+                times = cursor.fetchone()
+                conn.close()
+
+                has_times = times and (times[0] or times[1])
+
+                if not has_times:
+                    auction_data = await fetch_auction_data(auction_id)
+                    if auction_data:
+                        auction_times = format_auction_data(auction_data)
+                        page_data.update(auction_times)
             else:
                 print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")
 
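Note: the existence check above uses SELECT COUNT(*), which counts every matching row just to answer a yes/no question. A hedged alternative sketch using the same table and column names as the diff (whether bid_history.lot_id carries an index is an assumption; without one, both forms scan the table):

    import sqlite3

    def has_bid_history(db_path, lot_id):
        # Hypothetical helper, not in the commit: existence test that stops early.
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            # LIMIT 1 lets SQLite stop at the first matching row instead of counting all.
            cursor.execute("SELECT 1 FROM bid_history WHERE lot_id = ? LIMIT 1", (lot_id,))
            return cursor.fetchone() is not None
        finally:
            conn.close()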
@@ -318,17 +364,35 @@ class TroostwijkScraper:
             print(f" Images: {len(images)}")
 
             if self.download_images:
-                # Download all images concurrently for this lot
-                import aiohttp
-                async with aiohttp.ClientSession() as session:
-                    download_tasks = [
-                        self._download_image(session, img_url, page_data['lot_id'], i)
-                        for i, img_url in enumerate(images)
-                    ]
-                    results = await asyncio.gather(*download_tasks, return_exceptions=True)
-
-                downloaded_count = sum(1 for r in results if r and not isinstance(r, Exception))
-                print(f" Downloaded: {downloaded_count}/{len(images)} images")
+                # Check which images are already downloaded
+                import sqlite3
+                conn = sqlite3.connect(self.cache.db_path)
+                cursor = conn.cursor()
+                cursor.execute("""
+                    SELECT url FROM images
+                    WHERE lot_id = ? AND downloaded = 1
+                """, (page_data['lot_id'],))
+                already_downloaded = {row[0] for row in cursor.fetchall()}
+                conn.close()
+
+                # Only download missing images
+                images_to_download = [
+                    (i, img_url) for i, img_url in enumerate(images)
+                    if img_url not in already_downloaded
+                ]
+
+                if images_to_download:
+                    import aiohttp
+                    async with aiohttp.ClientSession() as session:
+                        download_tasks = [
+                            self._download_image(session, img_url, page_data['lot_id'], i)
+                            for i, img_url in images_to_download
+                        ]
+                        results = await asyncio.gather(*download_tasks, return_exceptions=True)
+                    downloaded_count = sum(1 for r in results if r and not isinstance(r, Exception))
+                    print(f" Downloaded: {downloaded_count}/{len(images_to_download)} new images")
+                else:
+                    print(f" All {len(images)} images already cached")
 
             return page_data
 
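Note: the rebuilt list above keeps each URL paired with its original enumerate(images) index, so _download_image receives the same per-lot image number whether or not earlier images were skipped, and filenames stay stable across re-runs. A self-contained illustration with stand-in URLs (the example.com names are hypothetical):

    # Stand-in data for the filter added in the hunk above.
    images = ["https://example.com/a.jpg", "https://example.com/b.jpg"]
    already_downloaded = {"https://example.com/a.jpg"}

    images_to_download = [
        (i, img_url) for i, img_url in enumerate(images)
        if img_url not in already_downloaded
    ]
    # b.jpg keeps index 1 even though a.jpg was skipped.
    assert images_to_download == [(1, "https://example.com/b.jpg")]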