From ce2fa60ee95a322ceba98ce2ec6b42f373bc971b Mon Sep 17 00:00:00 2001
From: Tour
Date: Sun, 7 Dec 2025 16:53:44 +0100
Subject: [PATCH] upgrade-speed-auctions

---
 src/scraper.py | 69 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 56 insertions(+), 13 deletions(-)

diff --git a/src/scraper.py b/src/scraper.py
index bb2e6f2..b353212 100644
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -84,22 +84,40 @@ class TroostwijkScraper:
         self.last_request_time = time.time()
 
-    async def _get_page(self, page: Page, url: str, use_cache: bool = True) -> Optional[str]:
-        """Get page content with caching and strict rate limiting"""
+    async def _get_page(self, page: Page, url: str, use_cache: bool = True, fast_mode: bool = False) -> Optional[str]:
+        """Get page content with caching and strict rate limiting
+
+        Args:
+            fast_mode: If True, use 'domcontentloaded' instead of 'networkidle' for faster loading
+                       (useful for auction listing pages where we just need HTML structure)
+        """
         if use_cache:
+            cache_start = time.time()
             cached = self.cache.get(url)
             if cached:
-                print(f"  CACHE HIT: {url}")
+                cache_time = (time.time() - cache_start) * 1000
+                print(f"  CACHE HIT: {url} ({cache_time:.0f}ms)")
                 return cached['content']
 
         await self._rate_limit()
 
         try:
+            fetch_start = time.time()
             print(f"  FETCHING: {url}")
-            await page.goto(url, wait_until='networkidle', timeout=30000)
-            await asyncio.sleep(random.uniform(0.3, 0.7))
+
+            # Use faster loading strategy for auction pages (we only need HTML, not all assets)
+            wait_strategy = 'domcontentloaded' if fast_mode else 'networkidle'
+            await page.goto(url, wait_until=wait_strategy, timeout=30000)
+            goto_time = time.time() - fetch_start
+
+            # Shorter delay for fast mode
+            delay = random.uniform(0.1, 0.3) if fast_mode else random.uniform(0.3, 0.7)
+            await asyncio.sleep(delay)
+
             content = await page.content()
+            total_time = time.time() - fetch_start
             self.cache.set(url, content, 200)
+            print(f"  [Timing: goto={goto_time:.2f}s, total={total_time:.2f}s, mode={wait_strategy}]")
             return content
 
         except Exception as e:
@@ -139,7 +157,8 @@ class TroostwijkScraper:
             print(f"LISTING PAGE {page_num}: {url}")
             print(f"{'='*60}")
 
-        content = await self._get_page(page, url)
+        # Use fast mode - we only need HTML structure for link extraction
+        content = await self._get_page(page, url, fast_mode=True)
         if not content:
             return []
 
@@ -149,16 +168,27 @@ class TroostwijkScraper:
 
     async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]:
         """Crawl an auction page and extract lot URLs"""
-        content = await self._get_page(page, auction_url)
+        # Use fast mode for auction pages - we only need the HTML structure, not all assets
+        content = await self._get_page(page, auction_url, fast_mode=True)
         if not content:
             return []
 
+        parse_start = time.time()
         page_data = self.parser.parse_page(content, auction_url)
-        if page_data and page_data.get('type') == 'auction':
-            self.cache.save_auction(page_data)
-            print(f"  → Auction: {page_data.get('title', '')[:50]}... ({page_data.get('lots_count', 0)} lots)")
+        parse_time = (time.time() - parse_start) * 1000
 
-        return self._extract_lot_urls_from_auction(content, auction_url)
+        if page_data and page_data.get('type') == 'auction':
+            save_start = time.time()
+            self.cache.save_auction(page_data)
+            save_time = (time.time() - save_start) * 1000
+            print(f"  → Auction: {page_data.get('title', '')[:50]}... ({page_data.get('lots_count', 0)} lots)")
+            print(f"  [Parse: {parse_time:.0f}ms, Save: {save_time:.0f}ms]")
+
+        extract_start = time.time()
+        lot_urls = self._extract_lot_urls_from_auction(content, auction_url)
+        extract_time = (time.time() - extract_start) * 1000
+        print(f"  [Extract lots: {extract_time:.0f}ms]")
+        return lot_urls
 
     async def crawl_page(self, page: Page, url: str) -> Optional[Dict]:
         """Crawl a page (auction or lot)"""
@@ -446,12 +476,25 @@ class TroostwijkScraper:
         print("PHASE 2: EXTRACTING LOT URLs FROM AUCTIONS")
         print("="*60)
 
+        phase2_start = time.time()
         for i, auction_url in enumerate(all_auction_urls):
-            print(f"\n[{i+1:>3}/{len(all_auction_urls)}] {self.parser.extract_lot_id(auction_url)}")
+            auction_start = time.time()
+            auction_id = self.parser.extract_lot_id(auction_url)
+            print(f"\n[{i+1:>3}/{len(all_auction_urls)}] {auction_id}")
             lot_urls = await self.crawl_auction_for_lots(page, auction_url)
+            auction_elapsed = time.time() - auction_start
             if lot_urls:
                 all_lot_urls.extend(lot_urls)
-                print(f"  → Found {len(lot_urls)} lots")
+                print(f"  → Found {len(lot_urls)} lots (took {auction_elapsed:.2f}s)")
+            else:
+                print(f"  → No lots found (took {auction_elapsed:.2f}s)")
+
+            # Progress estimation
+            avg_time = (time.time() - phase2_start) / (i + 1)
+            remaining = len(all_auction_urls) - (i + 1)
+            eta_seconds = avg_time * remaining
+            eta_minutes = eta_seconds / 60
+            print(f"  → Progress: {len(all_lot_urls)} lots total | ETA: {eta_minutes:.1f} min ({avg_time:.2f}s/auction)")
 
         all_lot_urls = list(set(all_lot_urls))
         print(f"\n{'='*60}")
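
Reviewer note (not part of the patch): the core speedup is the wait_until strategy
passed to Playwright's page.goto(). With 'domcontentloaded' the call returns as soon
as the HTML is parsed, while 'networkidle' also waits for images, scripts, and other
network traffic to settle. Below is a minimal standalone sketch of that technique,
assuming Playwright is installed; the URL and the helper name timed_goto are
placeholders, not part of the scraper:

    import asyncio
    import time

    from playwright.async_api import async_playwright

    async def timed_goto(url: str, wait_strategy: str) -> float:
        """Load a page with the given wait_until strategy; return elapsed seconds."""
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()
            start = time.time()
            # 'domcontentloaded' fires once the HTML is parsed;
            # 'networkidle' waits until network activity goes quiet.
            await page.goto(url, wait_until=wait_strategy, timeout=30000)
            elapsed = time.time() - start
            await browser.close()
            return elapsed

    async def main() -> None:
        for strategy in ('domcontentloaded', 'networkidle'):
            elapsed = await timed_goto('https://example.com', strategy)
            print(f"{strategy}: {elapsed:.2f}s")

    asyncio.run(main())

On asset-heavy pages 'domcontentloaded' can return much sooner, which is why the
patch enables fast_mode only for listing and auction pages, where the link
structure in the HTML is all that is needed.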