upgrade-speed-auctions
@@ -84,22 +84,40 @@ class TroostwijkScraper:
         self.last_request_time = time.time()
 
-    async def _get_page(self, page: Page, url: str, use_cache: bool = True) -> Optional[str]:
-        """Get page content with caching and strict rate limiting"""
+    async def _get_page(self, page: Page, url: str, use_cache: bool = True, fast_mode: bool = False) -> Optional[str]:
+        """Get page content with caching and strict rate limiting
+
+        Args:
+            fast_mode: If True, use 'domcontentloaded' instead of 'networkidle' for faster loading
+                       (useful for auction listing pages where we just need HTML structure)
+        """
         if use_cache:
+            cache_start = time.time()
            cached = self.cache.get(url)
            if cached:
-                print(f" CACHE HIT: {url}")
+                cache_time = (time.time() - cache_start) * 1000
+                print(f" CACHE HIT: {url} ({cache_time:.0f}ms)")
                return cached['content']
 
        await self._rate_limit()
 
        try:
+            fetch_start = time.time()
            print(f" FETCHING: {url}")
-            await page.goto(url, wait_until='networkidle', timeout=30000)
-            await asyncio.sleep(random.uniform(0.3, 0.7))
+
+            # Use faster loading strategy for auction pages (we only need HTML, not all assets)
+            wait_strategy = 'domcontentloaded' if fast_mode else 'networkidle'
+            await page.goto(url, wait_until=wait_strategy, timeout=30000)
+            goto_time = time.time() - fetch_start
+
+            # Shorter delay for fast mode
+            delay = random.uniform(0.1, 0.3) if fast_mode else random.uniform(0.3, 0.7)
+            await asyncio.sleep(delay)
 
            content = await page.content()
+            total_time = time.time() - fetch_start
            self.cache.set(url, content, 200)
+            print(f" [Timing: goto={goto_time:.2f}s, total={total_time:.2f}s, mode={wait_strategy}]")
            return content
 
        except Exception as e:
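Note on the wait strategies above: Playwright's 'networkidle' waits until there have been no network connections for at least 500 ms, while 'domcontentloaded' returns as soon as the HTML is parsed, without waiting for images, fonts, or trackers. A minimal standalone sketch of the same trade-off, using Playwright's async API (the URL is a placeholder; timings will vary per site):

    import asyncio
    import time

    from playwright.async_api import async_playwright

    async def timed_goto(url: str, fast_mode: bool) -> float:
        # Same strategy switch as _get_page above
        wait_strategy = 'domcontentloaded' if fast_mode else 'networkidle'
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()
            start = time.time()
            await page.goto(url, wait_until=wait_strategy, timeout=30000)
            elapsed = time.time() - start
            await browser.close()
        return elapsed

    async def main():
        slow = await timed_goto('https://example.com', fast_mode=False)
        fast = await timed_goto('https://example.com', fast_mode=True)
        print(f'networkidle: {slow:.2f}s, domcontentloaded: {fast:.2f}s')

    asyncio.run(main())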
@@ -139,7 +157,8 @@ class TroostwijkScraper:
        print(f"LISTING PAGE {page_num}: {url}")
        print(f"{'='*60}")
 
-        content = await self._get_page(page, url)
+        # Use fast mode - we only need HTML structure for link extraction
+        content = await self._get_page(page, url, fast_mode=True)
        if not content:
            return []
 
@@ -149,16 +168,27 @@ class TroostwijkScraper:
 
    async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]:
        """Crawl an auction page and extract lot URLs"""
-        content = await self._get_page(page, auction_url)
+        # Use fast mode for auction pages - we only need the HTML structure, not all assets
+        content = await self._get_page(page, auction_url, fast_mode=True)
        if not content:
            return []
 
+        parse_start = time.time()
        page_data = self.parser.parse_page(content, auction_url)
-        if page_data and page_data.get('type') == 'auction':
-            self.cache.save_auction(page_data)
-            print(f" → Auction: {page_data.get('title', '')[:50]}... ({page_data.get('lots_count', 0)} lots)")
+        parse_time = (time.time() - parse_start) * 1000
 
-        return self._extract_lot_urls_from_auction(content, auction_url)
+        if page_data and page_data.get('type') == 'auction':
+            save_start = time.time()
+            self.cache.save_auction(page_data)
+            save_time = (time.time() - save_start) * 1000
+            print(f" → Auction: {page_data.get('title', '')[:50]}... ({page_data.get('lots_count', 0)} lots)")
+            print(f" [Parse: {parse_time:.0f}ms, Save: {save_time:.0f}ms]")
+
+        extract_start = time.time()
+        lot_urls = self._extract_lot_urls_from_auction(content, auction_url)
+        extract_time = (time.time() - extract_start) * 1000
+        print(f" [Extract lots: {extract_time:.0f}ms]")
+        return lot_urls
 
    async def crawl_page(self, page: Page, url: str) -> Optional[Dict]:
        """Crawl a page (auction or lot)"""
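The parse/save/extract instrumentation above repeats the same start/stop/print pattern three times. A hypothetical context-manager helper (not part of this commit, just a sketch of how the pattern could be factored out):

    import time
    from contextlib import contextmanager

    @contextmanager
    def timed(label: str):
        # Prints elapsed wall-clock time in ms when the block exits
        start = time.time()
        yield
        elapsed_ms = (time.time() - start) * 1000
        print(f' [{label}: {elapsed_ms:.0f}ms]')

    # Usage, with a sleep standing in for parser.parse_page(...):
    with timed('Parse'):
        time.sleep(0.05)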
@@ -446,12 +476,25 @@ class TroostwijkScraper:
        print("PHASE 2: EXTRACTING LOT URLs FROM AUCTIONS")
        print("="*60)
 
+        phase2_start = time.time()
        for i, auction_url in enumerate(all_auction_urls):
-            print(f"\n[{i+1:>3}/{len(all_auction_urls)}] {self.parser.extract_lot_id(auction_url)}")
+            auction_start = time.time()
+            auction_id = self.parser.extract_lot_id(auction_url)
+            print(f"\n[{i+1:>3}/{len(all_auction_urls)}] {auction_id}")
            lot_urls = await self.crawl_auction_for_lots(page, auction_url)
+            auction_elapsed = time.time() - auction_start
            if lot_urls:
                all_lot_urls.extend(lot_urls)
-                print(f" → Found {len(lot_urls)} lots")
+                print(f" → Found {len(lot_urls)} lots (took {auction_elapsed:.2f}s)")
+            else:
+                print(f" → No lots found (took {auction_elapsed:.2f}s)")
+
+            # Progress estimation
+            avg_time = (time.time() - phase2_start) / (i + 1)
+            remaining = len(all_auction_urls) - (i + 1)
+            eta_seconds = avg_time * remaining
+            eta_minutes = eta_seconds / 60
+            print(f" → Progress: {len(all_lot_urls)} lots total | ETA: {eta_minutes:.1f} min ({avg_time:.2f}s/auction)")
 
        all_lot_urls = list(set(all_lot_urls))
        print(f"\n{'='*60}")
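The ETA line is a running average: total elapsed time in Phase 2 divided by auctions completed so far, multiplied by auctions remaining. A self-contained illustration of the arithmetic with made-up numbers (40 of 100 auctions done in 90 seconds):

    phase2_elapsed = 90.0          # seconds spent so far (assumed)
    done, total = 40, 100

    avg_time = phase2_elapsed / done         # 2.25 s/auction
    remaining = total - done                 # 60 auctions left
    eta_minutes = avg_time * remaining / 60  # 135 s -> 2.25 min
    print(f'ETA: {eta_minutes:.1f} min ({avg_time:.2f}s/auction)')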