From ce2fa60ee95a322ceba98ce2ec6b42f373bc971b Mon Sep 17 00:00:00 2001
From: Tour
Date: Sun, 7 Dec 2025 16:53:44 +0100
Subject: [PATCH] upgrade-speed-auctions

---
 src/scraper.py | 69 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 56 insertions(+), 13 deletions(-)

diff --git a/src/scraper.py b/src/scraper.py
index bb2e6f2..b353212 100644
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -84,22 +84,40 @@ class TroostwijkScraper:
         self.last_request_time = time.time()
 
-    async def _get_page(self, page: Page, url: str, use_cache: bool = True) -> Optional[str]:
-        """Get page content with caching and strict rate limiting"""
+    async def _get_page(self, page: Page, url: str, use_cache: bool = True, fast_mode: bool = False) -> Optional[str]:
+        """Get page content with caching and strict rate limiting
+
+        Args:
+            fast_mode: If True, use 'domcontentloaded' instead of 'networkidle' for faster loading
+                       (useful for auction listing pages where we just need HTML structure)
+        """
         if use_cache:
+            cache_start = time.time()
             cached = self.cache.get(url)
             if cached:
-                print(f"  CACHE HIT: {url}")
+                cache_time = (time.time() - cache_start) * 1000
+                print(f"  CACHE HIT: {url} ({cache_time:.0f}ms)")
                 return cached['content']
 
         await self._rate_limit()
 
         try:
+            fetch_start = time.time()
             print(f"  FETCHING: {url}")
-            await page.goto(url, wait_until='networkidle', timeout=30000)
-            await asyncio.sleep(random.uniform(0.3, 0.7))
+
+            # Use faster loading strategy for auction pages (we only need HTML, not all assets)
+            wait_strategy = 'domcontentloaded' if fast_mode else 'networkidle'
+            await page.goto(url, wait_until=wait_strategy, timeout=30000)
+            goto_time = time.time() - fetch_start
+
+            # Shorter delay for fast mode
+            delay = random.uniform(0.1, 0.3) if fast_mode else random.uniform(0.3, 0.7)
+            await asyncio.sleep(delay)
+
             content = await page.content()
+            total_time = time.time() - fetch_start
             self.cache.set(url, content, 200)
+            print(f"  [Timing: goto={goto_time:.2f}s, total={total_time:.2f}s, mode={wait_strategy}]")
             return content
 
         except Exception as e:
@@ -139,7 +157,8 @@ class TroostwijkScraper:
             print(f"LISTING PAGE {page_num}: {url}")
             print(f"{'='*60}")
 
-        content = await self._get_page(page, url)
+        # Use fast mode - we only need HTML structure for link extraction
+        content = await self._get_page(page, url, fast_mode=True)
         if not content:
             return []
 
@@ -149,16 +168,27 @@ class TroostwijkScraper:
 
     async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]:
         """Crawl an auction page and extract lot URLs"""
-        content = await self._get_page(page, auction_url)
+        # Use fast mode for auction pages - we only need the HTML structure, not all assets
+        content = await self._get_page(page, auction_url, fast_mode=True)
         if not content:
             return []
 
+        parse_start = time.time()
         page_data = self.parser.parse_page(content, auction_url)
-        if page_data and page_data.get('type') == 'auction':
-            self.cache.save_auction(page_data)
-            print(f"  → Auction: {page_data.get('title', '')[:50]}... ({page_data.get('lots_count', 0)} lots)")
+        parse_time = (time.time() - parse_start) * 1000
 
-        return self._extract_lot_urls_from_auction(content, auction_url)
+        if page_data and page_data.get('type') == 'auction':
+            save_start = time.time()
+            self.cache.save_auction(page_data)
+            save_time = (time.time() - save_start) * 1000
+            print(f"  → Auction: {page_data.get('title', '')[:50]}... ({page_data.get('lots_count', 0)} lots)")
+            print(f"  [Parse: {parse_time:.0f}ms, Save: {save_time:.0f}ms]")
+
+        extract_start = time.time()
+        lot_urls = self._extract_lot_urls_from_auction(content, auction_url)
+        extract_time = (time.time() - extract_start) * 1000
+        print(f"  [Extract lots: {extract_time:.0f}ms]")
+        return lot_urls
 
     async def crawl_page(self, page: Page, url: str) -> Optional[Dict]:
         """Crawl a page (auction or lot)"""
@@ -446,12 +476,25 @@ class TroostwijkScraper:
         print("PHASE 2: EXTRACTING LOT URLs FROM AUCTIONS")
         print("="*60)
 
+        phase2_start = time.time()
         for i, auction_url in enumerate(all_auction_urls):
-            print(f"\n[{i+1:>3}/{len(all_auction_urls)}] {self.parser.extract_lot_id(auction_url)}")
+            auction_start = time.time()
+            auction_id = self.parser.extract_lot_id(auction_url)
+            print(f"\n[{i+1:>3}/{len(all_auction_urls)}] {auction_id}")
             lot_urls = await self.crawl_auction_for_lots(page, auction_url)
+            auction_elapsed = time.time() - auction_start
             if lot_urls:
                 all_lot_urls.extend(lot_urls)
-                print(f"  → Found {len(lot_urls)} lots")
+                print(f"  → Found {len(lot_urls)} lots (took {auction_elapsed:.2f}s)")
+            else:
+                print(f"  → No lots found (took {auction_elapsed:.2f}s)")
+
+            # Progress estimation
+            avg_time = (time.time() - phase2_start) / (i + 1)
+            remaining = len(all_auction_urls) - (i + 1)
+            eta_seconds = avg_time * remaining
+            eta_minutes = eta_seconds / 60
+            print(f"  → Progress: {len(all_lot_urls)} lots total | ETA: {eta_minutes:.1f} min ({avg_time:.2f}s/auction)")
 
         all_lot_urls = list(set(all_lot_urls))
         print(f"\n{'='*60}")
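
Reviewer note (not part of the patch): the core speedup is the wait_until strategy
passed to Playwright's page.goto(). With 'domcontentloaded' the call returns as soon
as the HTML is parsed, while 'networkidle' also waits for images, scripts, and other
network traffic to settle. Below is a minimal standalone sketch of that technique,
assuming Playwright is installed; the URL and the helper name timed_goto are
placeholders, not part of the scraper:

    import asyncio
    import time

    from playwright.async_api import async_playwright

    async def timed_goto(url: str, wait_strategy: str) -> float:
        """Load a page with the given wait_until strategy; return elapsed seconds."""
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()
            start = time.time()
            # 'domcontentloaded' fires once the HTML is parsed;
            # 'networkidle' waits until network activity goes quiet.
            await page.goto(url, wait_until=wait_strategy, timeout=30000)
            elapsed = time.time() - start
            await browser.close()
            return elapsed

    async def main() -> None:
        for strategy in ('domcontentloaded', 'networkidle'):
            elapsed = await timed_goto('https://example.com', strategy)
            print(f"{strategy}: {elapsed:.2f}s")

    asyncio.run(main())

On asset-heavy pages 'domcontentloaded' can return much sooner, which is why the
patch enables fast_mode only for listing and auction pages, where the link
structure in the HTML is all that is needed.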