upgrade-speed-auctions

This commit is contained in:
Tour
2025-12-07 16:53:44 +01:00
parent b1905164bd
commit ce2fa60ee9

View File

@@ -84,22 +84,40 @@ class TroostwijkScraper:
self.last_request_time = time.time()
async def _get_page(self, page: Page, url: str, use_cache: bool = True, fast_mode: bool = False) -> Optional[str]:
    """Get page content with caching and strict rate limiting.

    Args:
        page: Playwright page used for navigation.
        url: Absolute URL to fetch.
        use_cache: If True, return a cached copy when one exists.
        fast_mode: If True, use 'domcontentloaded' instead of 'networkidle'
            for faster loading (useful for auction listing pages where we
            just need HTML structure).

    Returns:
        The page HTML, or None if the fetch failed.
    """
    if use_cache:
        cache_start = time.time()
        cached = self.cache.get(url)
        if cached:
            cache_time = (time.time() - cache_start) * 1000
            print(f" CACHE HIT: {url} ({cache_time:.0f}ms)")
            return cached['content']
    # Enforce the scraper-wide delay between live requests.
    await self._rate_limit()
    try:
        fetch_start = time.time()
        print(f" FETCHING: {url}")
        # Use faster loading strategy for auction pages (we only need HTML, not all assets)
        wait_strategy = 'domcontentloaded' if fast_mode else 'networkidle'
        await page.goto(url, wait_until=wait_strategy, timeout=30000)
        goto_time = time.time() - fetch_start
        # Shorter delay for fast mode
        delay = random.uniform(0.1, 0.3) if fast_mode else random.uniform(0.3, 0.7)
        await asyncio.sleep(delay)
        content = await page.content()
        total_time = time.time() - fetch_start
        self.cache.set(url, content, 200)
        print(f" [Timing: goto={goto_time:.2f}s, total={total_time:.2f}s, mode={wait_strategy}]")
        return content
    except Exception as e:
        # NOTE(review): the original handler body is cut off at the diff hunk
        # boundary — reconstructed as log-and-return-None to match the
        # Optional[str] contract; confirm against the full file.
        print(f" ERROR fetching {url}: {e}")
        return None
@@ -139,7 +157,8 @@ class TroostwijkScraper:
print(f"LISTING PAGE {page_num}: {url}")
print(f"{'='*60}")
content = await self._get_page(page, url)
# Use fast mode - we only need HTML structure for link extraction
content = await self._get_page(page, url, fast_mode=True)
if not content:
return []
@@ -149,16 +168,27 @@ class TroostwijkScraper:
async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]:
    """Crawl an auction page and extract lot URLs.

    Args:
        page: Playwright page used for navigation.
        auction_url: URL of the auction overview page.

    Returns:
        List of lot URLs found on the auction page (empty on fetch failure).
    """
    # Use fast mode for auction pages - we only need the HTML structure, not all assets
    content = await self._get_page(page, auction_url, fast_mode=True)
    if not content:
        return []
    parse_start = time.time()
    page_data = self.parser.parse_page(content, auction_url)
    parse_time = (time.time() - parse_start) * 1000
    if page_data and page_data.get('type') == 'auction':
        # Persist the auction record and report per-stage timings.
        save_start = time.time()
        self.cache.save_auction(page_data)
        save_time = (time.time() - save_start) * 1000
        print(f" → Auction: {page_data.get('title', '')[:50]}... ({page_data.get('lots_count', 0)} lots)")
        print(f" [Parse: {parse_time:.0f}ms, Save: {save_time:.0f}ms]")
    extract_start = time.time()
    lot_urls = self._extract_lot_urls_from_auction(content, auction_url)
    extract_time = (time.time() - extract_start) * 1000
    print(f" [Extract lots: {extract_time:.0f}ms]")
    return lot_urls
async def crawl_page(self, page: Page, url: str) -> Optional[Dict]:
"""Crawl a page (auction or lot)"""
@@ -446,12 +476,25 @@ class TroostwijkScraper:
print("PHASE 2: EXTRACTING LOT URLs FROM AUCTIONS")
print("="*60)
phase2_start = time.time()
for i, auction_url in enumerate(all_auction_urls):
print(f"\n[{i+1:>3}/{len(all_auction_urls)}] {self.parser.extract_lot_id(auction_url)}")
auction_start = time.time()
auction_id = self.parser.extract_lot_id(auction_url)
print(f"\n[{i+1:>3}/{len(all_auction_urls)}] {auction_id}")
lot_urls = await self.crawl_auction_for_lots(page, auction_url)
auction_elapsed = time.time() - auction_start
if lot_urls:
all_lot_urls.extend(lot_urls)
print(f" → Found {len(lot_urls)} lots")
print(f" → Found {len(lot_urls)} lots (took {auction_elapsed:.2f}s)")
else:
print(f" → No lots found (took {auction_elapsed:.2f}s)")
# Progress estimation
avg_time = (time.time() - phase2_start) / (i + 1)
remaining = len(all_auction_urls) - (i + 1)
eta_seconds = avg_time * remaining
eta_minutes = eta_seconds / 60
print(f" → Progress: {len(all_lot_urls)} lots total | ETA: {eta_minutes:.1f} min ({avg_time:.2f}s/auction)")
all_lot_urls = list(set(all_lot_urls))
print(f"\n{'='*60}")