upgrade-speed-auctions
@@ -84,22 +84,40 @@ class TroostwijkScraper:
         self.last_request_time = time.time()
 
-    async def _get_page(self, page: Page, url: str, use_cache: bool = True) -> Optional[str]:
-        """Get page content with caching and strict rate limiting"""
+    async def _get_page(self, page: Page, url: str, use_cache: bool = True, fast_mode: bool = False) -> Optional[str]:
+        """Get page content with caching and strict rate limiting
+
+        Args:
+            fast_mode: If True, use 'domcontentloaded' instead of 'networkidle' for faster loading
+                       (useful for auction listing pages where we just need HTML structure)
+        """
         if use_cache:
+            cache_start = time.time()
            cached = self.cache.get(url)
            if cached:
-                print(f" CACHE HIT: {url}")
+                cache_time = (time.time() - cache_start) * 1000
+                print(f" CACHE HIT: {url} ({cache_time:.0f}ms)")
                return cached['content']
 
        await self._rate_limit()
 
        try:
+            fetch_start = time.time()
            print(f" FETCHING: {url}")
-            await page.goto(url, wait_until='networkidle', timeout=30000)
-            await asyncio.sleep(random.uniform(0.3, 0.7))
+
+            # Use faster loading strategy for auction pages (we only need HTML, not all assets)
+            wait_strategy = 'domcontentloaded' if fast_mode else 'networkidle'
+            await page.goto(url, wait_until=wait_strategy, timeout=30000)
+            goto_time = time.time() - fetch_start
+
+            # Shorter delay for fast mode
+            delay = random.uniform(0.1, 0.3) if fast_mode else random.uniform(0.3, 0.7)
+            await asyncio.sleep(delay)
 
            content = await page.content()
+            total_time = time.time() - fetch_start
            self.cache.set(url, content, 200)
+            print(f" [Timing: goto={goto_time:.2f}s, total={total_time:.2f}s, mode={wait_strategy}]")
            return content
 
        except Exception as e:
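Note on the wait strategies above: Playwright's 'networkidle' waits until there have been no network connections for at least 500 ms, while 'domcontentloaded' returns as soon as the HTML is parsed, without waiting for images, fonts, or trackers. A minimal standalone sketch of the same trade-off, using Playwright's async API (the URL is a placeholder; timings will vary per site):

    import asyncio
    import time

    from playwright.async_api import async_playwright

    async def timed_goto(url: str, fast_mode: bool) -> float:
        # Same strategy switch as _get_page above
        wait_strategy = 'domcontentloaded' if fast_mode else 'networkidle'
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()
            start = time.time()
            await page.goto(url, wait_until=wait_strategy, timeout=30000)
            elapsed = time.time() - start
            await browser.close()
        return elapsed

    async def main():
        slow = await timed_goto('https://example.com', fast_mode=False)
        fast = await timed_goto('https://example.com', fast_mode=True)
        print(f'networkidle: {slow:.2f}s, domcontentloaded: {fast:.2f}s')

    asyncio.run(main())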
@@ -139,7 +157,8 @@ class TroostwijkScraper:
        print(f"LISTING PAGE {page_num}: {url}")
        print(f"{'='*60}")
 
-        content = await self._get_page(page, url)
+        # Use fast mode - we only need HTML structure for link extraction
+        content = await self._get_page(page, url, fast_mode=True)
        if not content:
            return []
 
@@ -149,16 +168,27 @@ class TroostwijkScraper:
 
    async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]:
        """Crawl an auction page and extract lot URLs"""
-        content = await self._get_page(page, auction_url)
+        # Use fast mode for auction pages - we only need the HTML structure, not all assets
+        content = await self._get_page(page, auction_url, fast_mode=True)
        if not content:
            return []
 
+        parse_start = time.time()
        page_data = self.parser.parse_page(content, auction_url)
-        if page_data and page_data.get('type') == 'auction':
-            self.cache.save_auction(page_data)
-            print(f" → Auction: {page_data.get('title', '')[:50]}... ({page_data.get('lots_count', 0)} lots)")
+        parse_time = (time.time() - parse_start) * 1000
 
-        return self._extract_lot_urls_from_auction(content, auction_url)
+        if page_data and page_data.get('type') == 'auction':
+            save_start = time.time()
+            self.cache.save_auction(page_data)
+            save_time = (time.time() - save_start) * 1000
+            print(f" → Auction: {page_data.get('title', '')[:50]}... ({page_data.get('lots_count', 0)} lots)")
+            print(f" [Parse: {parse_time:.0f}ms, Save: {save_time:.0f}ms]")
+
+        extract_start = time.time()
+        lot_urls = self._extract_lot_urls_from_auction(content, auction_url)
+        extract_time = (time.time() - extract_start) * 1000
+        print(f" [Extract lots: {extract_time:.0f}ms]")
+        return lot_urls
 
    async def crawl_page(self, page: Page, url: str) -> Optional[Dict]:
        """Crawl a page (auction or lot)"""
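The parse/save/extract instrumentation above repeats the same start/stop/print pattern three times. A hypothetical context-manager helper (not part of this commit, just a sketch of how the pattern could be factored out):

    import time
    from contextlib import contextmanager

    @contextmanager
    def timed(label: str):
        # Prints elapsed wall-clock time in ms when the block exits
        start = time.time()
        yield
        elapsed_ms = (time.time() - start) * 1000
        print(f' [{label}: {elapsed_ms:.0f}ms]')

    # Usage, with a sleep standing in for parser.parse_page(...):
    with timed('Parse'):
        time.sleep(0.05)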
@@ -446,12 +476,25 @@ class TroostwijkScraper:
        print("PHASE 2: EXTRACTING LOT URLs FROM AUCTIONS")
        print("="*60)
 
+        phase2_start = time.time()
        for i, auction_url in enumerate(all_auction_urls):
-            print(f"\n[{i+1:>3}/{len(all_auction_urls)}] {self.parser.extract_lot_id(auction_url)}")
+            auction_start = time.time()
+            auction_id = self.parser.extract_lot_id(auction_url)
+            print(f"\n[{i+1:>3}/{len(all_auction_urls)}] {auction_id}")
            lot_urls = await self.crawl_auction_for_lots(page, auction_url)
+            auction_elapsed = time.time() - auction_start
            if lot_urls:
                all_lot_urls.extend(lot_urls)
-                print(f" → Found {len(lot_urls)} lots")
+                print(f" → Found {len(lot_urls)} lots (took {auction_elapsed:.2f}s)")
+            else:
+                print(f" → No lots found (took {auction_elapsed:.2f}s)")
+
+            # Progress estimation
+            avg_time = (time.time() - phase2_start) / (i + 1)
+            remaining = len(all_auction_urls) - (i + 1)
+            eta_seconds = avg_time * remaining
+            eta_minutes = eta_seconds / 60
+            print(f" → Progress: {len(all_lot_urls)} lots total | ETA: {eta_minutes:.1f} min ({avg_time:.2f}s/auction)")
 
        all_lot_urls = list(set(all_lot_urls))
        print(f"\n{'='*60}")
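The ETA line is a running average: total elapsed time in Phase 2 divided by auctions completed so far, multiplied by auctions remaining. A self-contained illustration of the arithmetic with made-up numbers (40 of 100 auctions done in 90 seconds):

    phase2_elapsed = 90.0          # seconds spent so far (assumed)
    done, total = 40, 100

    avg_time = phase2_elapsed / done         # 2.25 s/auction
    remaining = total - done                 # 60 auctions left
    eta_minutes = avg_time * remaining / 60  # 135 s -> 2.25 min
    print(f'ETA: {eta_minutes:.1f} min ({avg_time:.2f}s/auction)')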