#!/usr/bin/env python3
"""Core scaev module for Scaev Auctions"""

import sqlite3
import asyncio
import time
import random
import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin

from playwright.async_api import async_playwright, Page

from config import (
    BASE_URL,
    RATE_LIMIT_SECONDS,
    MAX_PAGES,
    DOWNLOAD_IMAGES,
    IMAGES_DIR
)
from cache import CacheManager
from parse import DataParser


class TroostwijkScraper:
    """Main scraper class for Troostwijk Auctions"""

    def __init__(self):
        self.base_url = BASE_URL
        self.cache = CacheManager()
        self.parser = DataParser()
        self.visited_lots: Set[str] = set()
        self.last_request_time = 0
        self.download_images = DOWNLOAD_IMAGES

    async def _download_image(self, url: str, lot_id: str, index: int) -> Optional[str]:
        """Download an image and save it locally"""
        if not self.download_images:
            return None
        try:
            import aiohttp  # lazy import: only required when image downloads are enabled

            lot_dir = Path(IMAGES_DIR) / lot_id
            lot_dir.mkdir(parents=True, exist_ok=True)

            # Derive the file extension from the URL, defaulting to jpg
            ext = url.split('.')[-1].split('?')[0]
            if ext not in ['jpg', 'jpeg', 'png', 'gif', 'webp']:
                ext = 'jpg'

            filepath = lot_dir / f"{index:03d}.{ext}"
            if filepath.exists():
                return str(filepath)

            await self._rate_limit()
            async with aiohttp.ClientSession() as session:
                async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                    if response.status == 200:
                        content = await response.read()
                        with open(filepath, 'wb') as f:
                            f.write(content)

                        # Record the local path so the cache knows the image was downloaded
                        with sqlite3.connect(self.cache.db_path) as conn:
                            conn.execute(
                                """UPDATE images
                                   SET local_path = ?, downloaded = 1
                                   WHERE lot_id = ? AND url = ?""",
                                (str(filepath), lot_id, url)
                            )
                            conn.commit()
                        return str(filepath)
        except Exception as e:
            print(f" ERROR downloading image: {e}")
            return None

    async def _rate_limit(self):
        """Enforce a minimum delay of RATE_LIMIT_SECONDS between requests"""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time
        if time_since_last < RATE_LIMIT_SECONDS:
            await asyncio.sleep(RATE_LIMIT_SECONDS - time_since_last)
        self.last_request_time = time.time()

    async def _get_page(self, page: Page, url: str, use_cache: bool = True) -> Optional[str]:
        """Get page content with caching and strict rate limiting"""
        if use_cache:
            cached = self.cache.get(url)
            if cached:
                print(f" CACHE HIT: {url}")
                return cached['content']

        await self._rate_limit()
        try:
            print(f" FETCHING: {url}")
            await page.goto(url, wait_until='networkidle', timeout=30000)
            await asyncio.sleep(random.uniform(0.3, 0.7))
            content = await page.content()
            self.cache.set(url, content, 200)
            return content
        except Exception as e:
            print(f" ERROR: {e}")
            self.cache.set(url, "", 500)
            return None

    def _extract_auction_urls_from_listing(self, content: str) -> List[str]:
        """Extract auction URLs from a listing page"""
        pattern = r'href=["\'](/a/[^"\']+)["\']'
        matches = re.findall(pattern, content, re.IGNORECASE)
        return list(set(urljoin(self.base_url, match) for match in matches))

    def _extract_lot_urls_from_auction(self, content: str, auction_url: str) -> List[str]:
        """Extract lot URLs from an auction page"""
        # Try the embedded Next.js JSON payload first
        try:
            match = re.search(
                r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>',
                content,
                re.DOTALL
            )
            if match:
                data = json.loads(match.group(1))
                lots = data.get('props', {}).get('pageProps', {}).get('auction', {}).get('lots', [])
                if lots:
                    return list(set(
                        f"{self.base_url}/l/{lot.get('urlSlug', '')}"
                        for lot in lots if lot.get('urlSlug')
                    ))
        except Exception:
            pass  # fall back to HTML parsing below

        # Fallback to HTML parsing
        pattern = r'href=["\'](/l/[^"\']+)["\']'
        matches = re.findall(pattern, content, re.IGNORECASE)
        return list(set(urljoin(self.base_url, match) for match in matches))
    async def crawl_listing_page(self, page: Page, page_num: int) -> List[str]:
        """Crawl a single listing page and return auction URLs"""
        url = f"{self.base_url}/auctions?page={page_num}"
        print(f"\n{'='*60}")
        print(f"LISTING PAGE {page_num}: {url}")
        print(f"{'='*60}")

        content = await self._get_page(page, url)
        if not content:
            return []

        auction_urls = self._extract_auction_urls_from_listing(content)
        print(f"→ Found {len(auction_urls)} auction URLs")
        return auction_urls

    async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]:
        """Crawl an auction page and extract lot URLs"""
        content = await self._get_page(page, auction_url)
        if not content:
            return []

        page_data = self.parser.parse_page(content, auction_url)
        if page_data and page_data.get('type') == 'auction':
            self.cache.save_auction(page_data)
            print(f" → Auction: {page_data.get('title', '')[:50]}... ({page_data.get('lots_count', 0)} lots)")

        return self._extract_lot_urls_from_auction(content, auction_url)

    async def crawl_page(self, page: Page, url: str) -> Optional[Dict]:
        """Crawl a page (auction or lot)"""
        if url in self.visited_lots:
            print(f" → Skipping (already visited): {url}")
            return None

        page_id = self.parser.extract_lot_id(url)
        print(f"\n[PAGE {page_id}]")

        content = await self._get_page(page, url)
        if not content:
            return None

        page_data = self.parser.parse_page(content, url)
        if not page_data:
            return None

        self.visited_lots.add(url)

        if page_data.get('type') == 'auction':
            print(" → Type: AUCTION")
            print(f" → Title: {page_data.get('title', 'N/A')[:60]}...")
            print(f" → Location: {page_data.get('location', 'N/A')}")
            print(f" → Lots: {page_data.get('lots_count', 0)}")
            self.cache.save_auction(page_data)
        elif page_data.get('type') == 'lot':
            print(" → Type: LOT")
            print(f" → Title: {page_data.get('title', 'N/A')[:60]}...")
            print(f" → Bid: {page_data.get('current_bid', 'N/A')}")
            print(f" → Location: {page_data.get('location', 'N/A')}")
            self.cache.save_lot(page_data)

            images = page_data.get('images', [])
            if images:
                self.cache.save_images(page_data['lot_id'], images)
                print(f" → Images: {len(images)}")
                if self.download_images:
                    for i, img_url in enumerate(images):
                        local_path = await self._download_image(img_url, page_data['lot_id'], i)
                        if local_path:
                            print(f"   ✓ Downloaded: {Path(local_path).name}")

        return page_data

    async def crawl_auctions(self, max_pages: int = MAX_PAGES) -> List[Dict]:
        """Main crawl function"""
        async with async_playwright() as p:
            print("Launching browser...")
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-blink-features=AutomationControlled'
                ]
            )
            page = await browser.new_page(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
            )
            await page.set_extra_http_headers({
                'Accept-Language': 'en-US,en;q=0.9',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
            })

            all_auction_urls = []
            all_lot_urls = []

            # Phase 1: Collect auction URLs
            print("\n" + "="*60)
            print("PHASE 1: COLLECTING AUCTION URLs FROM LISTING PAGES")
            print("="*60)

            for page_num in range(1, max_pages + 1):
                auction_urls = await self.crawl_listing_page(page, page_num)
                if not auction_urls:
                    print(f"No auctions found on page {page_num}, stopping")
                    break
                all_auction_urls.extend(auction_urls)
                print(f" → Total auctions collected so far: {len(all_auction_urls)}")

            all_auction_urls = list(set(all_auction_urls))
            print(f"\n{'='*60}")
            print(f"PHASE 1 COMPLETE: {len(all_auction_urls)} UNIQUE AUCTIONS")
            print(f"{'='*60}")

            # Phase 2: Extract lot URLs from each auction
            print("\n" + "="*60)
            print("PHASE 2: EXTRACTING LOT URLs FROM AUCTIONS")
            print("="*60)

            for i, auction_url in enumerate(all_auction_urls):
                print(f"\n[{i+1:>3}/{len(all_auction_urls)}] {self.parser.extract_lot_id(auction_url)}")
                lot_urls = await self.crawl_auction_for_lots(page, auction_url)
                if lot_urls:
                    all_lot_urls.extend(lot_urls)
                    print(f" → Found {len(lot_urls)} lots")

            all_lot_urls = list(set(all_lot_urls))
            print(f"\n{'='*60}")
            print(f"PHASE 2 COMPLETE: {len(all_lot_urls)} UNIQUE LOTS")
            print(f"{'='*60}")

            # Phase 3: Scrape each lot page
            print("\n" + "="*60)
            print("PHASE 3: SCRAPING INDIVIDUAL LOT PAGES")
            print("="*60)

            results = []
            for i, lot_url in enumerate(all_lot_urls):
                print(f"\n[{i+1:>3}/{len(all_lot_urls)}] ", end="")
                page_data = await self.crawl_page(page, lot_url)
                if page_data:
                    results.append(page_data)

            await browser.close()
            return results
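

# Minimal usage sketch (assumption: the module is run directly; the original
# entry point is not shown here). It drives the full three-phase crawl with
# asyncio and prints a one-line summary of the scraped lot dictionaries.
if __name__ == "__main__":
    async def _main():
        scraper = TroostwijkScraper()
        results = await scraper.crawl_auctions(max_pages=MAX_PAGES)
        print(f"\nScraped {len(results)} pages in total")

    asyncio.run(_main())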