scaev/src/scraper.py
#!/usr/bin/env python3
"""
Core scraper module for Scaev Auctions: crawls Troostwijk auction listings, auctions, and lots
"""
import sqlite3
import asyncio
import time
import random
import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin
from playwright.async_api import async_playwright, Page
from config import (
BASE_URL, RATE_LIMIT_SECONDS, MAX_PAGES, DOWNLOAD_IMAGES, IMAGES_DIR
)
from cache import CacheManager
from parse import DataParser
class TroostwijkScraper:
"""Main scraper class for Troostwijk Auctions"""
def __init__(self):
self.base_url = BASE_URL
self.cache = CacheManager()
self.parser = DataParser()
self.visited_lots: Set[str] = set()
self.last_request_time = 0
self.download_images = DOWNLOAD_IMAGES
async def _download_image(self, url: str, lot_id: str, index: int) -> Optional[str]:
"""Download an image and save it locally"""
if not self.download_images:
return None
try:
import aiohttp
lot_dir = Path(IMAGES_DIR) / lot_id
            lot_dir.mkdir(parents=True, exist_ok=True)
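            # Derive the file extension from the URL, defaulting to jpg for unknown types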
ext = url.split('.')[-1].split('?')[0]
if ext not in ['jpg', 'jpeg', 'png', 'gif', 'webp']:
ext = 'jpg'
filepath = lot_dir / f"{index:03d}.{ext}"
if filepath.exists():
return str(filepath)
await self._rate_limit()
async with aiohttp.ClientSession() as session:
async with session.get(url, timeout=30) as response:
if response.status == 200:
content = await response.read()
with open(filepath, 'wb') as f:
f.write(content)
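                        # Record the saved path and mark the image as downloaded
                        # in the cache database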
                        with sqlite3.connect(self.cache.db_path) as conn:
                            conn.execute(
                                "UPDATE images SET local_path = ?, downloaded = 1 "
                                "WHERE lot_id = ? AND url = ?",
                                (str(filepath), lot_id, url)
                            )
                            conn.commit()
return str(filepath)
except Exception as e:
print(f" ERROR downloading image: {e}")
return None
async def _rate_limit(self):
"""ENSURE EXACTLY 0.5s BETWEEN REQUESTS"""
current_time = time.time()
time_since_last = current_time - self.last_request_time
if time_since_last < RATE_LIMIT_SECONDS:
await asyncio.sleep(RATE_LIMIT_SECONDS - time_since_last)
self.last_request_time = time.time()
async def _get_page(self, page: Page, url: str, use_cache: bool = True) -> Optional[str]:
"""Get page content with caching and strict rate limiting"""
if use_cache:
cached = self.cache.get(url)
if cached:
print(f" CACHE HIT: {url}")
return cached['content']
await self._rate_limit()
try:
print(f" FETCHING: {url}")
await page.goto(url, wait_until='networkidle', timeout=30000)
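            # Short random pause after load so request timing is less uniform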
await asyncio.sleep(random.uniform(0.3, 0.7))
content = await page.content()
self.cache.set(url, content, 200)
return content
except Exception as e:
print(f" ERROR: {e}")
self.cache.set(url, "", 500)
return None
def _extract_auction_urls_from_listing(self, content: str) -> List[str]:
"""Extract auction URLs from listing page"""
pattern = r'href=["\']([/]a/[^"\']+)["\']'
matches = re.findall(pattern, content, re.IGNORECASE)
return list(set(urljoin(self.base_url, match) for match in matches))
def _extract_lot_urls_from_auction(self, content: str, auction_url: str) -> List[str]:
"""Extract lot URLs from an auction page"""
# Try Next.js data first
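        # The site is built with Next.js, so the auction's lot list is embedded as
        # JSON inside the __NEXT_DATA__ <script> tag; parse that first and only
        # fall back to scraping hrefs from the rendered HTML if it is missing.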
try:
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
if match:
data = json.loads(match.group(1))
lots = data.get('props', {}).get('pageProps', {}).get('auction', {}).get('lots', [])
if lots:
return list(set(f"{self.base_url}/l/{lot.get('urlSlug', '')}"
for lot in lots if lot.get('urlSlug')))
        except (json.JSONDecodeError, AttributeError):
            pass  # fall back to HTML parsing below
# Fallback to HTML parsing
pattern = r'href=["\']([/]l/[^"\']+)["\']'
matches = re.findall(pattern, content, re.IGNORECASE)
return list(set(urljoin(self.base_url, match) for match in matches))
async def crawl_listing_page(self, page: Page, page_num: int) -> List[str]:
"""Crawl a single listing page and return auction URLs"""
url = f"{self.base_url}/auctions?page={page_num}"
print(f"\n{'='*60}")
print(f"LISTING PAGE {page_num}: {url}")
print(f"{'='*60}")
content = await self._get_page(page, url)
if not content:
return []
auction_urls = self._extract_auction_urls_from_listing(content)
print(f"→ Found {len(auction_urls)} auction URLs")
return auction_urls
async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]:
"""Crawl an auction page and extract lot URLs"""
content = await self._get_page(page, auction_url)
if not content:
return []
page_data = self.parser.parse_page(content, auction_url)
if page_data and page_data.get('type') == 'auction':
self.cache.save_auction(page_data)
print(f" → Auction: {page_data.get('title', '')[:50]}... ({page_data.get('lots_count', 0)} lots)")
return self._extract_lot_urls_from_auction(content, auction_url)
async def crawl_page(self, page: Page, url: str) -> Optional[Dict]:
"""Crawl a page (auction or lot)"""
if url in self.visited_lots:
print(f" → Skipping (already visited): {url}")
return None
page_id = self.parser.extract_lot_id(url)
print(f"\n[PAGE {page_id}]")
content = await self._get_page(page, url)
if not content:
return None
page_data = self.parser.parse_page(content, url)
if not page_data:
return None
self.visited_lots.add(url)
if page_data.get('type') == 'auction':
print(f" → Type: AUCTION")
print(f" → Title: {page_data.get('title', 'N/A')[:60]}...")
print(f" → Location: {page_data.get('location', 'N/A')}")
print(f" → Lots: {page_data.get('lots_count', 0)}")
self.cache.save_auction(page_data)
elif page_data.get('type') == 'lot':
print(f" → Type: LOT")
print(f" → Title: {page_data.get('title', 'N/A')[:60]}...")
print(f" → Bid: {page_data.get('current_bid', 'N/A')}")
print(f" → Location: {page_data.get('location', 'N/A')}")
self.cache.save_lot(page_data)
images = page_data.get('images', [])
if images:
self.cache.save_images(page_data['lot_id'], images)
print(f" → Images: {len(images)}")
if self.download_images:
for i, img_url in enumerate(images):
local_path = await self._download_image(img_url, page_data['lot_id'], i)
if local_path:
print(f" ✓ Downloaded: {Path(local_path).name}")
return page_data
async def crawl_auctions(self, max_pages: int = MAX_PAGES) -> List[Dict]:
"""Main crawl function"""
async with async_playwright() as p:
print("Launching browser...")
browser = await p.chromium.launch(
headless=True,
args=[
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-blink-features=AutomationControlled'
]
)
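            # A realistic viewport and user agent (plus the
            # --disable-blink-features=AutomationControlled launch flag above)
            # make the headless browser look less like an automated client.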
page = await browser.new_page(
viewport={'width': 1920, 'height': 1080},
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
)
await page.set_extra_http_headers({
'Accept-Language': 'en-US,en;q=0.9',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
})
all_auction_urls = []
all_lot_urls = []
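            # The crawl runs in three phases: listing pages -> auction pages -> lot
            # pages, deduplicating URLs between phases.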
# Phase 1: Collect auction URLs
print("\n" + "="*60)
print("PHASE 1: COLLECTING AUCTION URLs FROM LISTING PAGES")
print("="*60)
for page_num in range(1, max_pages + 1):
auction_urls = await self.crawl_listing_page(page, page_num)
if not auction_urls:
print(f"No auctions found on page {page_num}, stopping")
break
all_auction_urls.extend(auction_urls)
print(f" → Total auctions collected so far: {len(all_auction_urls)}")
all_auction_urls = list(set(all_auction_urls))
print(f"\n{'='*60}")
print(f"PHASE 1 COMPLETE: {len(all_auction_urls)} UNIQUE AUCTIONS")
print(f"{'='*60}")
# Phase 2: Extract lot URLs from each auction
print("\n" + "="*60)
print("PHASE 2: EXTRACTING LOT URLs FROM AUCTIONS")
print("="*60)
for i, auction_url in enumerate(all_auction_urls):
print(f"\n[{i+1:>3}/{len(all_auction_urls)}] {self.parser.extract_lot_id(auction_url)}")
lot_urls = await self.crawl_auction_for_lots(page, auction_url)
if lot_urls:
all_lot_urls.extend(lot_urls)
print(f" → Found {len(lot_urls)} lots")
all_lot_urls = list(set(all_lot_urls))
print(f"\n{'='*60}")
print(f"PHASE 2 COMPLETE: {len(all_lot_urls)} UNIQUE LOTS")
print(f"{'='*60}")
# Phase 3: Scrape each lot page
print("\n" + "="*60)
print("PHASE 3: SCRAPING INDIVIDUAL LOT PAGES")
print("="*60)
results = []
for i, lot_url in enumerate(all_lot_urls):
print(f"\n[{i+1:>3}/{len(all_lot_urls)}] ", end="")
page_data = await self.crawl_page(page, lot_url)
if page_data:
results.append(page_data)
await browser.close()
return results