#!/usr/bin/env python3
"""
Core scaev module for Scaev Auctions
"""

import sqlite3
import asyncio
import time
import random
import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin

from playwright.async_api import async_playwright, Page

from config import (
    BASE_URL, RATE_LIMIT_SECONDS, MAX_PAGES, DOWNLOAD_IMAGES, IMAGES_DIR
)
from cache import CacheManager
from parse import DataParser

class TroostwijkScraper:
    """Main scraper class for Troostwijk Auctions"""

    def __init__(self):
        self.base_url = BASE_URL
        self.cache = CacheManager()
        self.parser = DataParser()
        self.visited_lots: Set[str] = set()
        self.last_request_time = 0
        self.download_images = DOWNLOAD_IMAGES
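
    # Crawl pipeline (driven by crawl_auctions):
    #   Phase 1: listing pages -> auction URLs (crawl_listing_page)
    #   Phase 2: auction pages -> lot URLs     (crawl_auction_for_lots)
    #   Phase 3: lot pages     -> parsed data  (crawl_page)
    # Every fetch goes through _get_page, which returns cached content when
    # available and otherwise throttles the request via _rate_limit.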

    async def _download_image(self, url: str, lot_id: str, index: int) -> Optional[str]:
        """Download an image and save it locally"""
        if not self.download_images:
            return None

        try:
            import aiohttp

            # One subdirectory per lot: IMAGES_DIR/<lot_id>/<index>.<ext>
            lot_dir = Path(IMAGES_DIR) / lot_id
            lot_dir.mkdir(parents=True, exist_ok=True)

            # Derive the extension from the URL; fall back to jpg for anything unexpected
            ext = url.split('.')[-1].split('?')[0].lower()
            if ext not in ['jpg', 'jpeg', 'png', 'gif', 'webp']:
                ext = 'jpg'

            filepath = lot_dir / f"{index:03d}.{ext}"
            if filepath.exists():
                return str(filepath)

            await self._rate_limit()

            async with aiohttp.ClientSession() as session:
                async with session.get(url, timeout=30) as response:
                    if response.status == 200:
                        content = await response.read()
                        with open(filepath, 'wb') as f:
                            f.write(content)

                        # Record the local path in the cache database
                        with sqlite3.connect(self.cache.db_path) as conn:
                            conn.execute(
                                "UPDATE images "
                                "SET local_path = ?, downloaded = 1 "
                                "WHERE lot_id = ? AND url = ?",
                                (str(filepath), lot_id, url)
                            )
                            conn.commit()
                        return str(filepath)

        except Exception as e:
            print(f" ERROR downloading image: {e}")

        return None

    async def _rate_limit(self):
        """Ensure at least RATE_LIMIT_SECONDS elapse between requests"""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time

        if time_since_last < RATE_LIMIT_SECONDS:
            await asyncio.sleep(RATE_LIMIT_SECONDS - time_since_last)

        self.last_request_time = time.time()
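
    # Example: with RATE_LIMIT_SECONDS = 0.5, a request issued 0.2 s after the
    # previous one sleeps for the remaining 0.3 s, keeping fetches at least
    # 0.5 s apart. Cache hits in _get_page return before this throttle runs.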

    async def _get_page(self, page: Page, url: str, use_cache: bool = True) -> Optional[str]:
        """Get page content with caching and strict rate limiting"""
        if use_cache:
            cached = self.cache.get(url)
            if cached:
                print(f" CACHE HIT: {url}")
                return cached['content']

        await self._rate_limit()

        try:
            print(f" FETCHING: {url}")
            await page.goto(url, wait_until='networkidle', timeout=30000)
            await asyncio.sleep(random.uniform(0.3, 0.7))
            content = await page.content()
            self.cache.set(url, content, 200)
            return content

        except Exception as e:
            print(f" ERROR: {e}")
            self.cache.set(url, "", 500)
            return None

    def _extract_auction_urls_from_listing(self, content: str) -> List[str]:
        """Extract auction URLs from a listing page"""
        pattern = r'href=["\'](/a/[^"\']+)["\']'
        matches = re.findall(pattern, content, re.IGNORECASE)
        return list(set(urljoin(self.base_url, match) for match in matches))
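
    # Illustrative (hypothetical) markup matched by the pattern above:
    #   <a href="/a/industrial-machinery-auction">...</a>
    # The captured relative path is resolved against base_url with urljoin.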

    def _extract_lot_urls_from_auction(self, content: str, auction_url: str) -> List[str]:
        """Extract lot URLs from an auction page"""
        # Try the embedded Next.js data first
        try:
            match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
            if match:
                data = json.loads(match.group(1))
                lots = data.get('props', {}).get('pageProps', {}).get('auction', {}).get('lots', [])
                if lots:
                    return list(set(f"{self.base_url}/l/{lot.get('urlSlug', '')}"
                                    for lot in lots if lot.get('urlSlug')))
        except Exception:
            pass

        # Fall back to HTML parsing
        pattern = r'href=["\'](/l/[^"\']+)["\']'
        matches = re.findall(pattern, content, re.IGNORECASE)
        return list(set(urljoin(self.base_url, match) for match in matches))
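
    # Shape of the __NEXT_DATA__ payload the parser above relies on:
    #   {"props": {"pageProps": {"auction": {"lots": [{"urlSlug": "..."}, ...]}}}}
    # Each urlSlug becomes an absolute lot URL of the form <base_url>/l/<urlSlug>.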

    async def crawl_listing_page(self, page: Page, page_num: int) -> List[str]:
        """Crawl a single listing page and return auction URLs"""
        url = f"{self.base_url}/auctions?page={page_num}"
        print(f"\n{'='*60}")
        print(f"LISTING PAGE {page_num}: {url}")
        print(f"{'='*60}")

        content = await self._get_page(page, url)
        if not content:
            return []

        auction_urls = self._extract_auction_urls_from_listing(content)
        print(f"→ Found {len(auction_urls)} auction URLs")
        return auction_urls

    async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]:
        """Crawl an auction page and extract lot URLs"""
        content = await self._get_page(page, auction_url)
        if not content:
            return []

        page_data = self.parser.parse_page(content, auction_url)
        if page_data and page_data.get('type') == 'auction':
            self.cache.save_auction(page_data)
            print(f" → Auction: {page_data.get('title', '')[:50]}... ({page_data.get('lots_count', 0)} lots)")

        return self._extract_lot_urls_from_auction(content, auction_url)

    async def crawl_page(self, page: Page, url: str) -> Optional[Dict]:
        """Crawl a page (auction or lot)"""
        if url in self.visited_lots:
            print(f" → Skipping (already visited): {url}")
            return None

        page_id = self.parser.extract_lot_id(url)
        print(f"\n[PAGE {page_id}]")

        content = await self._get_page(page, url)
        if not content:
            return None

        page_data = self.parser.parse_page(content, url)
        if not page_data:
            return None

        self.visited_lots.add(url)

        if page_data.get('type') == 'auction':
            print(f" → Type: AUCTION")
            print(f" → Title: {page_data.get('title', 'N/A')[:60]}...")
            print(f" → Location: {page_data.get('location', 'N/A')}")
            print(f" → Lots: {page_data.get('lots_count', 0)}")
            self.cache.save_auction(page_data)

        elif page_data.get('type') == 'lot':
            print(f" → Type: LOT")
            print(f" → Title: {page_data.get('title', 'N/A')[:60]}...")
            print(f" → Bid: {page_data.get('current_bid', 'N/A')}")
            print(f" → Location: {page_data.get('location', 'N/A')}")
            self.cache.save_lot(page_data)

            images = page_data.get('images', [])
            if images:
                self.cache.save_images(page_data['lot_id'], images)
                print(f" → Images: {len(images)}")

                if self.download_images:
                    for i, img_url in enumerate(images):
                        local_path = await self._download_image(img_url, page_data['lot_id'], i)
                        if local_path:
                            print(f" ✓ Downloaded: {Path(local_path).name}")

        return page_data

    async def crawl_auctions(self, max_pages: int = MAX_PAGES) -> List[Dict]:
        """Main crawl function"""
        async with async_playwright() as p:
            print("Launching browser...")
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-blink-features=AutomationControlled'
                ]
            )

            page = await browser.new_page(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
            )

            await page.set_extra_http_headers({
                'Accept-Language': 'en-US,en;q=0.9',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
            })

            all_auction_urls = []
            all_lot_urls = []

            # Phase 1: Collect auction URLs
            print("\n" + "="*60)
            print("PHASE 1: COLLECTING AUCTION URLs FROM LISTING PAGES")
            print("="*60)

            for page_num in range(1, max_pages + 1):
                auction_urls = await self.crawl_listing_page(page, page_num)
                if not auction_urls:
                    print(f"No auctions found on page {page_num}, stopping")
                    break
                all_auction_urls.extend(auction_urls)
                print(f" → Total auctions collected so far: {len(all_auction_urls)}")

            all_auction_urls = list(set(all_auction_urls))
            print(f"\n{'='*60}")
            print(f"PHASE 1 COMPLETE: {len(all_auction_urls)} UNIQUE AUCTIONS")
            print(f"{'='*60}")

            # Phase 2: Extract lot URLs from each auction
            print("\n" + "="*60)
            print("PHASE 2: EXTRACTING LOT URLs FROM AUCTIONS")
            print("="*60)

            for i, auction_url in enumerate(all_auction_urls):
                print(f"\n[{i+1:>3}/{len(all_auction_urls)}] {self.parser.extract_lot_id(auction_url)}")
                lot_urls = await self.crawl_auction_for_lots(page, auction_url)
                if lot_urls:
                    all_lot_urls.extend(lot_urls)
                    print(f" → Found {len(lot_urls)} lots")

            all_lot_urls = list(set(all_lot_urls))
            print(f"\n{'='*60}")
            print(f"PHASE 2 COMPLETE: {len(all_lot_urls)} UNIQUE LOTS")
            print(f"{'='*60}")

            # Phase 3: Scrape each lot page
            print("\n" + "="*60)
            print("PHASE 3: SCRAPING INDIVIDUAL LOT PAGES")
            print("="*60)

            results = []
            for i, lot_url in enumerate(all_lot_urls):
                print(f"\n[{i+1:>3}/{len(all_lot_urls)}] ", end="")
                page_data = await self.crawl_page(page, lot_url)
                if page_data:
                    results.append(page_data)

            await browser.close()
            return results
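

# Usage sketch (assumption: this module is normally driven by a separate entry
# point; the commented snippet below only illustrates how crawl_auctions() is
# meant to be invoked):
#
#   if __name__ == "__main__":
#       scraper = TroostwijkScraper()
#       results = asyncio.run(scraper.crawl_auctions(max_pages=MAX_PAGES))
#       print(f"Scraped {len(results)} lot pages")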