Commit 79e14be37a by Tour, 2025-12-04 14:49:58 +01:00
22 changed files with 2765 additions and 0 deletions

src/cache.py (new file, 178 lines)

@@ -0,0 +1,178 @@
#!/usr/bin/env python3
"""
Cache Manager module for SQLite-based caching and data storage
"""
import sqlite3
import time
import zlib
from datetime import datetime
from typing import Dict, List, Optional
import config
class CacheManager:
"""Manages page caching and data storage using SQLite"""
def __init__(self, db_path: Optional[str] = None):
self.db_path = db_path or config.CACHE_DB
self._init_db()
def _init_db(self):
"""Initialize cache and data storage database"""
with sqlite3.connect(self.db_path) as conn:
conn.execute("""
CREATE TABLE IF NOT EXISTS cache (
url TEXT PRIMARY KEY,
content BLOB,
timestamp REAL,
status_code INTEGER
)
""")
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_timestamp ON cache(timestamp)
""")
conn.execute("""
CREATE TABLE IF NOT EXISTS auctions (
auction_id TEXT PRIMARY KEY,
url TEXT UNIQUE,
title TEXT,
location TEXT,
lots_count INTEGER,
first_lot_closing_time TEXT,
scraped_at TEXT
)
""")
conn.execute("""
CREATE TABLE IF NOT EXISTS lots (
lot_id TEXT PRIMARY KEY,
auction_id TEXT,
url TEXT UNIQUE,
title TEXT,
current_bid TEXT,
bid_count INTEGER,
closing_time TEXT,
viewing_time TEXT,
pickup_date TEXT,
location TEXT,
description TEXT,
category TEXT,
scraped_at TEXT,
FOREIGN KEY (auction_id) REFERENCES auctions(auction_id)
)
""")
conn.execute("""
CREATE TABLE IF NOT EXISTS images (
id INTEGER PRIMARY KEY AUTOINCREMENT,
lot_id TEXT,
url TEXT,
local_path TEXT,
downloaded INTEGER DEFAULT 0,
FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
)
""")
conn.commit()
def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
"""Get cached page if it exists and is not too old"""
with sqlite3.connect(self.db_path) as conn:
cursor = conn.execute(
"SELECT content, timestamp, status_code FROM cache WHERE url = ?",
(url,)
)
row = cursor.fetchone()
if row:
content, timestamp, status_code = row
age_hours = (time.time() - timestamp) / 3600
if age_hours <= max_age_hours:
try:
content = zlib.decompress(content).decode('utf-8')
except Exception as e:
print(f" ⚠️ Failed to decompress cache for {url}: {e}")
return None
return {
'content': content,
'timestamp': timestamp,
'status_code': status_code,
'cached': True
}
return None
def set(self, url: str, content: str, status_code: int = 200):
"""Cache a page with compression"""
with sqlite3.connect(self.db_path) as conn:
compressed_content = zlib.compress(content.encode('utf-8'), level=9)
original_size = len(content.encode('utf-8'))
compressed_size = len(compressed_content)
ratio = (1 - compressed_size / original_size) * 100 if original_size > 0 else 0
conn.execute(
"INSERT OR REPLACE INTO cache (url, content, timestamp, status_code) VALUES (?, ?, ?, ?)",
(url, compressed_content, time.time(), status_code)
)
conn.commit()
print(f" → Cached: {url} (compressed {ratio:.1f}%)")
def clear_old(self, max_age_hours: int = 168):
"""Clear old cache entries to prevent database bloat"""
cutoff_time = time.time() - (max_age_hours * 3600)
with sqlite3.connect(self.db_path) as conn:
deleted = conn.execute("DELETE FROM cache WHERE timestamp < ?", (cutoff_time,)).rowcount
conn.commit()
if deleted > 0:
print(f" → Cleared {deleted} old cache entries")
def save_auction(self, auction_data: Dict):
"""Save auction data to database"""
with sqlite3.connect(self.db_path) as conn:
conn.execute("""
INSERT OR REPLACE INTO auctions
(auction_id, url, title, location, lots_count, first_lot_closing_time, scraped_at)
VALUES (?, ?, ?, ?, ?, ?, ?)
""", (
auction_data['auction_id'],
auction_data['url'],
auction_data['title'],
auction_data['location'],
auction_data.get('lots_count', 0),
auction_data.get('first_lot_closing_time', ''),
auction_data['scraped_at']
))
conn.commit()
def save_lot(self, lot_data: Dict):
"""Save lot data to database"""
with sqlite3.connect(self.db_path) as conn:
conn.execute("""
INSERT OR REPLACE INTO lots
(lot_id, auction_id, url, title, current_bid, bid_count, closing_time,
viewing_time, pickup_date, location, description, category, scraped_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
lot_data['lot_id'],
lot_data.get('auction_id', ''),
lot_data['url'],
lot_data['title'],
lot_data.get('current_bid', ''),
lot_data.get('bid_count', 0),
lot_data.get('closing_time', ''),
lot_data.get('viewing_time', ''),
lot_data.get('pickup_date', ''),
lot_data.get('location', ''),
lot_data.get('description', ''),
lot_data.get('category', ''),
lot_data['scraped_at']
))
conn.commit()
def save_images(self, lot_id: str, image_urls: List[str]):
"""Save image URLs for a lot"""
with sqlite3.connect(self.db_path) as conn:
for url in image_urls:
conn.execute("""
INSERT OR IGNORE INTO images (lot_id, url) VALUES (?, ?)
""", (lot_id, url))
conn.commit()
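
A minimal usage sketch (not part of this commit), assuming the output directories configured in config.py can be created; the db_path and URL below are placeholders:

# Round-trip a page through the cache and prune old entries.
from cache import CacheManager

cache = CacheManager(db_path="/tmp/example_cache.db")  # hypothetical path
url = "https://www.troostwijkauctions.com/l/example-lot"  # placeholder URL
cache.set(url, "<html>...</html>", status_code=200)

hit = cache.get(url, max_age_hours=24)
if hit:
    print(hit['status_code'], len(hit['content']), hit['cached'])

cache.clear_old(max_age_hours=168)  # drop entries older than 7 days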

src/config.py (new file, 26 lines)

@@ -0,0 +1,26 @@
#!/usr/bin/env python3
"""
Configuration module for Scaev Auctions Scraper
"""
import sys
from pathlib import Path
# Require Python 3.10+
if sys.version_info < (3, 10):
print("ERROR: This script requires Python 3.10 or higher")
print(f"Current version: {sys.version}")
sys.exit(1)
# ==================== CONFIGURATION ====================
BASE_URL = "https://www.troostwijkauctions.com"
CACHE_DB = "/mnt/okcomputer/output/cache.db"
OUTPUT_DIR = "/mnt/okcomputer/output"
IMAGES_DIR = "/mnt/okcomputer/output/images"
RATE_LIMIT_SECONDS = 0.5 # EXACTLY 0.5 seconds between requests
MAX_PAGES = 50 # Number of listing pages to crawl
DOWNLOAD_IMAGES = False # Set to True to download images
# Setup directories
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
Path(IMAGES_DIR).mkdir(parents=True, exist_ok=True)
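
A small illustration (not part of this commit) of how callers consume these settings; overriding MAX_PAGES like this is only a hypothetical way to shorten a smoke-test run:

import config

print(config.BASE_URL)            # https://www.troostwijkauctions.com
print(config.RATE_LIMIT_SECONDS)  # 0.5 seconds between requests
print(config.MAX_PAGES)           # 50 listing pages by default

# The values are plain module attributes, so a caller could lower the page
# count before constructing the scraper:
config.MAX_PAGES = 2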

src/main.py (new file, 81 lines)

@@ -0,0 +1,81 @@
#!/usr/bin/env python3
"""
Troostwijk Auctions Scraper - Main Entry Point
Focuses on extracting auction lots with caching and rate limiting
"""
import sys
import asyncio
import config
from scraper import TroostwijkScraper
def main():
"""Main execution"""
# Check for test mode
if len(sys.argv) > 1 and sys.argv[1] == "--test":
# Import test function only when needed to avoid circular imports
from test import test_extraction
test_url = sys.argv[2] if len(sys.argv) > 2 else None
if test_url:
test_extraction(test_url)
else:
test_extraction()
return
print("Troostwijk Auctions Scraper")
print("=" * 60)
print(f"Rate limit: {config.RATE_LIMIT_SECONDS} seconds BETWEEN EVERY REQUEST")
print(f"Cache database: {config.CACHE_DB}")
print(f"Output directory: {config.OUTPUT_DIR}")
print(f"Max listing pages: {config.MAX_PAGES}")
print("=" * 60)
scraper = TroostwijkScraper()
try:
# Clear old cache (older than 7 days) - KEEP DATABASE CLEAN
scraper.cache.clear_old(max_age_hours=168)
# Run the crawler
results = asyncio.run(scraper.crawl_auctions(max_pages=config.MAX_PAGES))
# Export results to files
print("\n" + "="*60)
print("EXPORTING RESULTS TO FILES")
print("="*60)
files = scraper.export_to_files()
print("\n" + "="*60)
print("CRAWLING COMPLETED SUCCESSFULLY")
print("="*60)
print(f"Total pages scraped: {len(results)}")
print(f"\nAuctions JSON: {files['auctions_json']}")
print(f"Auctions CSV: {files['auctions_csv']}")
print(f"Lots JSON: {files['lots_json']}")
print(f"Lots CSV: {files['lots_csv']}")
# Count auctions vs lots
auctions = [r for r in results if r.get('type') == 'auction']
lots = [r for r in results if r.get('type') == 'lot']
print(f"\n Auctions: {len(auctions)}")
print(f" Lots: {len(lots)}")
except KeyboardInterrupt:
print("\nScraping interrupted by user - partial results saved in output directory")
except Exception as e:
print(f"\nERROR during scraping: {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
    main()
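
Invocation, as implied by the argument handling above: python src/main.py runs the full crawl; python src/main.py --test [url] skips crawling and runs the cached-page extraction test from src/test.py against an already-cached page.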

src/parse.py (new file, 303 lines)

@@ -0,0 +1,303 @@
#!/usr/bin/env python3
"""
Parser module for extracting data from HTML/JSON content
"""
import json
import re
import html
from datetime import datetime
from urllib.parse import urljoin, urlparse
from typing import Dict, List, Optional
from config import BASE_URL
class DataParser:
"""Handles all data extraction from HTML/JSON content"""
@staticmethod
def extract_lot_id(url: str) -> str:
"""Extract lot ID from URL"""
path = urlparse(url).path
match = re.search(r'/lots/(\d+)', path)
if match:
return match.group(1)
match = re.search(r'/a/.*?([A-Z]\d+-\d+)', path)
if match:
return match.group(1)
return path.split('/')[-1] if path else ""
@staticmethod
def clean_text(text: str) -> str:
"""Clean extracted text"""
text = html.unescape(text)
text = re.sub(r'\s+', ' ', text)
return text.strip()
@staticmethod
def format_timestamp(timestamp) -> str:
"""Convert Unix timestamp to readable date"""
try:
if isinstance(timestamp, (int, float)) and timestamp > 0:
return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
return str(timestamp) if timestamp else ''
except Exception:
return str(timestamp) if timestamp else ''
@staticmethod
def format_currency(amount) -> str:
"""Format currency amount"""
if isinstance(amount, (int, float)):
return f"{amount:,.2f}" if amount > 0 else "€0"
return str(amount) if amount else "€0"
def parse_page(self, content: str, url: str) -> Optional[Dict]:
"""Parse page and determine if it's an auction or lot"""
next_data = self._extract_nextjs_data(content, url)
if next_data:
return next_data
content = re.sub(r'\s+', ' ', content)
return {
'type': 'lot',
'url': url,
'lot_id': self.extract_lot_id(url),
'title': self._extract_meta_content(content, 'og:title'),
'current_bid': self._extract_current_bid(content),
'bid_count': self._extract_bid_count(content),
'closing_time': self._extract_end_date(content),
'location': self._extract_location(content),
'description': self._extract_description(content),
'category': self._extract_category(content),
'images': self._extract_images(content),
'scraped_at': datetime.now().isoformat()
}
def _extract_nextjs_data(self, content: str, url: str) -> Optional[Dict]:
"""Extract data from Next.js __NEXT_DATA__ JSON"""
try:
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
if not match:
return None
data = json.loads(match.group(1))
page_props = data.get('props', {}).get('pageProps', {})
if 'lot' in page_props:
return self._parse_lot_json(page_props.get('lot', {}), url)
if 'auction' in page_props:
return self._parse_auction_json(page_props.get('auction', {}), url)
return None
except Exception as e:
print(f" → Error parsing __NEXT_DATA__: {e}")
return None
def _parse_lot_json(self, lot_data: Dict, url: str) -> Dict:
"""Parse lot data from JSON"""
location_data = lot_data.get('location', {})
city = location_data.get('city', '')
country = location_data.get('countryCode', '').upper()
location = f"{city}, {country}" if city and country else (city or country)
current_bid = lot_data.get('currentBid') or lot_data.get('highestBid') or lot_data.get('startingBid')
if current_bid is None or current_bid == 0:
bidding = lot_data.get('bidding', {})
current_bid = bidding.get('currentBid') or bidding.get('amount')
current_bid_str = self.format_currency(current_bid) if current_bid and current_bid > 0 else "No bids"
bid_count = lot_data.get('bidCount', 0)
if bid_count == 0:
bid_count = lot_data.get('bidding', {}).get('bidCount', 0)
description = lot_data.get('description', {})
if isinstance(description, dict):
description = description.get('description', '')
else:
description = str(description)
category = lot_data.get('category', {})
category_name = category.get('name', '') if isinstance(category, dict) else ''
return {
'type': 'lot',
'lot_id': lot_data.get('displayId', ''),
'auction_id': lot_data.get('auctionId', ''),
'url': url,
'title': lot_data.get('title', ''),
'current_bid': current_bid_str,
'bid_count': bid_count,
'closing_time': self.format_timestamp(lot_data.get('endDate', '')),
'viewing_time': self._extract_viewing_time(lot_data),
'pickup_date': self._extract_pickup_date(lot_data),
'location': location,
'description': description,
'category': category_name,
'images': self._extract_images_from_json(lot_data),
'scraped_at': datetime.now().isoformat()
}
def _parse_auction_json(self, auction_data: Dict, url: str) -> Optional[Dict]:
"""Parse auction data from JSON"""
is_auction = 'lots' in auction_data and isinstance(auction_data['lots'], list)
is_lot = 'lotNumber' in auction_data or 'currentBid' in auction_data
if is_auction:
lots = auction_data.get('lots', [])
first_lot_closing = None
if lots:
first_lot_closing = self.format_timestamp(lots[0].get('endDate', ''))
return {
'type': 'auction',
'auction_id': auction_data.get('displayId', ''),
'url': url,
'title': auction_data.get('name', ''),
'location': self._extract_location_from_json(auction_data),
'lots_count': len(lots),
'first_lot_closing_time': first_lot_closing or self.format_timestamp(auction_data.get('minEndDate', '')),
'scraped_at': datetime.now().isoformat(),
'lots': lots
}
elif is_lot:
return self._parse_lot_json(auction_data, url)
return None
def _extract_viewing_time(self, auction_data: Dict) -> str:
"""Extract viewing time from auction data"""
viewing_days = auction_data.get('viewingDays', [])
if viewing_days:
first = viewing_days[0]
start = self.format_timestamp(first.get('startDate', ''))
end = self.format_timestamp(first.get('endDate', ''))
if start and end:
return f"{start} - {end}"
return start or end
return ''
def _extract_pickup_date(self, auction_data: Dict) -> str:
"""Extract pickup date from auction data"""
collection_days = auction_data.get('collectionDays', [])
if collection_days:
first = collection_days[0]
start = self.format_timestamp(first.get('startDate', ''))
end = self.format_timestamp(first.get('endDate', ''))
if start and end:
return f"{start} - {end}"
return start or end
return ''
def _extract_images_from_json(self, auction_data: Dict) -> List[str]:
"""Extract all image URLs from auction data"""
images = []
if auction_data.get('image', {}).get('url'):
images.append(auction_data['image']['url'])
if isinstance(auction_data.get('images'), list):
for img in auction_data['images']:
if isinstance(img, dict) and img.get('url'):
images.append(img['url'])
elif isinstance(img, str):
images.append(img)
return images
def _extract_location_from_json(self, auction_data: Dict) -> str:
"""Extract location from auction JSON data"""
for days in [auction_data.get('viewingDays', []), auction_data.get('collectionDays', [])]:
if days:
first_location = days[0]
city = first_location.get('city', '')
country = first_location.get('countryCode', '').upper()
if city:
return f"{city}, {country}" if country else city
return ''
def _extract_meta_content(self, content: str, property_name: str) -> str:
"""Extract content from meta tags"""
pattern = rf'<meta[^>]*property=["\']{property_name}["\'][^>]*content=["\']([^"\']+)["\']'
match = re.search(pattern, content, re.IGNORECASE)
return self.clean_text(match.group(1)) if match else ""
def _extract_current_bid(self, content: str) -> str:
"""Extract current bid amount"""
patterns = [
r'"currentBid"\s*:\s*"([^"]+)"',
r'"currentBid"\s*:\s*(\d+(?:\.\d+)?)',
r'(?:Current bid|Huidig bod)[:\s]*</?\w*>\s*(€[\d,.\s]+)',
r'(?:Current bid|Huidig bod)[:\s]+(€[\d,.\s]+)',
]
for pattern in patterns:
match = re.search(pattern, content, re.IGNORECASE)
if match:
bid = match.group(1).strip()
if bid and bid.lower() not in ['huidig bod', 'current bid']:
if not bid.startswith('€'):
    bid = f"€{bid}"
return bid
return "€0"
def _extract_bid_count(self, content: str) -> int:
"""Extract number of bids"""
match = re.search(r'(\d+)\s*bids?', content, re.IGNORECASE)
if match:
try:
return int(match.group(1))
except ValueError:
pass
return 0
def _extract_end_date(self, content: str) -> str:
"""Extract auction end date"""
patterns = [
r'Ends?[:\s]+([A-Za-z0-9,:\s]+)',
r'endTime["\']:\s*["\']([^"\']+)["\']',
]
for pattern in patterns:
match = re.search(pattern, content, re.IGNORECASE)
if match:
return match.group(1).strip()
return ""
def _extract_location(self, content: str) -> str:
"""Extract location"""
patterns = [
r'(?:Location|Locatie)[:\s]*</?\w*>\s*([A-Za-zÀ-ÿ0-9\s,.-]+?)(?:<|$)',
r'(?:Location|Locatie)[:\s]+([A-Za-zÀ-ÿ0-9\s,.-]+?)(?:<br|</|$)',
]
for pattern in patterns:
match = re.search(pattern, content, re.IGNORECASE)
if match:
location = self.clean_text(match.group(1))
if location.lower() not in ['locatie', 'location', 'huidig bod', 'current bid']:
location = re.sub(r'[,.\s]+$', '', location)
if len(location) > 2:
return location
return ""
def _extract_description(self, content: str) -> str:
"""Extract description"""
pattern = r'<meta[^>]*name=["\']description["\'][^>]*content=["\']([^"\']+)["\']'
match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
return self.clean_text(match.group(1))[:500] if match else ""
def _extract_category(self, content: str) -> str:
"""Extract category from breadcrumb or meta tags"""
pattern = r'class="breadcrumb[^"]*".*?>([A-Za-z\s]+)</a>'
match = re.search(pattern, content, re.IGNORECASE)
if match:
return self.clean_text(match.group(1))
return self._extract_meta_content(content, 'category')
def _extract_images(self, content: str) -> List[str]:
"""Extract image URLs"""
pattern = r'<img[^>]*src=["\']([^"\']+\.jpe?g|[^"\']+\.png)["\'][^>]*>'
matches = re.findall(pattern, content, re.IGNORECASE)
images = []
for match in matches:
if any(skip in match.lower() for skip in ['logo', 'icon', 'placeholder', 'banner']):
continue
full_url = urljoin(BASE_URL, match)
images.append(full_url)
return images[:5] # Limit to 5 images
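
A minimal parsing sketch (not part of this commit); the __NEXT_DATA__ payload below is a hand-made stand-in for a real lot page:

from parse import DataParser

parser = DataParser()
# Hand-made stand-in; real pages embed a much larger payload.
page = ('<script id="__NEXT_DATA__" type="application/json">'
        '{"props": {"pageProps": {"lot": {"displayId": "A1-123", '
        '"title": "Example lot", "endDate": 1767225600}}}}</script>')
data = parser.parse_page(page, "https://www.troostwijkauctions.com/l/example-lot")
if data:
    print(data['type'], data['lot_id'], data['title'], data['closing_time'])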

src/scraper.py (new file, 279 lines)

@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""
Core scraper module for Scaev Auctions
"""
import sqlite3
import asyncio
import time
import random
import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin
from playwright.async_api import async_playwright, Page
from config import (
BASE_URL, RATE_LIMIT_SECONDS, MAX_PAGES, DOWNLOAD_IMAGES, IMAGES_DIR
)
from cache import CacheManager
from parse import DataParser
class TroostwijkScraper:
"""Main scraper class for Troostwijk Auctions"""
def __init__(self):
self.base_url = BASE_URL
self.cache = CacheManager()
self.parser = DataParser()
self.visited_lots: Set[str] = set()
self.last_request_time = 0
self.download_images = DOWNLOAD_IMAGES
async def _download_image(self, url: str, lot_id: str, index: int) -> Optional[str]:
"""Download an image and save it locally"""
if not self.download_images:
return None
try:
import aiohttp
lot_dir = Path(IMAGES_DIR) / lot_id
lot_dir.mkdir(exist_ok=True)
ext = url.split('.')[-1].split('?')[0]
if ext not in ['jpg', 'jpeg', 'png', 'gif', 'webp']:
ext = 'jpg'
filepath = lot_dir / f"{index:03d}.{ext}"
if filepath.exists():
return str(filepath)
await self._rate_limit()
async with aiohttp.ClientSession() as session:
async with session.get(url, timeout=30) as response:
if response.status == 200:
content = await response.read()
with open(filepath, 'wb') as f:
f.write(content)
with sqlite3.connect(self.cache.db_path) as conn:
conn.execute("UPDATE images\n"
"SET local_path = ?, downloaded = 1\n"
"WHERE lot_id = ? AND url = ?\n"
"", (str(filepath), lot_id, url))
conn.commit()
return str(filepath)
except Exception as e:
print(f" ERROR downloading image: {e}")
return None
async def _rate_limit(self):
"""ENSURE EXACTLY 0.5s BETWEEN REQUESTS"""
current_time = time.time()
time_since_last = current_time - self.last_request_time
if time_since_last < RATE_LIMIT_SECONDS:
await asyncio.sleep(RATE_LIMIT_SECONDS - time_since_last)
self.last_request_time = time.time()
async def _get_page(self, page: Page, url: str, use_cache: bool = True) -> Optional[str]:
"""Get page content with caching and strict rate limiting"""
if use_cache:
cached = self.cache.get(url)
if cached:
print(f" CACHE HIT: {url}")
return cached['content']
await self._rate_limit()
try:
print(f" FETCHING: {url}")
await page.goto(url, wait_until='networkidle', timeout=30000)
await asyncio.sleep(random.uniform(0.3, 0.7))
content = await page.content()
self.cache.set(url, content, 200)
return content
except Exception as e:
print(f" ERROR: {e}")
self.cache.set(url, "", 500)
return None
def _extract_auction_urls_from_listing(self, content: str) -> List[str]:
"""Extract auction URLs from listing page"""
pattern = r'href=["\']([/]a/[^"\']+)["\']'
matches = re.findall(pattern, content, re.IGNORECASE)
return list(set(urljoin(self.base_url, match) for match in matches))
def _extract_lot_urls_from_auction(self, content: str, auction_url: str) -> List[str]:
"""Extract lot URLs from an auction page"""
# Try Next.js data first
try:
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
if match:
data = json.loads(match.group(1))
lots = data.get('props', {}).get('pageProps', {}).get('auction', {}).get('lots', [])
if lots:
return list(set(f"{self.base_url}/l/{lot.get('urlSlug', '')}"
for lot in lots if lot.get('urlSlug')))
except Exception:
pass
# Fallback to HTML parsing
pattern = r'href=["\']([/]l/[^"\']+)["\']'
matches = re.findall(pattern, content, re.IGNORECASE)
return list(set(urljoin(self.base_url, match) for match in matches))
async def crawl_listing_page(self, page: Page, page_num: int) -> List[str]:
"""Crawl a single listing page and return auction URLs"""
url = f"{self.base_url}/auctions?page={page_num}"
print(f"\n{'='*60}")
print(f"LISTING PAGE {page_num}: {url}")
print(f"{'='*60}")
content = await self._get_page(page, url)
if not content:
return []
auction_urls = self._extract_auction_urls_from_listing(content)
print(f"→ Found {len(auction_urls)} auction URLs")
return auction_urls
async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]:
"""Crawl an auction page and extract lot URLs"""
content = await self._get_page(page, auction_url)
if not content:
return []
page_data = self.parser.parse_page(content, auction_url)
if page_data and page_data.get('type') == 'auction':
self.cache.save_auction(page_data)
print(f" → Auction: {page_data.get('title', '')[:50]}... ({page_data.get('lots_count', 0)} lots)")
return self._extract_lot_urls_from_auction(content, auction_url)
async def crawl_page(self, page: Page, url: str) -> Optional[Dict]:
"""Crawl a page (auction or lot)"""
if url in self.visited_lots:
print(f" → Skipping (already visited): {url}")
return None
page_id = self.parser.extract_lot_id(url)
print(f"\n[PAGE {page_id}]")
content = await self._get_page(page, url)
if not content:
return None
page_data = self.parser.parse_page(content, url)
if not page_data:
return None
self.visited_lots.add(url)
if page_data.get('type') == 'auction':
print(f" → Type: AUCTION")
print(f" → Title: {page_data.get('title', 'N/A')[:60]}...")
print(f" → Location: {page_data.get('location', 'N/A')}")
print(f" → Lots: {page_data.get('lots_count', 0)}")
self.cache.save_auction(page_data)
elif page_data.get('type') == 'lot':
print(f" → Type: LOT")
print(f" → Title: {page_data.get('title', 'N/A')[:60]}...")
print(f" → Bid: {page_data.get('current_bid', 'N/A')}")
print(f" → Location: {page_data.get('location', 'N/A')}")
self.cache.save_lot(page_data)
images = page_data.get('images', [])
if images:
self.cache.save_images(page_data['lot_id'], images)
print(f" → Images: {len(images)}")
if self.download_images:
for i, img_url in enumerate(images):
local_path = await self._download_image(img_url, page_data['lot_id'], i)
if local_path:
print(f" ✓ Downloaded: {Path(local_path).name}")
return page_data
async def crawl_auctions(self, max_pages: int = MAX_PAGES) -> List[Dict]:
"""Main crawl function"""
async with async_playwright() as p:
print("Launching browser...")
browser = await p.chromium.launch(
headless=True,
args=[
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-blink-features=AutomationControlled'
]
)
page = await browser.new_page(
viewport={'width': 1920, 'height': 1080},
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
)
await page.set_extra_http_headers({
'Accept-Language': 'en-US,en;q=0.9',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
})
all_auction_urls = []
all_lot_urls = []
# Phase 1: Collect auction URLs
print("\n" + "="*60)
print("PHASE 1: COLLECTING AUCTION URLs FROM LISTING PAGES")
print("="*60)
for page_num in range(1, max_pages + 1):
auction_urls = await self.crawl_listing_page(page, page_num)
if not auction_urls:
print(f"No auctions found on page {page_num}, stopping")
break
all_auction_urls.extend(auction_urls)
print(f" → Total auctions collected so far: {len(all_auction_urls)}")
all_auction_urls = list(set(all_auction_urls))
print(f"\n{'='*60}")
print(f"PHASE 1 COMPLETE: {len(all_auction_urls)} UNIQUE AUCTIONS")
print(f"{'='*60}")
# Phase 2: Extract lot URLs from each auction
print("\n" + "="*60)
print("PHASE 2: EXTRACTING LOT URLs FROM AUCTIONS")
print("="*60)
for i, auction_url in enumerate(all_auction_urls):
print(f"\n[{i+1:>3}/{len(all_auction_urls)}] {self.parser.extract_lot_id(auction_url)}")
lot_urls = await self.crawl_auction_for_lots(page, auction_url)
if lot_urls:
all_lot_urls.extend(lot_urls)
print(f" → Found {len(lot_urls)} lots")
all_lot_urls = list(set(all_lot_urls))
print(f"\n{'='*60}")
print(f"PHASE 2 COMPLETE: {len(all_lot_urls)} UNIQUE LOTS")
print(f"{'='*60}")
# Phase 3: Scrape each lot page
print("\n" + "="*60)
print("PHASE 3: SCRAPING INDIVIDUAL LOT PAGES")
print("="*60)
results = []
for i, lot_url in enumerate(all_lot_urls):
print(f"\n[{i+1:>3}/{len(all_lot_urls)}] ", end="")
page_data = await self.crawl_page(page, lot_url)
if page_data:
results.append(page_data)
await browser.close()
return results
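
A standalone run sketch (not part of this commit), assuming Playwright and its Chromium build are installed (pip install playwright, then playwright install chromium):

import asyncio
from scraper import TroostwijkScraper

scraper = TroostwijkScraper()
# One listing page only; results are the dicts produced by DataParser.parse_page().
results = asyncio.run(scraper.crawl_auctions(max_pages=1))
for item in results[:5]:
    print(item.get('type'), item.get('title', '')[:60])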

src/test.py (new file, 142 lines)

@@ -0,0 +1,142 @@
#!/usr/bin/env python3
"""
Test module for debugging extraction patterns
"""
import sys
import sqlite3
import time
import re
import json
from datetime import datetime
from pathlib import Path
from typing import Optional
import config
from cache import CacheManager
from scraper import TroostwijkScraper
def test_extraction(
test_url: str = "https://www.troostwijkauctions.com/a/machines-en-toebehoren-%28hout-en-kunststofverwerking-handlingapparatuur-bouwmachines-landbouwindustrie%29-oost-europa-december-A7-35847"):
"""Test extraction on a specific cached URL to debug patterns"""
scraper = TroostwijkScraper()
# Try to get from cache
cached = scraper.cache.get(test_url)
if not cached:
print(f"ERROR: URL not found in cache: {test_url}")
print(f"\nAvailable cached URLs:")
with sqlite3.connect(config.CACHE_DB) as conn:
cursor = conn.execute("SELECT url FROM cache ORDER BY timestamp DESC LIMIT 10")
for row in cursor.fetchall():
print(f" - {row[0]}")
return
content = cached['content']
print(f"\n{'=' * 60}")
print(f"TESTING EXTRACTION FROM: {test_url}")
print(f"{'=' * 60}")
print(f"Content length: {len(content)} chars")
print(f"Cache age: {(time.time() - cached['timestamp']) / 3600:.1f} hours")
# Test each extraction method
page_data = scraper.parser.parse_page(content, test_url)
print(f"\n{'=' * 60}")
print("EXTRACTED DATA:")
print(f"{'=' * 60}")
if not page_data:
print("ERROR: No data extracted!")
return
print(f"Page Type: {page_data.get('type', 'UNKNOWN')}")
print()
for key, value in page_data.items():
if key == 'images':
print(f"{key:.<20}: {len(value)} images")
for img in value[:3]:
print(f"{'':.<20} - {img}")
elif key == 'lots':
print(f"{key:.<20}: {len(value)} lots in auction")
else:
display_value = str(value)[:100] if value else "(empty)"
# Handle Unicode characters that Windows console can't display
try:
print(f"{key:.<20}: {display_value}")
except UnicodeEncodeError:
safe_value = display_value.encode('ascii', 'replace').decode('ascii')
print(f"{key:.<20}: {safe_value}")
# Validation checks
print(f"\n{'=' * 60}")
print("VALIDATION CHECKS:")
print(f"{'=' * 60}")
issues = []
if page_data.get('type') == 'lot':
if page_data.get('current_bid') in ['Huidig bod', 'Current bid', '€0', '']:
issues.append("[!] Current bid not extracted correctly")
else:
print("[OK] Current bid looks valid:", page_data.get('current_bid'))
if page_data.get('location') in ['Locatie', 'Location', '']:
issues.append("[!] Location not extracted correctly")
else:
print("[OK] Location looks valid:", page_data.get('location'))
if page_data.get('title') in ['', '...']:
issues.append("[!] Title not extracted correctly")
else:
print("[OK] Title looks valid:", page_data.get('title', '')[:50])
if issues:
print(f"\n[ISSUES FOUND]")
for issue in issues:
print(f" {issue}")
else:
print(f"\n[ALL FIELDS EXTRACTED SUCCESSFULLY!]")
# Debug: Show raw HTML snippets for problematic fields
print(f"\n{'=' * 60}")
print("DEBUG: RAW HTML SNIPPETS")
print(f"{'=' * 60}")
# Look for bid-related content
print(f"\n1. Bid patterns in content:")
bid_matches = re.findall(r'.{0,50}(€[\d,.\s]+).{0,50}', content[:10000])
for i, match in enumerate(bid_matches[:5], 1):
print(f" {i}. {match}")
# Look for location content
print(f"\n2. Location patterns in content:")
loc_matches = re.findall(r'.{0,30}(Locatie|Location).{0,100}', content, re.IGNORECASE)
for i, match in enumerate(loc_matches[:5], 1):
print(f" {i}. ...{match}...")
# Look for JSON data
print(f"\n3. JSON/Script data containing auction info:")
json_patterns = [
r'"currentBid"[^,}]+',
r'"location"[^,}]+',
r'"price"[^,}]+',
r'"addressLocality"[^,}]+'
]
for pattern in json_patterns:
matches = re.findall(pattern, content[:50000], re.IGNORECASE)
if matches:
print(f" {pattern}: {matches[:3]}")
# Look for script tags with structured data
script_matches = re.findall(r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', content, re.DOTALL)
if script_matches:
print(f"\n4. Structured data (JSON-LD) found:")
for i, script in enumerate(script_matches[:2], 1):
try:
data = json.loads(script)
print(f" Script {i}: {json.dumps(data, indent=6)[:500]}...")
except Exception:
print(f" Script {i}: {script[:300]}...")