first

src/cache.py (new file, 178 lines)
@@ -0,0 +1,178 @@
#!/usr/bin/env python3
"""
Cache Manager module for SQLite-based caching and data storage
"""

import sqlite3
import time
import zlib
from datetime import datetime
from typing import Dict, List, Optional

import config


class CacheManager:
    """Manages page caching and data storage using SQLite"""

    def __init__(self, db_path: Optional[str] = None):
        self.db_path = db_path or config.CACHE_DB
        self._init_db()

    def _init_db(self):
        """Initialize cache and data storage database"""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS cache (
                    url TEXT PRIMARY KEY,
                    content BLOB,
                    timestamp REAL,
                    status_code INTEGER
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_timestamp ON cache(timestamp)
            """)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS auctions (
                    auction_id TEXT PRIMARY KEY,
                    url TEXT UNIQUE,
                    title TEXT,
                    location TEXT,
                    lots_count INTEGER,
                    first_lot_closing_time TEXT,
                    scraped_at TEXT
                )
            """)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS lots (
                    lot_id TEXT PRIMARY KEY,
                    auction_id TEXT,
                    url TEXT UNIQUE,
                    title TEXT,
                    current_bid TEXT,
                    bid_count INTEGER,
                    closing_time TEXT,
                    viewing_time TEXT,
                    pickup_date TEXT,
                    location TEXT,
                    description TEXT,
                    category TEXT,
                    scraped_at TEXT,
                    FOREIGN KEY (auction_id) REFERENCES auctions(auction_id)
                )
            """)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS images (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    lot_id TEXT,
                    url TEXT,
                    local_path TEXT,
                    downloaded INTEGER DEFAULT 0,
                    FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
                )
            """)
            conn.commit()

    def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
        """Get cached page if it exists and is not too old"""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                "SELECT content, timestamp, status_code FROM cache WHERE url = ?",
                (url,)
            )
            row = cursor.fetchone()

            if row:
                content, timestamp, status_code = row
                age_hours = (time.time() - timestamp) / 3600

                if age_hours <= max_age_hours:
                    try:
                        content = zlib.decompress(content).decode('utf-8')
                    except Exception as e:
                        print(f"    ⚠️ Failed to decompress cache for {url}: {e}")
                        return None

                    return {
                        'content': content,
                        'timestamp': timestamp,
                        'status_code': status_code,
                        'cached': True
                    }
            return None

    def set(self, url: str, content: str, status_code: int = 200):
        """Cache a page with compression"""
        with sqlite3.connect(self.db_path) as conn:
            compressed_content = zlib.compress(content.encode('utf-8'), level=9)
            original_size = len(content.encode('utf-8'))
            compressed_size = len(compressed_content)
            ratio = (1 - compressed_size / original_size) * 100 if original_size > 0 else 0

            conn.execute(
                "INSERT OR REPLACE INTO cache (url, content, timestamp, status_code) VALUES (?, ?, ?, ?)",
                (url, compressed_content, time.time(), status_code)
            )
            conn.commit()
            print(f"    → Cached: {url} (compressed {ratio:.1f}%)")

    def clear_old(self, max_age_hours: int = 168):
        """Clear old cache entries to prevent database bloat"""
        cutoff_time = time.time() - (max_age_hours * 3600)
        with sqlite3.connect(self.db_path) as conn:
            deleted = conn.execute("DELETE FROM cache WHERE timestamp < ?", (cutoff_time,)).rowcount
            conn.commit()
            if deleted > 0:
                print(f"    → Cleared {deleted} old cache entries")

    def save_auction(self, auction_data: Dict):
        """Save auction data to database"""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                INSERT OR REPLACE INTO auctions
                (auction_id, url, title, location, lots_count, first_lot_closing_time, scraped_at)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            """, (
                auction_data['auction_id'],
                auction_data['url'],
                auction_data['title'],
                auction_data['location'],
                auction_data.get('lots_count', 0),
                auction_data.get('first_lot_closing_time', ''),
                auction_data['scraped_at']
            ))
            conn.commit()

    def save_lot(self, lot_data: Dict):
        """Save lot data to database"""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                INSERT OR REPLACE INTO lots
                (lot_id, auction_id, url, title, current_bid, bid_count, closing_time,
                 viewing_time, pickup_date, location, description, category, scraped_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                lot_data['lot_id'],
                lot_data.get('auction_id', ''),
                lot_data['url'],
                lot_data['title'],
                lot_data.get('current_bid', ''),
                lot_data.get('bid_count', 0),
                lot_data.get('closing_time', ''),
                lot_data.get('viewing_time', ''),
                lot_data.get('pickup_date', ''),
                lot_data.get('location', ''),
                lot_data.get('description', ''),
                lot_data.get('category', ''),
                lot_data['scraped_at']
            ))
            conn.commit()

    def save_images(self, lot_id: str, image_urls: List[str]):
        """Save image URLs for a lot"""
        with sqlite3.connect(self.db_path) as conn:
            for url in image_urls:
                conn.execute("""
                    INSERT OR IGNORE INTO images (lot_id, url) VALUES (?, ?)
                """, (lot_id, url))
            conn.commit()
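A minimal usage sketch of CacheManager (illustrative, not part of this commit; the /tmp path is arbitrary, and importing cache pulls in config, which creates the configured output directories):

from cache import CacheManager

# Cache one page and read it back while it is still fresh.
cache = CacheManager(db_path="/tmp/cache-demo.db")
cache.set("https://www.troostwijkauctions.com/l/example-lot", "<html>lot page</html>")

hit = cache.get("https://www.troostwijkauctions.com/l/example-lot", max_age_hours=24)
if hit:
    print(hit['status_code'], len(hit['content']))  # content comes back decompressed

Entries older than max_age_hours are ignored by get() and eventually removed by clear_old().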
src/config.py (new file, 26 lines)
@@ -0,0 +1,26 @@
#!/usr/bin/env python3
"""
Configuration module for Scaev Auctions Scraper
"""

import sys
from pathlib import Path

# Require Python 3.10+
if sys.version_info < (3, 10):
    print("ERROR: This script requires Python 3.10 or higher")
    print(f"Current version: {sys.version}")
    sys.exit(1)

# ==================== CONFIGURATION ====================
BASE_URL = "https://www.troostwijkauctions.com"
CACHE_DB = "/mnt/okcomputer/output/cache.db"
OUTPUT_DIR = "/mnt/okcomputer/output"
IMAGES_DIR = "/mnt/okcomputer/output/images"
RATE_LIMIT_SECONDS = 0.5   # EXACTLY 0.5 seconds between requests
MAX_PAGES = 50             # Number of listing pages to crawl
DOWNLOAD_IMAGES = False    # Set to True to download images

# Setup directories
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
Path(IMAGES_DIR).mkdir(parents=True, exist_ok=True)
src/main.py (new file, 81 lines)
@@ -0,0 +1,81 @@
#!/usr/bin/env python3
"""
Troostwijk Auctions Scraper - Main Entry Point
Focuses on extracting auction lots with caching and rate limiting
"""

import sys
import asyncio
import json
import csv
import sqlite3
from datetime import datetime
from pathlib import Path

import config
from cache import CacheManager
from scraper import TroostwijkScraper


def main():
    """Main execution"""
    # Check for test mode
    if len(sys.argv) > 1 and sys.argv[1] == "--test":
        # Import test function only when needed to avoid circular imports
        from test import test_extraction
        test_url = sys.argv[2] if len(sys.argv) > 2 else None
        if test_url:
            test_extraction(test_url)
        else:
            test_extraction()
        return

    print("Troostwijk Auctions Scraper")
    print("=" * 60)
    print(f"Rate limit: {config.RATE_LIMIT_SECONDS} seconds BETWEEN EVERY REQUEST")
    print(f"Cache database: {config.CACHE_DB}")
    print(f"Output directory: {config.OUTPUT_DIR}")
    print(f"Max listing pages: {config.MAX_PAGES}")
    print("=" * 60)

    scraper = TroostwijkScraper()

    try:
        # Clear old cache (older than 7 days) - KEEP DATABASE CLEAN
        scraper.cache.clear_old(max_age_hours=168)

        # Run the crawler
        results = asyncio.run(scraper.crawl_auctions(max_pages=config.MAX_PAGES))

        # Export results to files
        print("\n" + "=" * 60)
        print("EXPORTING RESULTS TO FILES")
        print("=" * 60)

        files = scraper.export_to_files()

        print("\n" + "=" * 60)
        print("CRAWLING COMPLETED SUCCESSFULLY")
        print("=" * 60)
        print(f"Total pages scraped: {len(results)}")
        print(f"\nAuctions JSON: {files['auctions_json']}")
        print(f"Auctions CSV: {files['auctions_csv']}")
        print(f"Lots JSON: {files['lots_json']}")
        print(f"Lots CSV: {files['lots_csv']}")

        # Count auctions vs lots
        auctions = [r for r in results if r.get('type') == 'auction']
        lots = [r for r in results if r.get('type') == 'lot']
        print(f"\n  Auctions: {len(auctions)}")
        print(f"  Lots: {len(lots)}")

    except KeyboardInterrupt:
        print("\nScraping interrupted by user - partial results saved in output directory")
    except Exception as e:
        print(f"\nERROR during scraping: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
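Note: main() calls scraper.export_to_files(), which is not defined in any file of this commit, so a full run would fail with AttributeError after crawling unless the exporter arrives in a later change. A hypothetical sketch of such an exporter (names and behavior assumed, not the author's implementation), reading the auctions and lots tables that CacheManager writes and returning the four paths main() prints:

import csv
import json
import sqlite3
from pathlib import Path

import config

def export_to_files(db_path: str = config.CACHE_DB, output_dir: str = config.OUTPUT_DIR) -> dict:
    """Hypothetical: dump the auctions and lots tables to JSON and CSV."""
    out = Path(output_dir)
    paths = {}
    with sqlite3.connect(db_path) as conn:
        conn.row_factory = sqlite3.Row
        for table in ("auctions", "lots"):
            rows = [dict(r) for r in conn.execute(f"SELECT * FROM {table}")]
            json_path = out / f"{table}.json"
            csv_path = out / f"{table}.csv"
            json_path.write_text(json.dumps(rows, indent=2, ensure_ascii=False), encoding="utf-8")
            with open(csv_path, "w", newline="", encoding="utf-8") as f:
                if rows:
                    writer = csv.DictWriter(f, fieldnames=rows[0].keys())
                    writer.writeheader()
                    writer.writerows(rows)
            paths[f"{table}_json"] = str(json_path)
            paths[f"{table}_csv"] = str(csv_path)
    return paths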
src/parse.py (new file, 303 lines)
@@ -0,0 +1,303 @@
#!/usr/bin/env python3
"""
Parser module for extracting data from HTML/JSON content
"""
import json
import re
import html
from datetime import datetime
from urllib.parse import urljoin, urlparse
from typing import Dict, List, Optional

from config import BASE_URL


class DataParser:
    """Handles all data extraction from HTML/JSON content"""

    @staticmethod
    def extract_lot_id(url: str) -> str:
        """Extract lot ID from URL"""
        path = urlparse(url).path
        match = re.search(r'/lots/(\d+)', path)
        if match:
            return match.group(1)
        match = re.search(r'/a/.*?([A-Z]\d+-\d+)', path)
        if match:
            return match.group(1)
        return path.split('/')[-1] if path else ""

    @staticmethod
    def clean_text(text: str) -> str:
        """Clean extracted text"""
        text = html.unescape(text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    @staticmethod
    def format_timestamp(timestamp) -> str:
        """Convert Unix timestamp to readable date"""
        try:
            if isinstance(timestamp, (int, float)) and timestamp > 0:
                return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
            return str(timestamp) if timestamp else ''
        except Exception:
            return str(timestamp) if timestamp else ''

    @staticmethod
    def format_currency(amount) -> str:
        """Format currency amount"""
        if isinstance(amount, (int, float)):
            return f"€{amount:,.2f}" if amount > 0 else "€0"
        return str(amount) if amount else "€0"

    def parse_page(self, content: str, url: str) -> Optional[Dict]:
        """Parse page and determine if it's an auction or lot"""
        next_data = self._extract_nextjs_data(content, url)
        if next_data:
            return next_data

        content = re.sub(r'\s+', ' ', content)
        return {
            'type': 'lot',
            'url': url,
            'lot_id': self.extract_lot_id(url),
            'title': self._extract_meta_content(content, 'og:title'),
            'current_bid': self._extract_current_bid(content),
            'bid_count': self._extract_bid_count(content),
            'closing_time': self._extract_end_date(content),
            'location': self._extract_location(content),
            'description': self._extract_description(content),
            'category': self._extract_category(content),
            'images': self._extract_images(content),
            'scraped_at': datetime.now().isoformat()
        }

    def _extract_nextjs_data(self, content: str, url: str) -> Optional[Dict]:
        """Extract data from Next.js __NEXT_DATA__ JSON"""
        try:
            match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
            if not match:
                return None

            data = json.loads(match.group(1))
            page_props = data.get('props', {}).get('pageProps', {})

            if 'lot' in page_props:
                return self._parse_lot_json(page_props.get('lot', {}), url)
            if 'auction' in page_props:
                return self._parse_auction_json(page_props.get('auction', {}), url)
            return None

        except Exception as e:
            print(f"    → Error parsing __NEXT_DATA__: {e}")
            return None

    def _parse_lot_json(self, lot_data: Dict, url: str) -> Dict:
        """Parse lot data from JSON"""
        location_data = lot_data.get('location', {})
        city = location_data.get('city', '')
        country = location_data.get('countryCode', '').upper()
        location = f"{city}, {country}" if city and country else (city or country)

        current_bid = lot_data.get('currentBid') or lot_data.get('highestBid') or lot_data.get('startingBid')
        if current_bid is None or current_bid == 0:
            bidding = lot_data.get('bidding', {})
            current_bid = bidding.get('currentBid') or bidding.get('amount')

        current_bid_str = self.format_currency(current_bid) if current_bid and current_bid > 0 else "No bids"

        bid_count = lot_data.get('bidCount', 0)
        if bid_count == 0:
            bid_count = lot_data.get('bidding', {}).get('bidCount', 0)

        description = lot_data.get('description', {})
        if isinstance(description, dict):
            description = description.get('description', '')
        else:
            description = str(description)

        category = lot_data.get('category', {})
        category_name = category.get('name', '') if isinstance(category, dict) else ''

        return {
            'type': 'lot',
            'lot_id': lot_data.get('displayId', ''),
            'auction_id': lot_data.get('auctionId', ''),
            'url': url,
            'title': lot_data.get('title', ''),
            'current_bid': current_bid_str,
            'bid_count': bid_count,
            'closing_time': self.format_timestamp(lot_data.get('endDate', '')),
            'viewing_time': self._extract_viewing_time(lot_data),
            'pickup_date': self._extract_pickup_date(lot_data),
            'location': location,
            'description': description,
            'category': category_name,
            'images': self._extract_images_from_json(lot_data),
            'scraped_at': datetime.now().isoformat()
        }

    def _parse_auction_json(self, auction_data: Dict, url: str) -> Optional[Dict]:
        """Parse auction data from JSON"""
        is_auction = 'lots' in auction_data and isinstance(auction_data['lots'], list)
        is_lot = 'lotNumber' in auction_data or 'currentBid' in auction_data

        if is_auction:
            lots = auction_data.get('lots', [])
            first_lot_closing = None
            if lots:
                first_lot_closing = self.format_timestamp(lots[0].get('endDate', ''))

            return {
                'type': 'auction',
                'auction_id': auction_data.get('displayId', ''),
                'url': url,
                'title': auction_data.get('name', ''),
                'location': self._extract_location_from_json(auction_data),
                'lots_count': len(lots),
                'first_lot_closing_time': first_lot_closing or self.format_timestamp(auction_data.get('minEndDate', '')),
                'scraped_at': datetime.now().isoformat(),
                'lots': lots
            }
        elif is_lot:
            return self._parse_lot_json(auction_data, url)
        return None

    def _extract_viewing_time(self, auction_data: Dict) -> str:
        """Extract viewing time from auction data"""
        viewing_days = auction_data.get('viewingDays', [])
        if viewing_days:
            first = viewing_days[0]
            start = self.format_timestamp(first.get('startDate', ''))
            end = self.format_timestamp(first.get('endDate', ''))
            if start and end:
                return f"{start} - {end}"
            return start or end
        return ''

    def _extract_pickup_date(self, auction_data: Dict) -> str:
        """Extract pickup date from auction data"""
        collection_days = auction_data.get('collectionDays', [])
        if collection_days:
            first = collection_days[0]
            start = self.format_timestamp(first.get('startDate', ''))
            end = self.format_timestamp(first.get('endDate', ''))
            if start and end:
                return f"{start} - {end}"
            return start or end
        return ''

    def _extract_images_from_json(self, auction_data: Dict) -> List[str]:
        """Extract all image URLs from auction data"""
        images = []
        if auction_data.get('image', {}).get('url'):
            images.append(auction_data['image']['url'])
        if isinstance(auction_data.get('images'), list):
            for img in auction_data['images']:
                if isinstance(img, dict) and img.get('url'):
                    images.append(img['url'])
                elif isinstance(img, str):
                    images.append(img)
        return images

    def _extract_location_from_json(self, auction_data: Dict) -> str:
        """Extract location from auction JSON data"""
        for days in [auction_data.get('viewingDays', []), auction_data.get('collectionDays', [])]:
            if days:
                first_location = days[0]
                city = first_location.get('city', '')
                country = first_location.get('countryCode', '').upper()
                if city:
                    return f"{city}, {country}" if country else city
        return ''

    def _extract_meta_content(self, content: str, property_name: str) -> str:
        """Extract content from meta tags"""
        pattern = rf'<meta[^>]*property=["\']{property_name}["\'][^>]*content=["\']([^"\']+)["\']'
        match = re.search(pattern, content, re.IGNORECASE)
        return self.clean_text(match.group(1)) if match else ""

    def _extract_current_bid(self, content: str) -> str:
        """Extract current bid amount"""
        patterns = [
            r'"currentBid"\s*:\s*"([^"]+)"',
            r'"currentBid"\s*:\s*(\d+(?:\.\d+)?)',
            r'(?:Current bid|Huidig bod)[:\s]*</?\w*>\s*(€[\d,.\s]+)',
            r'(?:Current bid|Huidig bod)[:\s]+(€[\d,.\s]+)',
        ]
        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                bid = match.group(1).strip()
                if bid and bid.lower() not in ['huidig bod', 'current bid']:
                    if not bid.startswith('€'):
                        bid = f"€{bid}"
                    return bid
        return "€0"

    def _extract_bid_count(self, content: str) -> int:
        """Extract number of bids"""
        match = re.search(r'(\d+)\s*bids?', content, re.IGNORECASE)
        if match:
            try:
                return int(match.group(1))
            except Exception:
                pass
        return 0

    def _extract_end_date(self, content: str) -> str:
        """Extract auction end date"""
        patterns = [
            r'Ends?[:\s]+([A-Za-z0-9,:\s]+)',
            r'endTime["\']:\s*["\']([^"\']+)["\']',
        ]
        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                return match.group(1).strip()
        return ""

    def _extract_location(self, content: str) -> str:
        """Extract location"""
        patterns = [
            r'(?:Location|Locatie)[:\s]*</?\w*>\s*([A-Za-zÀ-ÿ0-9\s,.-]+?)(?:<|$)',
            r'(?:Location|Locatie)[:\s]+([A-Za-zÀ-ÿ0-9\s,.-]+?)(?:<br|</|$)',
        ]
        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                location = self.clean_text(match.group(1))
                if location.lower() not in ['locatie', 'location', 'huidig bod', 'current bid']:
                    location = re.sub(r'[,.\s]+$', '', location)
                    if len(location) > 2:
                        return location
        return ""

    def _extract_description(self, content: str) -> str:
        """Extract description"""
        pattern = r'<meta[^>]*name=["\']description["\'][^>]*content=["\']([^"\']+)["\']'
        match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
        return self.clean_text(match.group(1))[:500] if match else ""

    def _extract_category(self, content: str) -> str:
        """Extract category from breadcrumb or meta tags"""
        pattern = r'class="breadcrumb[^"]*".*?>([A-Za-z\s]+)</a>'
        match = re.search(pattern, content, re.IGNORECASE)
        if match:
            return self.clean_text(match.group(1))
        return self._extract_meta_content(content, 'category')

    def _extract_images(self, content: str) -> List[str]:
        """Extract image URLs"""
        pattern = r'<img[^>]*src=["\']([^"\']+\.jpe?g|[^"\']+\.png)["\'][^>]*>'
        matches = re.findall(pattern, content, re.IGNORECASE)

        images = []
        for match in matches:
            if any(skip in match.lower() for skip in ['logo', 'icon', 'placeholder', 'banner']):
                continue
            full_url = urljoin(BASE_URL, match)
            images.append(full_url)

        return images[:5]  # Limit to 5 images
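A small usage sketch of DataParser (illustrative, not part of this commit) against a synthetic page carrying a __NEXT_DATA__ blob; the keys mirror what _parse_lot_json reads:

import json

from parse import DataParser

lot = {
    "displayId": "A1-123-45",
    "auctionId": "A1-123",
    "title": "Example lot",
    "currentBid": 150,
    "bidCount": 3,
    "endDate": 1735689600,
    "location": {"city": "Amsterdam", "countryCode": "nl"},
    "description": {"description": "Demo description"},
    "category": {"name": "Machinery"},
    "images": [{"url": "https://example.com/img1.jpg"}],
}
html_page = (
    '<html><body><script id="__NEXT_DATA__" type="application/json">'
    + json.dumps({"props": {"pageProps": {"lot": lot}}})
    + "</script></body></html>"
)

parser = DataParser()
data = parser.parse_page(html_page, "https://www.troostwijkauctions.com/l/example-lot")
print(data["type"], data["current_bid"], data["location"])  # lot €150.00 Amsterdam, NL

If no __NEXT_DATA__ script is present, parse_page falls back to the regex-based meta-tag extractors.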
src/scraper.py (new file, 279 lines)
@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""
Core scraper module for Scaev Auctions
"""
import sqlite3
import asyncio
import time
import random
import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin

from playwright.async_api import async_playwright, Page

from config import (
    BASE_URL, RATE_LIMIT_SECONDS, MAX_PAGES, DOWNLOAD_IMAGES, IMAGES_DIR
)
from cache import CacheManager
from parse import DataParser


class TroostwijkScraper:
    """Main scraper class for Troostwijk Auctions"""

    def __init__(self):
        self.base_url = BASE_URL
        self.cache = CacheManager()
        self.parser = DataParser()
        self.visited_lots: Set[str] = set()
        self.last_request_time = 0
        self.download_images = DOWNLOAD_IMAGES

    async def _download_image(self, url: str, lot_id: str, index: int) -> Optional[str]:
        """Download an image and save it locally"""
        if not self.download_images:
            return None

        try:
            import aiohttp
            lot_dir = Path(IMAGES_DIR) / lot_id
            lot_dir.mkdir(exist_ok=True)

            ext = url.split('.')[-1].split('?')[0]
            if ext not in ['jpg', 'jpeg', 'png', 'gif', 'webp']:
                ext = 'jpg'

            filepath = lot_dir / f"{index:03d}.{ext}"
            if filepath.exists():
                return str(filepath)

            await self._rate_limit()

            async with aiohttp.ClientSession() as session:
                async with session.get(url, timeout=30) as response:
                    if response.status == 200:
                        content = await response.read()
                        with open(filepath, 'wb') as f:
                            f.write(content)

                        with sqlite3.connect(self.cache.db_path) as conn:
                            conn.execute(
                                "UPDATE images "
                                "SET local_path = ?, downloaded = 1 "
                                "WHERE lot_id = ? AND url = ?",
                                (str(filepath), lot_id, url)
                            )
                            conn.commit()
                        return str(filepath)

        except Exception as e:
            print(f"      ERROR downloading image: {e}")
            return None

    async def _rate_limit(self):
        """ENSURE EXACTLY 0.5s BETWEEN REQUESTS"""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time

        if time_since_last < RATE_LIMIT_SECONDS:
            await asyncio.sleep(RATE_LIMIT_SECONDS - time_since_last)

        self.last_request_time = time.time()

    async def _get_page(self, page: Page, url: str, use_cache: bool = True) -> Optional[str]:
        """Get page content with caching and strict rate limiting"""
        if use_cache:
            cached = self.cache.get(url)
            if cached:
                print(f"    CACHE HIT: {url}")
                return cached['content']

        await self._rate_limit()

        try:
            print(f"    FETCHING: {url}")
            await page.goto(url, wait_until='networkidle', timeout=30000)
            await asyncio.sleep(random.uniform(0.3, 0.7))
            content = await page.content()
            self.cache.set(url, content, 200)
            return content

        except Exception as e:
            print(f"    ERROR: {e}")
            self.cache.set(url, "", 500)
            return None

    def _extract_auction_urls_from_listing(self, content: str) -> List[str]:
        """Extract auction URLs from listing page"""
        pattern = r'href=["\']([/]a/[^"\']+)["\']'
        matches = re.findall(pattern, content, re.IGNORECASE)
        return list(set(urljoin(self.base_url, match) for match in matches))

    def _extract_lot_urls_from_auction(self, content: str, auction_url: str) -> List[str]:
        """Extract lot URLs from an auction page"""
        # Try Next.js data first
        try:
            match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
            if match:
                data = json.loads(match.group(1))
                lots = data.get('props', {}).get('pageProps', {}).get('auction', {}).get('lots', [])
                if lots:
                    return list(set(f"{self.base_url}/l/{lot.get('urlSlug', '')}"
                                    for lot in lots if lot.get('urlSlug')))
        except Exception:
            pass

        # Fallback to HTML parsing
        pattern = r'href=["\']([/]l/[^"\']+)["\']'
        matches = re.findall(pattern, content, re.IGNORECASE)
        return list(set(urljoin(self.base_url, match) for match in matches))

    async def crawl_listing_page(self, page: Page, page_num: int) -> List[str]:
        """Crawl a single listing page and return auction URLs"""
        url = f"{self.base_url}/auctions?page={page_num}"
        print(f"\n{'='*60}")
        print(f"LISTING PAGE {page_num}: {url}")
        print(f"{'='*60}")

        content = await self._get_page(page, url)
        if not content:
            return []

        auction_urls = self._extract_auction_urls_from_listing(content)
        print(f"→ Found {len(auction_urls)} auction URLs")
        return auction_urls

    async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]:
        """Crawl an auction page and extract lot URLs"""
        content = await self._get_page(page, auction_url)
        if not content:
            return []

        page_data = self.parser.parse_page(content, auction_url)
        if page_data and page_data.get('type') == 'auction':
            self.cache.save_auction(page_data)
            print(f"    → Auction: {page_data.get('title', '')[:50]}... ({page_data.get('lots_count', 0)} lots)")

        return self._extract_lot_urls_from_auction(content, auction_url)

    async def crawl_page(self, page: Page, url: str) -> Optional[Dict]:
        """Crawl a page (auction or lot)"""
        if url in self.visited_lots:
            print(f"    → Skipping (already visited): {url}")
            return None

        page_id = self.parser.extract_lot_id(url)
        print(f"\n[PAGE {page_id}]")

        content = await self._get_page(page, url)
        if not content:
            return None

        page_data = self.parser.parse_page(content, url)
        if not page_data:
            return None

        self.visited_lots.add(url)

        if page_data.get('type') == 'auction':
            print(f"    → Type: AUCTION")
            print(f"    → Title: {page_data.get('title', 'N/A')[:60]}...")
            print(f"    → Location: {page_data.get('location', 'N/A')}")
            print(f"    → Lots: {page_data.get('lots_count', 0)}")
            self.cache.save_auction(page_data)

        elif page_data.get('type') == 'lot':
            print(f"    → Type: LOT")
            print(f"    → Title: {page_data.get('title', 'N/A')[:60]}...")
            print(f"    → Bid: {page_data.get('current_bid', 'N/A')}")
            print(f"    → Location: {page_data.get('location', 'N/A')}")
            self.cache.save_lot(page_data)

            images = page_data.get('images', [])
            if images:
                self.cache.save_images(page_data['lot_id'], images)
                print(f"    → Images: {len(images)}")

                if self.download_images:
                    for i, img_url in enumerate(images):
                        local_path = await self._download_image(img_url, page_data['lot_id'], i)
                        if local_path:
                            print(f"      ✓ Downloaded: {Path(local_path).name}")

        return page_data

    async def crawl_auctions(self, max_pages: int = MAX_PAGES) -> List[Dict]:
        """Main crawl function"""
        async with async_playwright() as p:
            print("Launching browser...")
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-blink-features=AutomationControlled'
                ]
            )

            page = await browser.new_page(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
            )

            await page.set_extra_http_headers({
                'Accept-Language': 'en-US,en;q=0.9',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
            })

            all_auction_urls = []
            all_lot_urls = []

            # Phase 1: Collect auction URLs
            print("\n" + "=" * 60)
            print("PHASE 1: COLLECTING AUCTION URLs FROM LISTING PAGES")
            print("=" * 60)

            for page_num in range(1, max_pages + 1):
                auction_urls = await self.crawl_listing_page(page, page_num)
                if not auction_urls:
                    print(f"No auctions found on page {page_num}, stopping")
                    break
                all_auction_urls.extend(auction_urls)
                print(f"    → Total auctions collected so far: {len(all_auction_urls)}")

            all_auction_urls = list(set(all_auction_urls))
            print(f"\n{'='*60}")
            print(f"PHASE 1 COMPLETE: {len(all_auction_urls)} UNIQUE AUCTIONS")
            print(f"{'='*60}")

            # Phase 2: Extract lot URLs from each auction
            print("\n" + "=" * 60)
            print("PHASE 2: EXTRACTING LOT URLs FROM AUCTIONS")
            print("=" * 60)

            for i, auction_url in enumerate(all_auction_urls):
                print(f"\n[{i+1:>3}/{len(all_auction_urls)}] {self.parser.extract_lot_id(auction_url)}")
                lot_urls = await self.crawl_auction_for_lots(page, auction_url)
                if lot_urls:
                    all_lot_urls.extend(lot_urls)
                    print(f"    → Found {len(lot_urls)} lots")

            all_lot_urls = list(set(all_lot_urls))
            print(f"\n{'='*60}")
            print(f"PHASE 2 COMPLETE: {len(all_lot_urls)} UNIQUE LOTS")
            print(f"{'='*60}")

            # Phase 3: Scrape each lot page
            print("\n" + "=" * 60)
            print("PHASE 3: SCRAPING INDIVIDUAL LOT PAGES")
            print("=" * 60)

            results = []
            for i, lot_url in enumerate(all_lot_urls):
                print(f"\n[{i+1:>3}/{len(all_lot_urls)}] ", end="")
                page_data = await self.crawl_page(page, lot_url)
                if page_data:
                    results.append(page_data)

            await browser.close()
            return results
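The scraper is normally driven from main.py, but a minimal standalone run (illustrative, not part of this commit; requires Playwright with a Chromium build installed) is just:

import asyncio

from scraper import TroostwijkScraper

scraper = TroostwijkScraper()
results = asyncio.run(scraper.crawl_auctions(max_pages=1))
print(f"Scraped {len(results)} lot pages")

Every page fetch and image download funnels through _rate_limit(), so the RATE_LIMIT_SECONDS gap from config holds across all three phases.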
src/test.py (new file, 142 lines)
@@ -0,0 +1,142 @@
#!/usr/bin/env python3
"""
Test module for debugging extraction patterns
"""

import sys
import sqlite3
import time
import re
import json
from datetime import datetime
from pathlib import Path
from typing import Optional

import config
from cache import CacheManager
from scraper import TroostwijkScraper


def test_extraction(
        test_url: str = "https://www.troostwijkauctions.com/a/machines-en-toebehoren-%28hout-en-kunststofverwerking-handlingapparatuur-bouwmachines-landbouwindustrie%29-oost-europa-december-A7-35847"):
    """Test extraction on a specific cached URL to debug patterns"""
    scraper = TroostwijkScraper()

    # Try to get from cache
    cached = scraper.cache.get(test_url)
    if not cached:
        print(f"ERROR: URL not found in cache: {test_url}")
        print(f"\nAvailable cached URLs:")
        with sqlite3.connect(config.CACHE_DB) as conn:
            cursor = conn.execute("SELECT url FROM cache ORDER BY timestamp DESC LIMIT 10")
            for row in cursor.fetchall():
                print(f"  - {row[0]}")
        return

    content = cached['content']
    print(f"\n{'=' * 60}")
    print(f"TESTING EXTRACTION FROM: {test_url}")
    print(f"{'=' * 60}")
    print(f"Content length: {len(content)} chars")
    print(f"Cache age: {(time.time() - cached['timestamp']) / 3600:.1f} hours")

    # Test each extraction method
    page_data = scraper.parser.parse_page(content, test_url)

    print(f"\n{'=' * 60}")
    print("EXTRACTED DATA:")
    print(f"{'=' * 60}")

    if not page_data:
        print("ERROR: No data extracted!")
        return

    print(f"Page Type: {page_data.get('type', 'UNKNOWN')}")
    print()

    for key, value in page_data.items():
        if key == 'images':
            print(f"{key:.<20}: {len(value)} images")
            for img in value[:3]:
                print(f"{'':.<20}  - {img}")
        elif key == 'lots':
            print(f"{key:.<20}: {len(value)} lots in auction")
        else:
            display_value = str(value)[:100] if value else "(empty)"
            # Handle Unicode characters that Windows console can't display
            try:
                print(f"{key:.<20}: {display_value}")
            except UnicodeEncodeError:
                safe_value = display_value.encode('ascii', 'replace').decode('ascii')
                print(f"{key:.<20}: {safe_value}")

    # Validation checks
    print(f"\n{'=' * 60}")
    print("VALIDATION CHECKS:")
    print(f"{'=' * 60}")

    issues = []

    if page_data.get('type') == 'lot':
        if page_data.get('current_bid') in ['Huidig bod', 'Current bid', '€0', '']:
            issues.append("[!] Current bid not extracted correctly")
        else:
            print("[OK] Current bid looks valid:", page_data.get('current_bid'))

    if page_data.get('location') in ['Locatie', 'Location', '']:
        issues.append("[!] Location not extracted correctly")
    else:
        print("[OK] Location looks valid:", page_data.get('location'))

    if page_data.get('title') in ['', '...']:
        issues.append("[!] Title not extracted correctly")
    else:
        print("[OK] Title looks valid:", page_data.get('title', '')[:50])

    if issues:
        print(f"\n[ISSUES FOUND]")
        for issue in issues:
            print(f"  {issue}")
    else:
        print(f"\n[ALL FIELDS EXTRACTED SUCCESSFULLY!]")

    # Debug: Show raw HTML snippets for problematic fields
    print(f"\n{'=' * 60}")
    print("DEBUG: RAW HTML SNIPPETS")
    print(f"{'=' * 60}")

    # Look for bid-related content
    print(f"\n1. Bid patterns in content:")
    bid_matches = re.findall(r'.{0,50}(€[\d,.\s]+).{0,50}', content[:10000])
    for i, match in enumerate(bid_matches[:5], 1):
        print(f"   {i}. {match}")

    # Look for location content
    print(f"\n2. Location patterns in content:")
    loc_matches = re.findall(r'.{0,30}(Locatie|Location).{0,100}', content, re.IGNORECASE)
    for i, match in enumerate(loc_matches[:5], 1):
        print(f"   {i}. ...{match}...")

    # Look for JSON data
    print(f"\n3. JSON/Script data containing auction info:")
    json_patterns = [
        r'"currentBid"[^,}]+',
        r'"location"[^,}]+',
        r'"price"[^,}]+',
        r'"addressLocality"[^,}]+'
    ]
    for pattern in json_patterns:
        matches = re.findall(pattern, content[:50000], re.IGNORECASE)
        if matches:
            print(f"   {pattern}: {matches[:3]}")

    # Look for script tags with structured data
    script_matches = re.findall(r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', content, re.DOTALL)
    if script_matches:
        print(f"\n4. Structured data (JSON-LD) found:")
        for i, script in enumerate(script_matches[:2], 1):
            try:
                data = json.loads(script)
                print(f"   Script {i}: {json.dumps(data, indent=6)[:500]}...")
            except Exception:
                print(f"   Script {i}: {script[:300]}...")