This commit is contained in:
Tour
2025-12-03 11:44:11 +01:00
commit 8b71d5e113
4 changed files with 1145 additions and 0 deletions

main.py (new file, 744 lines)

@@ -0,0 +1,744 @@
#!/usr/bin/env python3
"""
Troostwijk Auctions Scraper
Focuses on extracting auction lots with caching and rate limiting
"""
import asyncio
import json
import csv
import re
import sqlite3
import time
from datetime import datetime
from urllib.parse import urljoin, urlparse
from pathlib import Path
from typing import Any, List, Dict, Optional, Set
import random
# Import Playwright - REQUIRED for bypassing Cloudflare
from playwright.async_api import async_playwright, Browser, Page
# ==================== CONFIGURATION ====================
BASE_URL = "https://www.troostwijkauctions.com"
CACHE_DB = "/mnt/okcomputer/output/cache.db"
OUTPUT_DIR = "/mnt/okcomputer/output"
RATE_LIMIT_SECONDS = 0.5 # Minimum delay, in seconds, enforced between requests
MAX_PAGES = 50 # Number of listing pages to crawl (adjust as needed)
# Setup directories
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
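# A quick way to inspect the cache from a REPL or a separate script (a sketch that
# uses only the schema created below; adjust the path if CACHE_DB is relocated):
#
#   import sqlite3, time
#   with sqlite3.connect(CACHE_DB) as conn:
#       rows = conn.execute(
#           "SELECT url, timestamp, status_code FROM cache ORDER BY timestamp DESC LIMIT 5")
#       for url, ts, status in rows:
#           print(status, f"{(time.time() - ts) / 3600:.1f}h old", url)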
class CacheManager:
"""Manages page caching using SQLite - EVERY PAGE IS CACHED"""
def __init__(self, db_path: str):
self.db_path = db_path
self._init_db()
def _init_db(self):
"""Initialize cache database"""
with sqlite3.connect(self.db_path) as conn:
conn.execute("""
CREATE TABLE IF NOT EXISTS cache (
url TEXT PRIMARY KEY,
content TEXT,
timestamp REAL,
status_code INTEGER
)
""")
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_timestamp ON cache(timestamp)
""")
conn.commit()
def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
"""Get cached page if it exists and is not too old"""
with sqlite3.connect(self.db_path) as conn:
cursor = conn.execute(
"SELECT content, timestamp, status_code FROM cache WHERE url = ?",
(url,)
)
row = cursor.fetchone()
if row:
content, timestamp, status_code = row
age_hours = (time.time() - timestamp) / 3600
if age_hours <= max_age_hours:
return {
'content': content,
'timestamp': timestamp,
'status_code': status_code,
'cached': True
}
return None
def set(self, url: str, content: str, status_code: int = 200):
"""Cache a page - EVERY SUCCESSFUL REQUEST IS CACHED"""
with sqlite3.connect(self.db_path) as conn:
conn.execute(
"INSERT OR REPLACE INTO cache (url, content, timestamp, status_code) VALUES (?, ?, ?, ?)",
(url, content, time.time(), status_code)
)
conn.commit()
print(f" → Cached: {url}")
def clear_old(self, max_age_hours: int = 168): # Default: 1 week
"""Clear old cache entries to prevent database bloat"""
cutoff_time = time.time() - (max_age_hours * 3600)
with sqlite3.connect(self.db_path) as conn:
deleted = conn.execute("DELETE FROM cache WHERE timestamp < ?", (cutoff_time,)).rowcount
conn.commit()
if deleted > 0:
print(f" → Cleared {deleted} old cache entries")
class TroostwijkScraper:
"""Main scraper class for Troostwijk Auctions"""
def __init__(self):
self.base_url = BASE_URL
self.cache = CacheManager(CACHE_DB)
self.visited_lots: Set[str] = set()
self.output_data: List[Dict] = []
self.last_request_time = 0
async def _rate_limit(self):
"""ENSURE EXACTLY 0.5s BETWEEN REQUESTS - YOUR REQUIREMENT"""
current_time = time.time()
time_since_last = current_time - self.last_request_time
if time_since_last < RATE_LIMIT_SECONDS:
delay = RATE_LIMIT_SECONDS - time_since_last
await asyncio.sleep(delay)
self.last_request_time = time.time()
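# Worked example of the spacing above: if the previous request started 0.2s ago,
# time_since_last = 0.2 < 0.5, so the coroutine sleeps 0.5 - 0.2 = 0.3s; if 0.8s
# have already passed, no sleep is added. This enforces a *minimum* gap between
# request starts, not an exact interval; page-load time comes on top of it.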
async def _get_page(self, page: Page, url: str, use_cache: bool = True) -> Optional[str]:
"""Get page content with caching and strict rate limiting"""
# Check cache first - AVOID UNNECESSARY REQUESTS
if use_cache:
cached = self.cache.get(url)
if cached:
print(f" CACHE HIT: {url}")
return cached['content']
# Rate limit before making the request
await self._rate_limit()
try:
print(f" FETCHING: {url}")
await page.goto(url, wait_until='networkidle', timeout=30000)
# Small additional wait for dynamic content
await asyncio.sleep(random.uniform(0.3, 0.7))
content = await page.content()
# Cache the successful result
self.cache.set(url, content, 200)
return content
except Exception as e:
print(f" ERROR: {e}")
# Cache the error to avoid retrying too soon
self.cache.set(url, "", 500)
return None
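# Note: CacheManager.get() does not filter on status_code, so a failure cached
# above as ("", 500) is served back as an empty string for up to max_age_hours;
# callers treat empty content as a miss and skip the URL instead of re-fetching.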
def _extract_lot_urls_from_listing(self, content: str) -> List[str]:
"""Extract lot URLs from auction listing page"""
# Pattern matches lot/auction links of the form /a/<slug>
pattern = r'href=["\']([/]a/[^"\']+)["\']'
matches = re.findall(pattern, content, re.IGNORECASE)
lot_urls = []
for match in matches:
full_url = urljoin(self.base_url, match)
lot_urls.append(full_url)
# Remove duplicates
return list(set(lot_urls))
def _extract_lot_id(self, url: str) -> str:
"""Extract lot ID from URL"""
path = urlparse(url).path
# Try /lots/ pattern first (legacy)
match = re.search(r'/lots/(\d+)', path)
if match:
return match.group(1)
# Try /a/ pattern (current format: /a/title-A7-12345)
match = re.search(r'/a/.*?([A-Z]\d+-\d+)', path)
if match:
return match.group(1)
# Fallback: return last part of path
return path.split('/')[-1] if path else ""
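# Illustrative inputs/outputs for _extract_lot_id (slugs are made up; only the
# formats mirror the patterns above):
#
#   /lots/123456                      -> "123456"
#   /a/some-auction-title-A7-35847    -> "A7-35847"
#   /a/unmatched-slug                 -> "unmatched-slug"   (fallback: last path part)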
def _parse_lot_page(self, content: str, url: str) -> Dict:
"""Parse individual lot page and extract data"""
# First try to extract from __NEXT_DATA__ JSON (Next.js sites)
next_data = self._extract_nextjs_data(content)
if next_data:
return next_data
# Fallback to HTML parsing
content = re.sub(r'\s+', ' ', content)
data = {
'url': url,
'lot_id': self._extract_lot_id(url),
'title': self._extract_meta_content(content, 'og:title'),
'current_bid': self._extract_current_bid(content),
'bid_count': self._extract_bid_count(content),
'end_date': self._extract_end_date(content),
'location': self._extract_location(content),
'description': self._extract_description(content),
'category': self._extract_category(content),
'images': self._extract_images(content),
'scraped_at': datetime.now().isoformat()
}
return data
def _extract_nextjs_data(self, content: str) -> Optional[Dict]:
"""Extract data from Next.js __NEXT_DATA__ JSON"""
try:
# Find the __NEXT_DATA__ script tag
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
if not match:
return None
data = json.loads(match.group(1))
# Navigate to pageProps
page_props = data.get('props', {}).get('pageProps', {})
# Check if this is an auction page (contains lot data)
if 'auction' in page_props:
# This is a single lot/auction page
auction = page_props.get('auction', {})
# Extract main data
result = {
'url': self.base_url + '/a/' + auction.get('urlSlug', ''),
'lot_id': auction.get('displayId', ''),
'title': auction.get('name', ''),
'current_bid': '', # Need to check if this has bid info
'bid_count': 0,
'end_date': self._format_timestamp(auction.get('minEndDate', '')),
'location': self._extract_location_from_json(auction),
'description': auction.get('description', ''),
'category': auction.get('category', {}).get('name', '') if isinstance(auction.get('category'), dict) else '',
'images': [auction['image']['url']] if auction.get('image') and auction['image'].get('url') else [],
'scraped_at': datetime.now().isoformat()
}
return result
return None
except Exception as e:
print(f" → Error parsing __NEXT_DATA__: {e}")
return None
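# Shape of the __NEXT_DATA__ payload assumed by the lookups above (reconstructed
# from this method and its helpers; values are placeholders):
#
#   {"props": {"pageProps": {"auction": {
#       "urlSlug": "...", "displayId": "A7-35847", "name": "...",
#       "minEndDate": 1764758651, "description": "...",
#       "category": {"name": "..."}, "image": {"url": "https://..."},
#       "viewingDays": [{"city": "Amsterdam", "countryCode": "nl"}],
#       "collectionDays": [{"city": "Amsterdam", "countryCode": "nl"}]}}}}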
def _format_timestamp(self, timestamp: Any) -> str:
"""Convert Unix timestamp to readable date"""
try:
if isinstance(timestamp, (int, float)) and timestamp > 0:
return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
return str(timestamp) if timestamp else ''
except (TypeError, ValueError, OSError, OverflowError):
return str(timestamp) if timestamp else ''
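# Behaviour sketch: a numeric Unix epoch (seconds) is rendered in the local
# timezone; non-numeric or non-positive numbers fall through as str(value), and
# falsy input ('', None, 0) comes back as ''.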
def _extract_location_from_json(self, auction_data: Dict) -> str:
"""Extract location from auction JSON data"""
# Try viewingDays first
viewing_days = auction_data.get('viewingDays', [])
if viewing_days and len(viewing_days) > 0:
first_location = viewing_days[0]
city = first_location.get('city', '')
country = first_location.get('countryCode', '').upper()
if city:
return f"{city}, {country}" if country else city
# Try collectionDays
collection_days = auction_data.get('collectionDays', [])
if collection_days and len(collection_days) > 0:
first_location = collection_days[0]
city = first_location.get('city', '')
country = first_location.get('countryCode', '').upper()
if city:
return f"{city}, {country}" if country else city
return ''
def _extract_meta_content(self, content: str, property_name: str) -> str:
"""Extract content from meta tags"""
pattern = rf'<meta[^>]*property=["\']{property_name}["\'][^>]*content=["\']([^"\']+)["\']'
match = re.search(pattern, content, re.IGNORECASE)
if match:
return self._clean_text(match.group(1))
return ""
def _extract_current_bid(self, content: str) -> str:
"""Extract current bid amount"""
patterns = [
# JSON data patterns (most reliable)
r'"currentBid"\s*:\s*"([^"]+)"',
r'"currentBid"\s*:\s*(\d+(?:\.\d+)?)',
r'currentBid["\']?\s*:\s*["\']?([€\d,.\s]+)["\']?',
# HTML patterns - look for bid amount AFTER the label
r'(?:Current bid|Huidig bod)[:\s]*</?\w*>\s*(€[\d,.\s]+)',
r'(?:Current bid|Huidig bod)[:\s]+(€[\d,.\s]+)',
r'<[^>]*bid-amount[^>]*>[\s]*(€[\d,.\s]+)',
# Meta tags
r'<meta[^>]*property=["\']auction:currentBid["\'][^>]*content=["\']([^"\']+)["\']',
# Structured data
r'"price"\s*:\s*"([€\d,.\s]+)"',
]
for pattern in patterns:
match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
if match:
bid = match.group(1).strip()
# Validate it's not just the label
if bid and bid.lower() not in ['huidig bod', 'current bid', 'locatie', 'location']:
# Clean up the bid value
if not bid.startswith('€'):
bid = f"€{bid}"
return bid
return "€0"
def _extract_bid_count(self, content: str) -> int:
"""Extract number of bids"""
patterns = [
r'(\d+)\s*bids?',
r'bidCount["\']:\s*["\']?(\d+)["\']?'
]
for pattern in patterns:
match = re.search(pattern, content, re.IGNORECASE)
if match:
try:
return int(match.group(1))
except ValueError:
return 0
return 0
def _extract_end_date(self, content: str) -> str:
"""Extract auction end date"""
patterns = [
r'Ends?[:\s]+([A-Za-z0-9,:\s]+)',
r'endTime["\']:\s*["\']([^"\']+)["\']',
r'class="[^"]*end[^"]*".*?>([A-Za-z0-9,:\s]+)<'
]
for pattern in patterns:
match = re.search(pattern, content, re.IGNORECASE)
if match:
return match.group(1).strip()
return ""
def _extract_location(self, content: str) -> str:
"""Extract location"""
patterns = [
# JSON data patterns (most reliable)
r'"location"\s*:\s*"([^"]+)"',
r'"address"\s*:\s*"([^"]+)"',
r'"addressLocality"\s*:\s*"([^"]+)"',
# HTML patterns - look for location AFTER the label
r'(?:Location|Locatie)[:\s]*</?\w*>\s*([A-Za-zÀ-ÿ0-9\s,.-]+?)(?:<|$)',
r'(?:Location|Locatie)[:\s]+([A-Za-zÀ-ÿ0-9\s,.-]+?)(?:<br|</|$)',
r'<[^>]*location[^>]*>[\s]*([A-Za-zÀ-ÿ0-9\s,.-]+?)</[^>]*>',
# Icon or label based
r'<i[^>]*location[^>]*></i>\s*([A-Za-zÀ-ÿ0-9\s,.-]+)',
# Meta tags
r'<meta[^>]*property=["\']auction:location["\'][^>]*content=["\']([^"\']+)["\']',
]
for pattern in patterns:
match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
if match:
location = self._clean_text(match.group(1))
# Validate it's not just the label
if location and location.lower() not in ['locatie', 'location', 'huidig bod', 'current bid']:
# Remove trailing punctuation and whitespace
location = re.sub(r'[,.\s]+$', '', location)
if len(location) > 2: # Must be more than 2 chars
return location
return ""
def _extract_description(self, content: str) -> str:
"""Extract description"""
patterns = [
r'<meta[^>]*name=["\']description["\'][^>]*content=["\']([^"\']+)["\']',
r'class="[^"]*description[^"]*".*?>([^<]+)<'
]
for pattern in patterns:
match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
if match:
return self._clean_text(match.group(1))[:500]
return ""
def _extract_category(self, content: str) -> str:
"""Extract category from breadcrumb or meta tags"""
# Try breadcrumb first
pattern = r'class="breadcrumb[^"]*".*?>([A-Za-z\s]+)</a>'
match = re.search(pattern, content, re.IGNORECASE)
if match:
return self._clean_text(match.group(1))
# Try meta
return self._extract_meta_content(content, 'category')
def _extract_images(self, content: str) -> List[str]:
"""Extract image URLs"""
pattern = r'<img[^>]*src=["\']([^"\']+\.jpe?g|[^"\']+\.png)["\'][^>]*>'
matches = re.findall(pattern, content, re.IGNORECASE)
images = []
for match in matches:
if any(skip in match.lower() for skip in ['logo', 'icon', 'placeholder', 'banner']):
continue
full_url = urljoin(self.base_url, match)
images.append(full_url)
return images[:5] # Limit to 5 images
def _clean_text(self, text: str) -> str:
"""Clean extracted text"""
import html
text = html.unescape(text)
text = re.sub(r'\s+', ' ', text)
return text.strip()
async def crawl_listing_page(self, page: Page, page_num: int) -> List[str]:
"""Crawl a single listing page and return lot URLs"""
url = f"{self.base_url}/auctions?page={page_num}"
print(f"\n{'='*60}")
print(f"LISTING PAGE {page_num}: {url}")
print(f"{'='*60}")
content = await self._get_page(page, url)
if not content:
return []
lot_urls = self._extract_lot_urls_from_listing(content)
print(f"→ Found {len(lot_urls)} lot URLs")
return lot_urls
async def crawl_lot(self, page: Page, url: str) -> Optional[Dict]:
"""Crawl an individual lot page"""
if url in self.visited_lots:
print(f" → Skipping (already visited): {url}")
return None
lot_id = self._extract_lot_id(url)
print(f"\n[LOT {lot_id}]")
content = await self._get_page(page, url)
if not content:
return None
lot_data = self._parse_lot_page(content, url)
self.visited_lots.add(url)
print(f" → Title: {lot_data.get('title', 'N/A')[:60]}...")
print(f" → Bid: {lot_data.get('current_bid', 'N/A')}")
print(f" → Location: {lot_data.get('location', 'N/A')}")
return lot_data
async def crawl_auctions(self, max_pages: int = MAX_PAGES) -> List[Dict]:
"""Main crawl function"""
async with async_playwright() as p:
print("Launching browser...")
browser = await p.chromium.launch(
headless=True,
args=[
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-blink-features=AutomationControlled'
]
)
page = await browser.new_page(
viewport={'width': 1920, 'height': 1080},
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
)
# Set extra headers
await page.set_extra_http_headers({
'Accept-Language': 'en-US,en;q=0.9',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
})
all_lot_urls = []
# First pass: collect all lot URLs from listing pages
print("\n" + "="*60)
print("PHASE 1: COLLECTING LOT URLs FROM LISTING PAGES")
print("="*60)
for page_num in range(1, max_pages + 1):
lot_urls = await self.crawl_listing_page(page, page_num)
if not lot_urls:
print(f"No lots found on page {page_num}, stopping")
break
all_lot_urls.extend(lot_urls)
print(f" → Total lots collected so far: {len(all_lot_urls)}")
# Remove duplicates
all_lot_urls = list(set(all_lot_urls))
print(f"\n{'='*60}")
print(f"PHASE 1 COMPLETE: {len(all_lot_urls)} UNIQUE LOTS TO SCRAPE")
print(f"{'='*60}")
# Second pass: scrape each lot page
print("\n" + "="*60)
print("PHASE 2: SCRAPING INDIVIDUAL LOT PAGES")
print("="*60)
results = []
for i, lot_url in enumerate(all_lot_urls):
print(f"\n[{i+1:>3}/{len(all_lot_urls)}] ", end="")
lot_data = await self.crawl_lot(page, lot_url)
if lot_data:
results.append(lot_data)
# Save progress periodically
if (i + 1) % 10 == 0: # Save every 10 lots
self._save_intermediate(results)
await browser.close()
return results
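# Rough cost of the enforced rate limit: with MAX_PAGES = 50, phase 1 spends at
# most about 50 * 0.5 = 25s in forced waits (plus real page-load time), and
# phase 2 adds up to 0.5s of enforced waiting per uncached lot URL.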
def _save_intermediate(self, data: List[Dict]):
"""Save intermediate results"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"{OUTPUT_DIR}/troostwijk_lots_partial_{timestamp}.json"
with open(filename, 'w', encoding='utf-8') as f:
json.dump({
'count': len(data),
'lots': data
}, f, indent=2, ensure_ascii=False)
print(f"\n → PROGRESS SAVED: {filename}")
def save_final_results(self, data: List[Dict]):
"""Save final results in multiple formats"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Save JSON
json_file = f"{OUTPUT_DIR}/troostwijk_lots_final_{timestamp}.json"
with open(json_file, 'w', encoding='utf-8') as f:
json.dump({
'count': len(data),
'scraped_at': datetime.now().isoformat(),
'rate_limit_seconds': RATE_LIMIT_SECONDS,
'lots': data
}, f, indent=2, ensure_ascii=False)
# Save CSV
csv_file = f"{OUTPUT_DIR}/troostwijk_lots_final_{timestamp}.csv"
if data:
flat_data = []
for item in data:
flat_item = item.copy()
flat_item['images'] = ', '.join(flat_item.get('images', []))
flat_data.append(flat_item)
with open(csv_file, 'w', newline='', encoding='utf-8') as f:
fieldnames = ['url', 'lot_id', 'title', 'current_bid', 'bid_count',
'end_date', 'location', 'description', 'category', 'images', 'scraped_at']
writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
writer.writeheader()
writer.writerows(flat_data)
return json_file, csv_file
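# Reading the saved output back (a sketch; filenames carry a timestamp, so pick
# the newest match):
#
#   import glob, json
#   latest = sorted(glob.glob(f"{OUTPUT_DIR}/troostwijk_lots_final_*.json"))[-1]
#   with open(latest, encoding='utf-8') as f:
#       payload = json.load(f)
#   print(payload['count'], payload['lots'][0]['title'])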
def test_extraction(test_url: str = "https://www.troostwijkauctions.com/a/machines-en-toebehoren-%28hout-en-kunststofverwerking-handlingapparatuur-bouwmachines-landbouwindustrie%29-oost-europa-december-A7-35847"):
"""Test extraction on a specific cached URL to debug patterns"""
scraper = TroostwijkScraper()
# Try to get from cache
cached = scraper.cache.get(test_url)
if not cached:
print(f"ERROR: URL not found in cache: {test_url}")
print(f"\nAvailable cached URLs:")
with sqlite3.connect(CACHE_DB) as conn:
cursor = conn.execute("SELECT url FROM cache ORDER BY timestamp DESC LIMIT 10")
for row in cursor.fetchall():
print(f" - {row[0]}")
return
content = cached['content']
print(f"\n{'='*60}")
print(f"TESTING EXTRACTION FROM: {test_url}")
print(f"{'='*60}")
print(f"Content length: {len(content)} chars")
print(f"Cache age: {(time.time() - cached['timestamp']) / 3600:.1f} hours")
# Test each extraction method
lot_data = scraper._parse_lot_page(content, test_url)
print(f"\n{'='*60}")
print("EXTRACTED DATA:")
print(f"{'='*60}")
for key, value in lot_data.items():
if key == 'images':
print(f"{key:.<20}: {len(value)} images")
for img in value[:3]:
print(f"{'':.<20} - {img}")
else:
display_value = str(value)[:100] if value else "(empty)"
# Handle Unicode characters that Windows console can't display
try:
print(f"{key:.<20}: {display_value}")
except UnicodeEncodeError:
safe_value = display_value.encode('ascii', 'replace').decode('ascii')
print(f"{key:.<20}: {safe_value}")
# Validation checks
print(f"\n{'='*60}")
print("VALIDATION CHECKS:")
print(f"{'='*60}")
issues = []
if lot_data['current_bid'] in ['Huidig bod', 'Current bid', '€0', '']:
issues.append("[!] Current bid not extracted correctly")
else:
print("[OK] Current bid looks valid:", lot_data['current_bid'])
if lot_data['location'] in ['Locatie', 'Location', '']:
issues.append("[!] Location not extracted correctly")
else:
print("[OK] Location looks valid:", lot_data['location'])
if lot_data['title'] in ['', '...']:
issues.append("[!] Title not extracted correctly")
else:
print("[OK] Title looks valid:", lot_data['title'][:50])
if issues:
print(f"\n[ISSUES FOUND]")
for issue in issues:
print(f" {issue}")
else:
print(f"\n[ALL FIELDS EXTRACTED SUCCESSFULLY!]")
# Debug: Show raw HTML snippets for problematic fields
print(f"\n{'='*60}")
print("DEBUG: RAW HTML SNIPPETS")
print(f"{'='*60}")
# Look for bid-related content
print(f"\n1. Bid patterns in content:")
bid_matches = re.findall(r'.{0,50}(€[\d,.\s]+).{0,50}', content[:10000])
for i, match in enumerate(bid_matches[:5], 1):
print(f" {i}. {match}")
# Look for location content
print(f"\n2. Location patterns in content:")
loc_matches = re.findall(r'.{0,30}(?:Locatie|Location).{0,100}', content, re.IGNORECASE) # non-capturing group so the surrounding context is returned
for i, match in enumerate(loc_matches[:5], 1):
print(f" {i}. ...{match}...")
# Look for JSON data
print(f"\n3. JSON/Script data containing auction info:")
json_patterns = [
r'"currentBid"[^,}]+',
r'"location"[^,}]+',
r'"price"[^,}]+',
r'"addressLocality"[^,}]+'
]
for pattern in json_patterns:
matches = re.findall(pattern, content[:50000], re.IGNORECASE)
if matches:
print(f" {pattern}: {matches[:3]}")
# Look for script tags with structured data
script_matches = re.findall(r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', content, re.DOTALL)
if script_matches:
print(f"\n4. Structured data (JSON-LD) found:")
for i, script in enumerate(script_matches[:2], 1):
try:
data = json.loads(script)
print(f" Script {i}: {json.dumps(data, indent=6)[:500]}...")
except json.JSONDecodeError:
print(f" Script {i}: {script[:300]}...")
def main():
"""Main execution"""
import sys
# Check for test mode
if len(sys.argv) > 1 and sys.argv[1] == "--test":
test_url = sys.argv[2] if len(sys.argv) > 2 else None
if test_url:
test_extraction(test_url)
else:
test_extraction()
return
print("Troostwijk Auctions Scraper")
print("=" * 60)
print(f"Rate limit: {RATE_LIMIT_SECONDS} seconds BETWEEN EVERY REQUEST")
print(f"Cache database: {CACHE_DB}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Max listing pages: {MAX_PAGES}")
print("=" * 60)
scraper = TroostwijkScraper()
try:
# Clear old cache (older than 7 days) - KEEP DATABASE CLEAN
scraper.cache.clear_old(max_age_hours=168)
# Run the crawler
results = asyncio.run(scraper.crawl_auctions(max_pages=MAX_PAGES))
# Save final results
if results:
json_file, csv_file = scraper.save_final_results(results)
print("\n" + "="*60)
print("CRAWLING COMPLETED SUCCESSFULLY")
print("="*60)
print(f"Total lots scraped: {len(results)}")
print(f"JSON file: {json_file}")
print(f"CSV file: {csv_file}")
# Show sample
if results:
print(f"\n{'='*60}")
print("SAMPLE DATA:")
print(f"{'='*60}")
sample = results[0]
for key, value in sample.items():
if key != 'images':
print(f"{key:.<20}: {str(value)[:80]}...")
else:
print("\nNo results collected. Check cache and logs.")
except KeyboardInterrupt:
print("\nScraping interrupted by user - partial results saved in output directory")
except Exception as e:
print(f"\nERROR during scraping: {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
main()
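# Invocation, as handled in main():
#
#   python main.py                 # full crawl: listing pages, then lot pages
#   python main.py --test          # re-parse the default cached lot URL
#   python main.py --test <url>    # re-parse a specific URL already in the cache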