#!/usr/bin/env python3
"""
Scaev Auctions Scraper - Main Entry Point

Focuses on extracting auction lots with caching and rate limiting.
"""
import sys
import asyncio
from urllib.parse import urlparse

import config
from scraper import TroostwijkScraper


def mask_db_url(url: str) -> str:
    """Return the database URL with the password masked for safe logging."""
    try:
        p = urlparse(url)
        user = p.username or ''
        host = p.hostname or ''
        port = f":{p.port}" if p.port else ''
        return f"{p.scheme}://{user}:***@{host}{port}{p.path or ''}"
    except Exception:
        # If the URL cannot be parsed, fall back to the raw string.
        return url


def main():
    """Main execution."""
    # Test mode: `python main.py --test [URL]` runs the extraction test
    # instead of a full crawl.
    if len(sys.argv) > 1 and sys.argv[1] == "--test":
        # Import the test function only when needed to avoid circular imports.
        from test import test_extraction
        test_url = sys.argv[2] if len(sys.argv) > 2 else None
        if test_url:
            test_extraction(test_url)
        else:
            test_extraction()
        return

    print("Scaev Auctions Scraper")
    print("=" * 60)
    if config.OFFLINE:
        print("OFFLINE MODE ENABLED - only database and cache will be used (no network)")
    print(f"Rate limit: {config.RATE_LIMIT_SECONDS} seconds BETWEEN EVERY REQUEST")
    print(f"Database URL: {mask_db_url(config.DATABASE_URL)}")
    print(f"Output directory: {config.OUTPUT_DIR}")
    print(f"Max listing pages: {config.MAX_PAGES}")
    print("=" * 60)

    scraper = TroostwijkScraper()

    try:
        # Clear cache entries older than 7 days to keep the database clean.
        scraper.cache.clear_old(max_age_hours=168)

        # Run the crawler.
        results = asyncio.run(scraper.crawl_auctions(max_pages=config.MAX_PAGES))

        # Export results to files.
        print("\n" + "=" * 60)
        print("EXPORTING RESULTS TO FILES")
        print("=" * 60)
        files = scraper.export_to_files()

        print("\n" + "=" * 60)
        print("CRAWLING COMPLETED SUCCESSFULLY")
        print("=" * 60)
        print(f"Total pages scraped: {len(results)}")
        print(f"\nAuctions JSON: {files['auctions_json']}")
        print(f"Auctions CSV: {files['auctions_csv']}")
        print(f"Lots JSON: {files['lots_json']}")
        print(f"Lots CSV: {files['lots_csv']}")

        # Count auctions vs lots.
        auctions = [r for r in results if r.get('type') == 'auction']
        lots = [r for r in results if r.get('type') == 'lot']
        print(f"\n  Auctions: {len(auctions)}")
        print(f"  Lots: {len(lots)}")

    except KeyboardInterrupt:
        print("\nScraping interrupted by user - partial results saved in output directory")
    except Exception as e:
        print(f"\nERROR during scraping: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
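
# ---------------------------------------------------------------------------
# For reference, a minimal sketch of the `config` module interface this
# script relies on. Only the attribute names are taken from the code above;
# the values, types, and environment-variable names below are assumptions,
# not the actual config.py:
#
#     import os
#
#     OFFLINE = os.environ.get("SCAEV_OFFLINE", "0") == "1"  # skip network, use DB/cache only
#     RATE_LIMIT_SECONDS = 2.0                               # delay between every request
#     DATABASE_URL = os.environ.get("DATABASE_URL", "")      # masked before logging
#     OUTPUT_DIR = "output"                                  # where JSON/CSV exports land
#     MAX_PAGES = 10                                         # cap on listing pages crawled
# ---------------------------------------------------------------------------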
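
# ---------------------------------------------------------------------------
# Likewise, the TroostwijkScraper surface that main() exercises, inferred
# from the calls above (a hedged sketch; the real class in scraper.py may
# differ, and the method bodies here are illustrative stubs only):
#
#     class TroostwijkScraper:
#         cache: CacheManager  # exposes clear_old(max_age_hours)
#
#         async def crawl_auctions(self, max_pages: int) -> list[dict]:
#             """Return scraped records; each dict carries a 'type' key
#             of 'auction' or 'lot'."""
#
#         def export_to_files(self) -> dict[str, str]:
#             """Write JSON/CSV exports and return their paths under the
#             keys 'auctions_json', 'auctions_csv', 'lots_json', 'lots_csv'."""
# ---------------------------------------------------------------------------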