- Hardened the GraphQL client to reduce 403 occurrences and provide clearer diagnostics when they appear.
- Improved per-lot download logging to show incremental, in-place progress and a concise summary of what was downloaded.
### Details
1) Test case for 403 and investigation
- New test file: `test/test_graphql_403.py`.
- Uses `importlib` to load `src/config.py` and `src/graphql_client.py` directly so it’s independent of sys.path quirks.
- Mocks `aiohttp.ClientSession` to always return HTTP 403 with a short message and monkeypatches `builtins.print` to capture logs.
- Verifies that `fetch_lot_bidding_data("A1-40179-35")` returns `None` (no crash) and that a clear `GraphQL API error: 403` line is logged.
- Result: `pytest test/test_graphql_403.py -q` passes locally; a simplified sketch of the test's shape is shown after this item.
- Root cause insights (from investigation and log improvements):
- 403s come from the GraphQL endpoint (not the HTML page) and are most likely caused by WAF/CDN protections that reject non-browser-like requests or react to rate spikes.
- To mitigate, I added realistic browser headers (User-Agent, Origin, Referer) and a small retry with backoff for 403/429 to absorb transient protection triggers. When a 403 persists, we now log the status and a safe, truncated snippet of the response body for troubleshooting.
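In shape, the test looks roughly like the sketch below. The `FakeResponse`/`FakeSession` classes and the plain `import graphql_client` are simplifications for illustration; the real test loads `src/graphql_client.py` via `importlib` and captures output by monkeypatching `builtins.print` rather than using `capsys`. The sketch also assumes the client does `import aiohttp`, creates the session and the POST with the usual `async with` pattern, and exposes `fetch_lot_bidding_data()` as a module-level coroutine.

```python
# Simplified sketch only, not the actual test/test_graphql_403.py.
import asyncio


class FakeResponse:
    """Stands in for aiohttp's response object: always HTTP 403 with a short body."""
    status = 403

    async def text(self):
        return "Forbidden by WAF"

    async def __aenter__(self):
        return self

    async def __aexit__(self, *exc):
        return False


class FakeSession:
    """Stands in for aiohttp.ClientSession; every post() yields a 403."""
    def post(self, *args, **kwargs):
        return FakeResponse()

    async def __aenter__(self):
        return self

    async def __aexit__(self, *exc):
        return False


def test_fetch_returns_none_and_logs_403(monkeypatch, capsys):
    import graphql_client  # assumption: src/ is importable; the real test uses importlib

    # Assumption: the client does `import aiohttp`, so patching the module attribute
    # makes every new session a FakeSession for the duration of the test.
    monkeypatch.setattr(graphql_client.aiohttp, "ClientSession",
                        lambda *args, **kwargs: FakeSession())

    result = asyncio.run(graphql_client.fetch_lot_bidding_data("A1-40179-35"))

    assert result is None                                   # graceful None, no crash
    assert "GraphQL API error: 403" in capsys.readouterr().out
```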
2) Incremental/in-place logging for downloads
- Updated `src/scraper.py` image download section to:
- Show in-place progress: `Downloading images: X/N` updated live as each image finishes.
- After completion, print: `Downloaded: K/N new images`.
- Also list the indexes of images that were actually downloaded (first 20, then `(+M more)` if applicable), so you see exactly what was fetched for the lot; a sketch of this progress/summary pattern follows below.
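The in-place progress is just a carriage-return rewrite of a single terminal line. The sketch below shows the pattern under illustrative names (`fetch_and_store` and `download_images` are placeholders, not the scraper's real methods):

```python
# Minimal sketch of the live "X/N" progress + summary pattern.
import asyncio


async def fetch_and_store(url: str) -> bool:
    """Placeholder for the real per-image download; True means it was newly saved."""
    await asyncio.sleep(0)  # simulate I/O
    return True


async def download_images(urls: list[str]) -> None:
    total = len(urls)
    done = 0
    new_indexes: list[int] = []

    async def download_one(i: int, url: str) -> None:
        nonlocal done
        was_new = await fetch_and_store(url)
        done += 1
        # '\r' rewrites the same terminal line, giving live "Downloading images: X/N"
        print(f"\rDownloading images: {done}/{total}", end="", flush=True)
        if was_new:
            new_indexes.append(i)

    print(f"Downloading images: 0/{total}", end="", flush=True)
    await asyncio.gather(*(download_one(i, u) for i, u in enumerate(urls)))
    print()  # leave the in-place line before printing the summary

    print(f"Downloaded: {len(new_indexes)}/{total} new images")
    shown = sorted(new_indexes)[:20]
    extra = len(new_indexes) - len(shown)
    suffix = f" (+{extra} more)" if extra else ""
    print("Indexes: " + ", ".join(map(str, shown)) + suffix)


if __name__ == "__main__":
    asyncio.run(download_images([f"https://example.com/img/{i}.jpg" for i in range(6)]))
```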
3) GraphQL client improvements
- Updated `src/graphql_client.py`:
- Added browser-like headers and contextual Referer.
- Added small retry with backoff for 403/429.
- Improved error logs to include the HTTP status, lot id, and a short body snippet (see the sketch below).
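In outline, the client change looks roughly like this. The endpoint URL, header values, and the `post_graphql` name are placeholders for illustration; the real headers, retry counts, and backoff constants live in `src/graphql_client.py`:

```python
# Sketch of the header/retry/logging approach, not the actual client code.
import asyncio
import aiohttp

GRAPHQL_URL = "https://example.com/graphql"          # placeholder endpoint
BROWSER_HEADERS = {                                  # browser-like defaults
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    "Origin": "https://example.com",
    "Content-Type": "application/json",
}


async def post_graphql(session: aiohttp.ClientSession, payload: dict,
                       lot_id: str, referer: str, retries: int = 3):
    headers = dict(BROWSER_HEADERS, Referer=referer)   # contextual Referer per lot
    for attempt in range(retries):
        async with session.post(GRAPHQL_URL, json=payload, headers=headers) as resp:
            if resp.status in (403, 429) and attempt < retries - 1:
                await asyncio.sleep(1.5 * (attempt + 1))   # small backoff, then retry
                continue
            if resp.status != 200:
                snippet = (await resp.text())[:200]        # safe, truncated body
                print(f"GraphQL API error: {resp.status} (lot={lot_id}) — {snippet}")
                return None
            return await resp.json()
    return None
```

In practice this would be called with a shared `aiohttp.ClientSession` and the lot page URL as `referer`, so the request looks like it originated from the browser session that viewed the lot.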
### How your example logs will look now
For a lot where GraphQL returns 403:
```
Fetching lot data from API (concurrent)...
GraphQL API error: 403 (lot=A1-40179-35) — Forbidden by WAF
```
For image downloads:
```
Images: 6
Downloading images: 0/6
... 6/6
Downloaded: 6/6 new images
Indexes: 0, 1, 2, 3, 4, 5
```
(When all cached: `All 6 images already cached`)
### Notes
- Full test run surfaced a pre-existing import error in `test/test_scraper.py` (unrelated to these changes). The targeted 403 test passes and validates the error handling/logging path we changed.
- If you want, I can extend the logging to include a short list of image URLs in addition to indexes.
For reference, the attached scraper entry point (Python, 93 lines, 2.9 KiB):
```python
#!/usr/bin/env python3
"""
Scaev Auctions Scraper - Main Entry Point
Focuses on extracting auction lots with caching and rate limiting
"""

import sys
import asyncio
import json
import csv
from datetime import datetime
from pathlib import Path

import config
from cache import CacheManager
from scraper import TroostwijkScraper


def mask_db_url(url: str) -> str:
    try:
        from urllib.parse import urlparse
        p = urlparse(url)
        user = p.username or ''
        host = p.hostname or ''
        port = f":{p.port}" if p.port else ''
        return f"{p.scheme}://{user}:***@{host}{port}{p.path or ''}"
    except Exception:
        return url


def main():
    """Main execution"""
    # Check for test mode
    if len(sys.argv) > 1 and sys.argv[1] == "--test":
        # Import test function only when needed to avoid circular imports
        from test import test_extraction
        test_url = sys.argv[2] if len(sys.argv) > 2 else None
        if test_url:
            test_extraction(test_url)
        else:
            test_extraction()
        return

    print("Scaev Auctions Scraper")
    print("=" * 60)
    if config.OFFLINE:
        print("OFFLINE MODE ENABLED — only database and cache will be used (no network)")
    print(f"Rate limit: {config.RATE_LIMIT_SECONDS} seconds BETWEEN EVERY REQUEST")
    print(f"Database URL: {mask_db_url(config.DATABASE_URL)}")
    print(f"Output directory: {config.OUTPUT_DIR}")
    print(f"Max listing pages: {config.MAX_PAGES}")
    print("=" * 60)

    scraper = TroostwijkScraper()

    try:
        # Clear old cache (older than 7 days) - KEEP DATABASE CLEAN
        scraper.cache.clear_old(max_age_hours=168)

        # Run the crawler
        results = asyncio.run(scraper.crawl_auctions(max_pages=config.MAX_PAGES))

        # Export results to files
        print("\n" + "=" * 60)
        print("EXPORTING RESULTS TO FILES")
        print("=" * 60)

        files = scraper.export_to_files()

        print("\n" + "=" * 60)
        print("CRAWLING COMPLETED SUCCESSFULLY")
        print("=" * 60)
        print(f"Total pages scraped: {len(results)}")
        print(f"\nAuctions JSON: {files['auctions_json']}")
        print(f"Auctions CSV: {files['auctions_csv']}")
        print(f"Lots JSON: {files['lots_json']}")
        print(f"Lots CSV: {files['lots_csv']}")

        # Count auctions vs lots
        auctions = [r for r in results if r.get('type') == 'auction']
        lots = [r for r in results if r.get('type') == 'lot']
        print(f"\n Auctions: {len(auctions)}")
        print(f" Lots: {len(lots)}")

    except KeyboardInterrupt:
        print("\nScraping interrupted by user - partial results saved in output directory")
    except Exception as e:
        print(f"\nERROR during scraping: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    from cache import CacheManager
    from scraper import TroostwijkScraper
    main()
```