Initial

src/main.py (new file, 83 lines)
@@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""
Scaev Auctions Scraper - Main Entry Point

Focuses on extracting auction lots with caching and rate limiting.
"""
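
# Usage (inferred from the argument handling in main() below):
#   python src/main.py               -> full crawl with caching and rate limiting
#   python src/main.py --test [URL]  -> run test_extraction() on a single page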

import sys
import asyncio

import config
from scraper import TroostwijkScraper

def main():
    """Main execution"""
    # Check for test mode
    if len(sys.argv) > 1 and sys.argv[1] == "--test":
        # Import test function only when needed to avoid circular imports
        from test import test_extraction
        test_url = sys.argv[2] if len(sys.argv) > 2 else None
        if test_url:
            test_extraction(test_url)
        else:
            test_extraction()
        return

    print("Scaev Auctions Scraper")
    print("=" * 60)
    if config.OFFLINE:
        print("OFFLINE MODE ENABLED - only database and cache will be used (no network)")
    print(f"Rate limit: {config.RATE_LIMIT_SECONDS} seconds BETWEEN EVERY REQUEST")
    print(f"Cache database: {config.CACHE_DB}")
    print(f"Output directory: {config.OUTPUT_DIR}")
    print(f"Max listing pages: {config.MAX_PAGES}")
    print("=" * 60)

    scraper = TroostwijkScraper()

    try:
        # Clear cache entries older than 7 days (168 hours) to keep the database clean
        scraper.cache.clear_old(max_age_hours=168)

        # Run the crawler
        results = asyncio.run(scraper.crawl_auctions(max_pages=config.MAX_PAGES))
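
        # Note: each entry in 'results' is expected to carry a 'type' field
        # ('auction' or 'lot'); the summary counts at the end rely on it.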

        # Export results to files
        print("\n" + "=" * 60)
        print("EXPORTING RESULTS TO FILES")
        print("=" * 60)

        files = scraper.export_to_files()
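        # 'files' maps export names to the paths written by export_to_files();
        # the keys used below are assumed to match that return value.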

        print("\n" + "=" * 60)
        print("CRAWLING COMPLETED SUCCESSFULLY")
        print("=" * 60)
        print(f"Total pages scraped: {len(results)}")
        print(f"\nAuctions JSON: {files['auctions_json']}")
        print(f"Auctions CSV: {files['auctions_csv']}")
        print(f"Lots JSON: {files['lots_json']}")
        print(f"Lots CSV: {files['lots_csv']}")

        # Count auctions vs lots
        auctions = [r for r in results if r.get('type') == 'auction']
        lots = [r for r in results if r.get('type') == 'lot']
        print(f"\n  Auctions: {len(auctions)}")
        print(f"  Lots: {len(lots)}")

    except KeyboardInterrupt:
        print("\nScraping interrupted by user - partial results saved in output directory")
    except Exception as e:
        print(f"\nERROR during scraping: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
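
For reference, main() assumes a local config module exposing five settings. A
minimal sketch, inferred only from the attribute names used above (every value
below is a hypothetical placeholder, not taken from the repository):

    # config.py - hypothetical placeholder values; only the names OFFLINE,
    # RATE_LIMIT_SECONDS, CACHE_DB, OUTPUT_DIR and MAX_PAGES come from
    # main.py's usage
    OFFLINE = False             # True: use only database and cache, no network
    RATE_LIMIT_SECONDS = 5      # delay enforced between every request
    CACHE_DB = "cache.db"       # path to the cache database file
    OUTPUT_DIR = "output"       # directory for the JSON/CSV exports
    MAX_PAGES = 10              # upper bound passed to crawl_auctions()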