commit e69563d4d6
Author: Tour
Date: 2025-12-09 08:04:16 +01:00

37 changed files with 7262 additions and 0 deletions

src/main.py (new file, 83 lines)

@@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""
Scaev Auctions Scraper - Main Entry Point
Focuses on extracting auction lots with caching and rate limiting
"""
import sys
import asyncio

import config
from scraper import TroostwijkScraper


def main():
    """Main execution"""
    # Test mode: "--test [url]" runs a single extraction instead of a full crawl
    if len(sys.argv) > 1 and sys.argv[1] == "--test":
        # Import the test function only when needed to avoid circular imports
        from test import test_extraction
        test_url = sys.argv[2] if len(sys.argv) > 2 else None
        if test_url:
            test_extraction(test_url)
        else:
            test_extraction()
        return
print("Scaev Auctions Scraper")
print("=" * 60)
if config.OFFLINE:
print("OFFLINE MODE ENABLED — only database and cache will be used (no network)")
print(f"Rate limit: {config.RATE_LIMIT_SECONDS} seconds BETWEEN EVERY REQUEST")
print(f"Cache database: {config.CACHE_DB}")
print(f"Output directory: {config.OUTPUT_DIR}")
print(f"Max listing pages: {config.MAX_PAGES}")
print("=" * 60)

    scraper = TroostwijkScraper()
    try:
        # Clear cache entries older than 7 days (168 hours) to keep the database clean
        scraper.cache.clear_old(max_age_hours=168)

        # Run the crawler
        results = asyncio.run(scraper.crawl_auctions(max_pages=config.MAX_PAGES))

        # Export results to files
        print("\n" + "=" * 60)
        print("EXPORTING RESULTS TO FILES")
        print("=" * 60)
        files = scraper.export_to_files()

        print("\n" + "=" * 60)
        print("CRAWLING COMPLETED SUCCESSFULLY")
        print("=" * 60)
        print(f"Total pages scraped: {len(results)}")
        print(f"\nAuctions JSON: {files['auctions_json']}")
        print(f"Auctions CSV: {files['auctions_csv']}")
        print(f"Lots JSON: {files['lots_json']}")
        print(f"Lots CSV: {files['lots_csv']}")

        # Count auctions vs lots in the combined results
        auctions = [r for r in results if r.get('type') == 'auction']
        lots = [r for r in results if r.get('type') == 'lot']
        print(f"\n  Auctions: {len(auctions)}")
        print(f"  Lots: {len(lots)}")
    except KeyboardInterrupt:
        print("\nScraping interrupted by user - partial results saved in output directory")
    except Exception as e:
        print(f"\nERROR during scraping: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
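
As the argument handling in main() implies, the script has two entry modes: "python src/main.py" runs the full crawl and export, while "python src/main.py --test [url]" exercises extraction on a single page, presumably falling back to a default URL inside test.py when none is given.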
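
main.py reads five settings from the config module. A minimal sketch of what that module might contain; the attribute names come from the calls above, but every value here is an illustrative assumption, not the project's real configuration:

from pathlib import Path

# All values below are placeholders, not the repository's actual settings.
OFFLINE = False              # True: answer only from database/cache, no network
RATE_LIMIT_SECONDS = 2.0     # delay enforced between every request
CACHE_DB = Path("cache.db")  # SQLite file backing CacheManager
OUTPUT_DIR = Path("output")  # destination for the JSON/CSV exports
MAX_PAGES = 10               # cap on listing pages per crawl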
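
The call scraper.cache.clear_old(max_age_hours=168) suggests CacheManager does time-based eviction against the SQLite cache database. A sketch of how that could look, assuming a hypothetical "pages" table with a "fetched_at" Unix-timestamp column; the real schema in cache.py may differ:

import sqlite3
import time

def clear_old(db_path: str, max_age_hours: int = 168) -> None:
    """Delete cache rows older than max_age_hours (hypothetical schema)."""
    cutoff = time.time() - max_age_hours * 3600
    with sqlite3.connect(db_path) as conn:
        # 'pages' and 'fetched_at' are assumed names, not confirmed by the diff
        conn.execute("DELETE FROM pages WHERE fetched_at < ?", (cutoff,))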