Initial

src/main.py (new file, 83 lines)
@@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""
Scaev Auctions Scraper - Main Entry Point

Focuses on extracting auction lots with caching and rate limiting.
"""
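
# Usage (inferred from the argument handling in main() below):
#   python src/main.py               -> full crawl with caching and rate limiting
#   python src/main.py --test [URL]  -> run test_extraction() on a single page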

import sys
import asyncio

import config
from scraper import TroostwijkScraper

def main():
    """Main execution"""
    # Check for test mode
    if len(sys.argv) > 1 and sys.argv[1] == "--test":
        # Import test function only when needed to avoid circular imports
        from test import test_extraction
        test_url = sys.argv[2] if len(sys.argv) > 2 else None
        if test_url:
            test_extraction(test_url)
        else:
            test_extraction()
        return

    print("Scaev Auctions Scraper")
    print("=" * 60)
    if config.OFFLINE:
        print("OFFLINE MODE ENABLED - only database and cache will be used (no network)")
    print(f"Rate limit: {config.RATE_LIMIT_SECONDS} seconds BETWEEN EVERY REQUEST")
    print(f"Cache database: {config.CACHE_DB}")
    print(f"Output directory: {config.OUTPUT_DIR}")
    print(f"Max listing pages: {config.MAX_PAGES}")
    print("=" * 60)

    scraper = TroostwijkScraper()

    try:
        # Clear cache entries older than 7 days (168 hours) to keep the database clean
        scraper.cache.clear_old(max_age_hours=168)

        # Run the crawler
        results = asyncio.run(scraper.crawl_auctions(max_pages=config.MAX_PAGES))
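
        # Note: each entry in 'results' is expected to carry a 'type' field
        # ('auction' or 'lot'); the summary counts at the end rely on it.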

        # Export results to files
        print("\n" + "=" * 60)
        print("EXPORTING RESULTS TO FILES")
        print("=" * 60)

        files = scraper.export_to_files()
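        # 'files' maps export names to the paths written by export_to_files();
        # the keys used below are assumed to match that return value.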

        print("\n" + "=" * 60)
        print("CRAWLING COMPLETED SUCCESSFULLY")
        print("=" * 60)
        print(f"Total pages scraped: {len(results)}")
        print(f"\nAuctions JSON: {files['auctions_json']}")
        print(f"Auctions CSV: {files['auctions_csv']}")
        print(f"Lots JSON: {files['lots_json']}")
        print(f"Lots CSV: {files['lots_csv']}")

        # Count auctions vs lots
        auctions = [r for r in results if r.get('type') == 'auction']
        lots = [r for r in results if r.get('type') == 'lot']
        print(f"\n  Auctions: {len(auctions)}")
        print(f"  Lots: {len(lots)}")

    except KeyboardInterrupt:
        print("\nScraping interrupted by user - partial results saved in output directory")
    except Exception as e:
        print(f"\nERROR during scraping: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
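
For reference, main() assumes a local config module exposing five settings. A
minimal sketch, inferred only from the attribute names used above (every value
below is a hypothetical placeholder, not taken from the repository):

    # config.py - hypothetical placeholder values; only the names OFFLINE,
    # RATE_LIMIT_SECONDS, CACHE_DB, OUTPUT_DIR and MAX_PAGES come from
    # main.py's usage
    OFFLINE = False             # True: use only database and cache, no network
    RATE_LIMIT_SECONDS = 5      # delay enforced between every request
    CACHE_DB = "cache.db"       # path to the cache database file
    OUTPUT_DIR = "output"       # directory for the JSON/CSV exports
    MAX_PAGES = 10              # upper bound passed to crawl_auctions()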