This commit is contained in:
Tour
2025-12-07 16:00:38 +01:00
parent 1b336c49ba
commit fd69faebcc
2 changed files with 142 additions and 1 deletion

141
export_incremental.py Normal file
View File

@@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""
Export only NEW auctions/lots that haven't been sent to server yet
Prevents UNIQUE constraint errors on server import
"""
import sqlite3
import json
import csv
from datetime import datetime
from pathlib import Path
# Local scraper cache database (the source of every export).
DB_PATH = "C:/mnt/okcomputer/output/cache.db"
# Directory that receives the generated CSV/JSON export files.
OUTPUT_DIR = Path("C:/mnt/okcomputer/output")
# Hidden marker file holding the Unix timestamp of the last successful export.
SYNC_STATE_FILE = OUTPUT_DIR / ".server_sync_state"
def get_last_export_timestamp(state_file=None) -> int:
    """Return the Unix timestamp of the last successful export to the server.

    Args:
        state_file: Optional ``Path`` overriding the sync-state file
            location; defaults to the module-level SYNC_STATE_FILE.

    Returns:
        The stored timestamp, or 0 when no export has happened yet or the
        state file is missing or unreadable (which forces a full export).
    """
    if state_file is None:
        state_file = SYNC_STATE_FILE
    if not state_file.exists():
        return 0
    try:
        return int(state_file.read_text().strip())
    except ValueError:
        # Corrupt or empty state file: fall back to a full re-export
        # instead of crashing the sync run.
        return 0
def save_export_timestamp(timestamp: int, state_file=None) -> None:
    """Persist *timestamp* as the last-successful-export marker.

    Args:
        timestamp: Unix timestamp (seconds) of the completed export.
        state_file: Optional ``Path`` overriding the sync-state file
            location; defaults to the module-level SYNC_STATE_FILE.
    """
    target = SYNC_STATE_FILE if state_file is None else state_file
    target.write_text(str(timestamp))
def _fetch_new_rows(cursor, table: str, ts_column: str, id_column: str, last_export: int):
    """Return rows from *table* newer than *last_export* (all rows when 0).

    Table and column names are internal constants (never user input), so
    interpolating them into the SQL is safe here; the timestamp is still
    bound as a proper query parameter.
    """
    if last_export == 0:
        # First run: export everything.
        cursor.execute(f"SELECT * FROM {table} ORDER BY {id_column}")
    else:
        # Subsequent runs: only records added since the last export.
        cursor.execute(
            f"SELECT * FROM {table} WHERE {ts_column} > ? ORDER BY {id_column}",
            (last_export,),
        )
    return [dict(row) for row in cursor.fetchall()]


def _export_records(records, label: str, timestamp: str):
    """Write *records* to '{label}_{timestamp}.csv' and '.json' in OUTPUT_DIR.

    Returns the list of created file paths (empty when *records* is empty).
    """
    if not records:
        print(f"✓ No new {label} to export")
        return []
    csv_path = OUTPUT_DIR / f'{label}_{timestamp}.csv'
    json_path = OUTPUT_DIR / f'{label}_{timestamp}.json'
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=records[0].keys())
        writer.writeheader()
        writer.writerows(records)
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, indent=2, ensure_ascii=False)
    print(f"✓ Exported {len(records)} {label}")
    print(f"  CSV: {csv_path}")
    print(f"  JSON: {json_path}")
    return [csv_path, json_path]


def export_new_data():
    """Export only auctions/lots that are NEW since the last server import.

    Queries the local cache DB for records whose discovery/scrape timestamp
    is newer than the saved sync marker, writes them to timestamped CSV and
    JSON files, and advances the marker only when something was exported.

    Returns:
        dict with 'auctions' (count), 'lots' (count) and 'files'
        (created file paths as strings).
    """
    last_export = get_last_export_timestamp()
    # Captured BEFORE the queries run: a record inserted while we export may
    # be sent twice (its timestamp can still be <= current_time next run),
    # but none is ever silently skipped. The server import must tolerate
    # duplicates -- TODO confirm with the import side.
    current_time = int(datetime.now().timestamp())

    print("=" * 60)
    print("INCREMENTAL EXPORT FOR SERVER")
    print("=" * 60)
    print(f"Last export: {datetime.fromtimestamp(last_export).strftime('%Y-%m-%d %H:%M:%S') if last_export else 'Never (will export ALL)'}")
    print(f"Current time: {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")
    print()

    conn = sqlite3.connect(DB_PATH)
    try:
        conn.row_factory = sqlite3.Row  # rows convert cleanly via dict(row)
        cursor = conn.cursor()
        new_auctions = _fetch_new_rows(cursor, "auctions", "discovered_at", "auction_id", last_export)
        new_lots = _fetch_new_rows(cursor, "lots", "scraped_at_timestamp", "lot_id", last_export)
    finally:
        # Close even when a query raises (the original leaked the connection
        # on error).
        conn.close()

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    files_created = []
    files_created.extend(_export_records(new_auctions, 'auctions', timestamp))
    files_created.extend(_export_records(new_lots, 'lots', timestamp))

    if new_auctions or new_lots:
        # Advance the sync marker only when something was actually written.
        save_export_timestamp(current_time)
        print()
        print("=" * 60)
        print("EXPORT COMPLETE")
        print("=" * 60)
        print(f"New auctions: {len(new_auctions)}")
        print(f"New lots: {len(new_lots)}")
        print()
        print("Next export will only include records newer than:")
        print(f"  {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")
    else:
        print()
        print("=" * 60)
        print("NOTHING TO EXPORT")
        print("=" * 60)
        print("All data already exported to server")

    return {
        'auctions': len(new_auctions),
        'lots': len(new_lots),
        'files': [str(f) for f in files_created],
    }


if __name__ == "__main__":
    export_new_data()

View File

@@ -40,7 +40,7 @@ class TroostwijkScraper:
self.download_images = DOWNLOAD_IMAGES
async def _download_image(self, session: 'aiohttp.ClientSession', url: str, lot_id: str, index: int) -> Optional[str]:
"""Download an image and save it locally (without rate limiting - concurrent within lot)"""
"""Download an image and save it locally (without rate limiting - concurrent within lot)"""
if not self.download_images:
return None