#!/usr/bin/env python3
"""
Export only NEW auctions/lots that haven't been sent to server yet
Prevents UNIQUE constraint errors on server import

The timestamp of the last successful export is persisted in a small
state file; each run only selects records discovered/scraped after it.
"""

import sqlite3
import json
import csv
from datetime import datetime
from pathlib import Path

# Local scraper cache database and the directory server-bound files land in.
DB_PATH = "C:/mnt/okcomputer/output/cache.db"
OUTPUT_DIR = Path("C:/mnt/okcomputer/output")
# Holds a single Unix timestamp: the moment of the last successful export.
SYNC_STATE_FILE = OUTPUT_DIR / ".server_sync_state"

def get_last_export_timestamp():
    """Return the Unix timestamp of the last successful export, or 0 if never.

    A missing, empty, or corrupted state file is treated as "never exported"
    (returns 0, which triggers a full export) instead of crashing — a broken
    state file previously raised ValueError on every run until hand-deleted.
    """
    if SYNC_STATE_FILE.exists():
        try:
            return int(SYNC_STATE_FILE.read_text().strip())
        except ValueError:
            # Corrupted/empty state file: fall back to "export everything".
            return 0
    return 0

def save_export_timestamp(timestamp: int):
    """Persist *timestamp* (Unix seconds) as the last-successful-export marker."""
    SYNC_STATE_FILE.write_text(str(timestamp))

def _fetch_new(cursor, table: str, ts_column: str, id_column: str, last_export: int):
    """Return rows from *table* newer than *last_export* as a list of dicts.

    When last_export == 0 (first run) ALL rows are returned, including rows
    whose *ts_column* is NULL — a plain ``> 0`` filter would drop those, so
    the unfiltered query is kept as a separate branch.

    *table*/*ts_column*/*id_column* are internal constants, never user input,
    so f-string interpolation into SQL is safe here.
    """
    if last_export == 0:
        cursor.execute(f"SELECT * FROM {table} ORDER BY {id_column}")
    else:
        cursor.execute(
            f"SELECT * FROM {table} WHERE {ts_column} > ? ORDER BY {id_column}",
            (last_export,),
        )
    return [dict(row) for row in cursor.fetchall()]

def _write_records(records, name: str, timestamp: str):
    """Write *records* (non-empty list of dicts) to CSV + JSON and return both paths.

    File names follow the ``{name}_{timestamp}.{csv,json}`` pattern the server
    import expects. CSV columns come from the first record's keys.
    """
    csv_path = OUTPUT_DIR / f'{name}_{timestamp}.csv'
    json_path = OUTPUT_DIR / f'{name}_{timestamp}.json'

    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=records[0].keys())
        writer.writeheader()
        writer.writerows(records)

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    return [csv_path, json_path]

def export_new_data():
    """Export only records that are NEW since last server import.

    Returns a summary dict: {'auctions': int, 'lots': int, 'files': [str]}.
    On success (anything exported) the sync marker is advanced to the time
    captured *before* querying, so rows inserted mid-export are caught by
    the next run rather than lost.
    """
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row

    last_export = get_last_export_timestamp()
    current_time = int(datetime.now().timestamp())

    print("="*60)
    print("INCREMENTAL EXPORT FOR SERVER")
    print("="*60)
    print(f"Last export: {datetime.fromtimestamp(last_export).strftime('%Y-%m-%d %H:%M:%S') if last_export else 'Never (will export ALL)'}")
    print(f"Current time: {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")
    print()

    # Ensure the connection is released even if a query fails (the original
    # leaked it on any exception between connect() and close()).
    try:
        cursor = conn.cursor()
        # New auctions (discovered_at > last_export) and new lots
        # (scraped_at_timestamp > last_export); first run exports everything.
        new_auctions = _fetch_new(cursor, "auctions", "discovered_at", "auction_id", last_export)
        new_lots = _fetch_new(cursor, "lots", "scraped_at_timestamp", "lot_id", last_export)
    finally:
        conn.close()

    # Export to server-ready files
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    files_created = []

    # Export auctions
    if new_auctions:
        auction_files = _write_records(new_auctions, 'auctions', timestamp)
        files_created.extend(auction_files)
        print(f"✓ Exported {len(new_auctions)} auctions")
        print(f"  CSV:  {auction_files[0]}")
        print(f"  JSON: {auction_files[1]}")
    else:
        print("✓ No new auctions to export")

    # Export lots
    if new_lots:
        lot_files = _write_records(new_lots, 'lots', timestamp)
        files_created.extend(lot_files)
        print(f"✓ Exported {len(new_lots)} lots")
        print(f"  CSV:  {lot_files[0]}")
        print(f"  JSON: {lot_files[1]}")
    else:
        print("✓ No new lots to export")

    # Save sync state only when something was actually written, so a run
    # that exports nothing does not silently advance the watermark.
    if new_auctions or new_lots:
        save_export_timestamp(current_time)
        print()
        print("="*60)
        print("EXPORT COMPLETE")
        print("="*60)
        print(f"New auctions: {len(new_auctions)}")
        print(f"New lots: {len(new_lots)}")
        print()
        print("Next export will only include records newer than:")
        print(f"  {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")
    else:
        print()
        print("="*60)
        print("NOTHING TO EXPORT")
        print("="*60)
        print("All data already exported to server")

    return {
        'auctions': len(new_auctions),
        'lots': len(new_lots),
        'files': [str(f) for f in files_created]
    }

if __name__ == "__main__":
    export_new_data()