export_incremental.py (new file, 141 lines)

@@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""
Export only NEW auctions/lots that haven't been sent to the server yet.
Prevents UNIQUE constraint errors on server import.
"""

import sqlite3
import json
import csv
from datetime import datetime
from pathlib import Path

DB_PATH = "C:/mnt/okcomputer/output/cache.db"
OUTPUT_DIR = Path("C:/mnt/okcomputer/output")
SYNC_STATE_FILE = OUTPUT_DIR / ".server_sync_state"

def get_last_export_timestamp():
    """Get the timestamp of the last successful export to the server."""
    if SYNC_STATE_FILE.exists():
        return int(SYNC_STATE_FILE.read_text().strip())
    return 0

def save_export_timestamp(timestamp: int):
    """Save the timestamp of a successful export."""
    SYNC_STATE_FILE.write_text(str(timestamp))

def export_new_data():
    """Export only records that are NEW since the last server import."""
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()

    last_export = get_last_export_timestamp()
    current_time = int(datetime.now().timestamp())

    print("=" * 60)
    print("INCREMENTAL EXPORT FOR SERVER")
    print("=" * 60)
    print(f"Last export: {datetime.fromtimestamp(last_export).strftime('%Y-%m-%d %H:%M:%S') if last_export else 'Never (will export ALL)'}")
    print(f"Current time: {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")
    print()

    # Get new auctions (discovered_at > last_export)
    if last_export == 0:
        # First run: export all
        cursor.execute("SELECT * FROM auctions ORDER BY auction_id")
    else:
        # Subsequent runs: only new ones
        cursor.execute("""
            SELECT * FROM auctions
            WHERE discovered_at > ?
            ORDER BY auction_id
        """, (last_export,))

    new_auctions = [dict(row) for row in cursor.fetchall()]

    # Get new lots (scraped_at_timestamp > last_export)
    if last_export == 0:
        cursor.execute("SELECT * FROM lots ORDER BY lot_id")
    else:
        cursor.execute("""
            SELECT * FROM lots
            WHERE scraped_at_timestamp > ?
            ORDER BY lot_id
        """, (last_export,))

    new_lots = [dict(row) for row in cursor.fetchall()]

    conn.close()

    # Export to server-ready files
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    files_created = []

    # Export auctions
    if new_auctions:
        auctions_csv = OUTPUT_DIR / f'auctions_{timestamp}.csv'
        auctions_json = OUTPUT_DIR / f'auctions_{timestamp}.json'

        with open(auctions_csv, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=new_auctions[0].keys())
            writer.writeheader()
            writer.writerows(new_auctions)

        with open(auctions_json, 'w', encoding='utf-8') as f:
            json.dump(new_auctions, f, indent=2, ensure_ascii=False)

        files_created.extend([auctions_csv, auctions_json])
        print(f"✓ Exported {len(new_auctions)} auctions")
        print(f" CSV: {auctions_csv}")
        print(f" JSON: {auctions_json}")
    else:
        print("✓ No new auctions to export")

    # Export lots
    if new_lots:
        lots_csv = OUTPUT_DIR / f'lots_{timestamp}.csv'
        lots_json = OUTPUT_DIR / f'lots_{timestamp}.json'

        with open(lots_csv, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=new_lots[0].keys())
            writer.writeheader()
            writer.writerows(new_lots)

        with open(lots_json, 'w', encoding='utf-8') as f:
            json.dump(new_lots, f, indent=2, ensure_ascii=False)

        files_created.extend([lots_csv, lots_json])
        print(f"✓ Exported {len(new_lots)} lots")
        print(f" CSV: {lots_csv}")
        print(f" JSON: {lots_json}")
    else:
        print("✓ No new lots to export")

    # Save sync state
    if new_auctions or new_lots:
        save_export_timestamp(current_time)
        print()
        print("=" * 60)
        print("EXPORT COMPLETE")
        print("=" * 60)
        print(f"New auctions: {len(new_auctions)}")
        print(f"New lots: {len(new_lots)}")
        print()
        print("Next export will only include records newer than:")
        print(f" {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")
    else:
        print()
        print("=" * 60)
        print("NOTHING TO EXPORT")
        print("=" * 60)
        print("All data already exported to server")

    return {
        'auctions': len(new_auctions),
        'lots': len(new_lots),
        'files': [str(f) for f in files_created]
    }

if __name__ == "__main__":
    export_new_data()
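The docstring's claim about preventing UNIQUE constraint errors assumes the server rejects rows whose keys it has already imported. The receiving side can be made tolerant of overlap as well; the following is a minimal sketch, assuming the server also uses SQLite with the same table names, that loads an exported JSON file with INSERT OR IGNORE so re-sent rows are skipped instead of raising an error (import_auctions and its schema are illustrative, not part of this commit):

import json
import sqlite3

def import_auctions(db_path: str, json_path: str) -> int:
    """Sketch: insert exported auction rows, silently skipping ones that already exist."""
    with open(json_path, encoding='utf-8') as f:
        rows = json.load(f)  # the export writes a JSON array of row dicts
    conn = sqlite3.connect(db_path)
    inserted = 0
    for row in rows:
        cols = ', '.join(row.keys())
        placeholders = ', '.join('?' for _ in row)
        # INSERT OR IGNORE skips rows whose key already exists instead of
        # raising a UNIQUE constraint error (assumes auction_id is the key).
        cur = conn.execute(
            f"INSERT OR IGNORE INTO auctions ({cols}) VALUES ({placeholders})",
            tuple(row.values()),
        )
        inserted += cur.rowcount
    conn.commit()
    conn.close()
    return inserted

With an import like this on the server, re-running the exporter after a failed sync would not break the import either way.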
@@ -40,7 +40,7 @@ class TroostwijkScraper:
        self.download_images = DOWNLOAD_IMAGES

    async def _download_image(self, session: 'aiohttp.ClientSession', url: str, lot_id: str, index: int) -> Optional[str]:
        """Download an image and save it locally (without rate limiting - concurrent within lot)"""
        if not self.download_images:
            return None
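For context on the "(without rate limiting - concurrent within lot)" note: images belonging to one lot are fetched in parallel, while any pacing between lots happens in the caller. A minimal sketch of that pattern, assuming aiohttp and the _download_image signature shown above (download_lot_images is a hypothetical helper, not the scraper's code, which may also reuse a longer-lived session):

import asyncio
from typing import List, Optional

import aiohttp

async def download_lot_images(scraper, lot_id: str, urls: List[str]) -> List[Optional[str]]:
    """Sketch: download every image for a single lot concurrently."""
    async with aiohttp.ClientSession() as session:
        # One task per image; gather runs them concurrently within this lot.
        # Rate limiting between lots stays outside this helper.
        tasks = [
            scraper._download_image(session, url, lot_id, index)
            for index, url in enumerate(urls)
        ]
        return await asyncio.gather(*tasks)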