#!/usr/bin/env python3
"""
Migrate existing lot data to extract missing enriched fields.

Reads cached lot detail pages out of the SQLite cache table, re-parses each
page's __NEXT_DATA__ JSON payload, and back-fills the enriched attribute
columns (brand, model, year, condition, serial number, ...) on the matching
rows of the `lots` table.
"""
import sqlite3
import json
import re
import sys
import zlib
from datetime import datetime

sys.path.insert(0, 'src')
from graphql_client import extract_enriched_attributes, extract_attributes_from_lot_json

DB_PATH = "/mnt/okcomputer/output/cache.db"

# Lot ids look like "A123-45-678"; prefer matches anchored on the /l/ path
# segment, falling back to a match anywhere in the URL.
_LOT_ID_IN_PATH = re.compile(r'/l/.*?([A-Z]\d+-\d+-\d+)')
_LOT_ID_ANYWHERE = re.compile(r'([A-Z]\d+-\d+-\d+)')
# BUGFIX: the previous pattern had lost its surrounding <script> tags
# (']*id="__NEXT_DATA__"[^>]*>(.+?)'), so the trailing lazy group captured a
# single character and json.loads() failed on every page.
_NEXT_DATA_RE = re.compile(
    r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', re.DOTALL)


def _extract_lot_id(url):
    """Return the lot id embedded in *url*, or None if no id is present."""
    match = _LOT_ID_IN_PATH.search(url) or _LOT_ID_ANYWHERE.search(url)
    return match.group(1) if match else None


def _parse_lot_json(content_blob):
    """Decompress a cached page blob and return its __NEXT_DATA__ lot dict.

    Returns None when the page has no __NEXT_DATA__ script or no lot payload.
    Raises (zlib.error, UnicodeDecodeError, json.JSONDecodeError) on corrupt
    data; the caller treats those as per-row failures.
    """
    content = zlib.decompress(content_blob).decode('utf-8')
    match = _NEXT_DATA_RE.search(content)
    if not match:
        return None
    data = json.loads(match.group(1))
    return data.get('props', {}).get('pageProps', {}).get('lot', {}) or None


def _print_stats(conn):
    """Print coverage statistics for the enriched columns of `lots`."""
    stats = conn.execute("""
        SELECT COUNT(*) as total,
               SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year,
               SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition,
               SUM(CASE WHEN manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer,
               SUM(CASE WHEN brand != '' THEN 1 ELSE 0 END) as has_brand,
               SUM(CASE WHEN model != '' THEN 1 ELSE 0 END) as has_model
        FROM lots
    """).fetchone()
    total = stats[0]
    print(f"\nENRICHMENT STATISTICS:")
    print(f"  Total lots: {total:,}")
    if not total:
        # BUGFIX: avoid ZeroDivisionError when the lots table is empty.
        return
    labels = ("Has year", "Has condition", "Has manufacturer",
              "Has brand", "Has model")
    for label, count in zip(labels, stats[1:]):
        print(f"  {label}: {count:,} ({100 * count / total:.1f}%)")


def migrate_lot_attributes():
    """Extract attributes from cached lot pages and back-fill the lots table."""
    print("=" * 60)
    print("MIGRATING EXISTING LOT DATA")
    print("=" * 60)

    conn = sqlite3.connect(DB_PATH)
    try:
        # Materialize the candidate rows up front: committing while iterating
        # a live SELECT cursor on the same connection can silently truncate
        # the iteration in sqlite3.
        rows = conn.execute("""
            SELECT url, content, timestamp FROM cache
            WHERE url LIKE '%/l/%'
            ORDER BY timestamp DESC
        """).fetchall()

        updated_count = 0
        for url, content_blob, _timestamp in rows:
            try:
                lot_id = _extract_lot_id(url)
                if not lot_id:
                    continue

                # Only migrate lots that already exist in the database.
                lot_row = conn.execute(
                    "SELECT lot_id, title, description FROM lots WHERE lot_id = ?",
                    (lot_id,)).fetchone()
                if not lot_row:
                    continue
                _, title, description = lot_row

                lot_json = _parse_lot_json(content_blob)
                if not lot_json:
                    continue

                # Basic attributes first; enriched extraction may use the
                # brand discovered in the basic pass.
                attrs = extract_attributes_from_lot_json(lot_json)
                page_data = {'title': title, 'description': description,
                             'brand': attrs.get('brand', '')}
                enriched = extract_enriched_attributes(lot_json, page_data)
                all_attrs = {**attrs, **enriched}  # enriched values win

                conn.execute("""
                    UPDATE lots SET
                        brand = ?, model = ?, attributes_json = ?,
                        year_manufactured = ?, condition_score = ?,
                        condition_description = ?, serial_number = ?,
                        manufacturer = ?, damage_description = ?
                    WHERE lot_id = ?
                """, (
                    all_attrs.get('brand', ''),
                    all_attrs.get('model', ''),
                    all_attrs.get('attributes_json', ''),
                    all_attrs.get('year_manufactured'),
                    all_attrs.get('condition_score'),
                    all_attrs.get('condition_description', ''),
                    all_attrs.get('serial_number', ''),
                    all_attrs.get('manufacturer', ''),
                    all_attrs.get('damage_description', ''),
                    lot_id,
                ))
                updated_count += 1
                if updated_count % 100 == 0:
                    print(f"  Processed {updated_count} lots...")
                    conn.commit()  # checkpoint every 100 rows
            except Exception as e:
                # Best-effort migration: report the row and keep going.
                print(f"  Error processing {url}: {e}")
                continue

        conn.commit()
        print(f"\n✓ Updated {updated_count} lots with enriched attributes")
        _print_stats(conn)
    finally:
        # Close even if the migration aborts partway through.
        conn.close()


def main():
    print("\nStarting migration of existing data...")
    print(f"Database: {DB_PATH}\n")
    migrate_lot_attributes()
    print(f"\n{'=' * 60}")
    print("MIGRATION COMPLETE")
    print(f"{'=' * 60}\n")


if __name__ == "__main__":
    main()