Files
scaev/migrate_existing_data.py
2025-12-07 01:59:45 +01:00

149 lines
4.8 KiB
Python

#!/usr/bin/env python3
"""
Migrate existing lot data to extract missing enriched fields
"""
import json
import re
import sqlite3
import sys
import zlib
from datetime import datetime

sys.path.insert(0, 'src')
from graphql_client import extract_enriched_attributes, extract_attributes_from_lot_json
DB_PATH = "/mnt/okcomputer/output/cache.db"
# Lot-id and __NEXT_DATA__ patterns, compiled once instead of per cached row.
_LOT_ID_PATH_RE = re.compile(r'/l/.*?([A-Z]\d+-\d+-\d+)')
_LOT_ID_ANY_RE = re.compile(r'([A-Z]\d+-\d+-\d+)')
_NEXT_DATA_RE = re.compile(
    r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', re.DOTALL
)


def _extract_lot_id(url):
    """Return the lot id (e.g. 'A1-23-456') embedded in *url*, or None.

    Prefers an id that follows a '/l/' path segment; falls back to the
    first id-shaped token anywhere in the URL.
    """
    match = _LOT_ID_PATH_RE.search(url) or _LOT_ID_ANY_RE.search(url)
    return match.group(1) if match else None


def _extract_lot_json(content_blob):
    """Decompress a cached page blob and return its __NEXT_DATA__ lot payload.

    Returns an empty dict when the page has no __NEXT_DATA__ script tag or
    the script carries no 'lot' object.

    Raises:
        zlib.error / UnicodeDecodeError / json.JSONDecodeError on corrupt
        cache entries (caught per-row by the caller).
    """
    content = zlib.decompress(content_blob).decode('utf-8')
    match = _NEXT_DATA_RE.search(content)
    if not match:
        return {}
    data = json.loads(match.group(1))
    return data.get('props', {}).get('pageProps', {}).get('lot', {}) or {}


def _print_stats(conn):
    """Print enrichment-coverage statistics over the whole lots table."""
    cursor = conn.execute("""
        SELECT
            COUNT(*) as total,
            SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year,
            SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition,
            SUM(CASE WHEN manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer,
            SUM(CASE WHEN brand != '' THEN 1 ELSE 0 END) as has_brand,
            SUM(CASE WHEN model != '' THEN 1 ELSE 0 END) as has_model
        FROM lots
    """)
    stats = cursor.fetchone()
    total = stats[0]
    print(f"\nENRICHMENT STATISTICS:")
    print(f" Total lots: {total:,}")
    if not total:
        # Empty table: the SUM(...) columns are NULL and the percentage
        # expressions below would divide by zero — nothing more to report.
        return
    labels = ("Has year", "Has condition", "Has manufacturer", "Has brand", "Has model")
    for label, count in zip(labels, stats[1:]):
        print(f" {label}: {count:,} ({100*count/total:.1f}%)")


def migrate_lot_attributes(db_path=DB_PATH):
    """Extract enriched attributes from cached lot pages into the lots table.

    Walks every cached '/l/' page, decodes its __NEXT_DATA__ JSON, runs the
    graphql_client extractors, and writes the merged attributes back onto the
    matching row of ``lots``. Per-row failures are logged and skipped so one
    corrupt cache entry cannot abort the migration.

    Args:
        db_path: SQLite database file holding the ``cache`` and ``lots``
            tables (defaults to DB_PATH for backward compatibility).
    """
    print("="*60)
    print("MIGRATING EXISTING LOT DATA")
    print("="*60)
    conn = sqlite3.connect(db_path)
    try:
        # Stream cached lot pages; blobs can be large, so avoid fetchall().
        cursor = conn.execute("""
            SELECT url, content, timestamp
            FROM cache
            WHERE url LIKE '%/l/%'
            ORDER BY timestamp DESC
        """)
        updated_count = 0
        for url, content_blob, _timestamp in cursor:
            try:
                lot_id = _extract_lot_id(url)
                if not lot_id:
                    continue
                # Only migrate pages whose lot already exists in the database.
                lot_row = conn.execute(
                    "SELECT lot_id, title, description FROM lots WHERE lot_id = ?",
                    (lot_id,),
                ).fetchone()
                if not lot_row:
                    continue
                _, title, description = lot_row
                lot_json = _extract_lot_json(content_blob)
                if not lot_json:
                    continue
                # Basic attributes first; enriched extraction may use them.
                attrs = extract_attributes_from_lot_json(lot_json)
                page_data = {
                    'title': title,
                    'description': description,
                    'brand': attrs.get('brand', ''),
                }
                enriched = extract_enriched_attributes(lot_json, page_data)
                # Enriched values win on key collisions.
                all_attrs = {**attrs, **enriched}
                conn.execute("""
                    UPDATE lots
                    SET brand = ?,
                        model = ?,
                        attributes_json = ?,
                        year_manufactured = ?,
                        condition_score = ?,
                        condition_description = ?,
                        serial_number = ?,
                        manufacturer = ?,
                        damage_description = ?
                    WHERE lot_id = ?
                """, (
                    all_attrs.get('brand', ''),
                    all_attrs.get('model', ''),
                    all_attrs.get('attributes_json', ''),
                    all_attrs.get('year_manufactured'),
                    all_attrs.get('condition_score'),
                    all_attrs.get('condition_description', ''),
                    all_attrs.get('serial_number', ''),
                    all_attrs.get('manufacturer', ''),
                    all_attrs.get('damage_description', ''),
                    lot_id,
                ))
                updated_count += 1
                if updated_count % 100 == 0:
                    print(f" Processed {updated_count} lots...")
                    # Periodic commit bounds the amount of work lost on crash.
                    conn.commit()
            except Exception as e:
                # Best-effort migration: log and move on to the next page.
                print(f" Error processing {url}: {e}")
                continue
        conn.commit()
        print(f"\n✓ Updated {updated_count} lots with enriched attributes")
        _print_stats(conn)
    finally:
        # Close even when the migration aborts mid-way.
        conn.close()
def main():
    """Entry point: announce the target database, run the migration, report."""
    print("\nStarting migration of existing data...")
    print(f"Database: {DB_PATH}\n")
    migrate_lot_attributes()
    banner = '=' * 60
    print(f"\n{banner}")
    print("MIGRATION COMPLETE")
    print(f"{banner}\n")


if __name__ == "__main__":
    main()