149 lines
4.8 KiB
Python
149 lines
4.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Migrate existing lot data to extract missing enriched fields
|
|
"""
|
|
import sqlite3
|
|
import json
|
|
import re
|
|
from datetime import datetime
|
|
import sys
|
|
sys.path.insert(0, 'src')
|
|
|
|
from graphql_client import extract_enriched_attributes, extract_attributes_from_lot_json
|
|
|
|
# Filesystem path of the SQLite database opened by the migration; the queries
# in this file read its `cache` table and read/update its `lots` table.
DB_PATH = "/mnt/okcomputer/output/cache.db"
|
|
|
|
def migrate_lot_attributes():
    """Re-extract attributes for existing lots from cached lot pages.

    Reads every row of the ``cache`` table whose URL matches ``%/l/%``
    (ordered by ``timestamp`` descending), pulls a lot ID out of the URL and,
    when a row with that ID exists in ``lots``, decompresses the cached page,
    parses its ``__NEXT_DATA__`` JSON payload and rewrites the lot's brand,
    model, attribute, year, condition, serial-number, manufacturer and damage
    columns. Progress and a final coverage summary are printed to stdout.

    Cache entries that cannot be processed are reported and skipped so that
    one bad page does not abort the whole run. The database connection is
    closed on every exit path, and the summary tolerates an empty ``lots``
    table.

    Returns:
        None.

    Raises:
        sqlite3.Error: If the database cannot be queried outside the
            per-page processing (for example, a missing table).
    """
    import zlib
    from contextlib import closing

    print("="*60)
    print("MIGRATING EXISTING LOT DATA")
    print("="*60)

    # Compiled once up front: each pattern is applied to every cached page.
    lot_id_after_marker = re.compile(r'/l/.*?([A-Z]\d+-\d+-\d+)')
    lot_id_anywhere = re.compile(r'([A-Z]\d+-\d+-\d+)')
    next_data_script = re.compile(
        r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', re.DOTALL
    )

    # A sqlite3 connection used as a context manager only commits/rolls back;
    # closing() is what guarantees the handle is released if anything raises.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        # Get cached lot pages
        cursor = conn.execute("""
            SELECT url, content, timestamp
            FROM cache
            WHERE url LIKE '%/l/%'
            ORDER BY timestamp DESC
        """)

        updated_count = 0

        # NOTE(review): if several cached pages map to the same lot, the one
        # with the lowest timestamp is applied last and wins - confirm that
        # this is intended.
        for url, content_blob, _timestamp in cursor:
            try:
                # Get lot_id from URL: prefer an ID that follows "/l/", then
                # fall back to an ID anywhere in the URL.
                lot_id_match = (
                    lot_id_after_marker.search(url)
                    or lot_id_anywhere.search(url)
                )
                if not lot_id_match:
                    continue

                lot_id = lot_id_match.group(1)

                # Check if lot exists in database
                lot_row = conn.execute(
                    "SELECT lot_id, title, description FROM lots WHERE lot_id = ?",
                    (lot_id,),
                ).fetchone()
                if not lot_row:
                    continue

                _, title, description = lot_row

                # Decompress and parse __NEXT_DATA__
                content = zlib.decompress(content_blob).decode('utf-8')
                match = next_data_script.search(content)
                if not match:
                    continue

                data = json.loads(match.group(1))
                lot_json = data.get('props', {}).get('pageProps', {}).get('lot', {})
                if not lot_json:
                    continue

                # Extract basic attributes
                attrs = extract_attributes_from_lot_json(lot_json)

                # Extract enriched attributes
                page_data = {
                    'title': title,
                    'description': description,
                    'brand': attrs.get('brand', ''),
                }
                enriched = extract_enriched_attributes(lot_json, page_data)

                # Merge; enriched values take precedence on key collisions.
                all_attrs = {**attrs, **enriched}

                # Update database
                conn.execute("""
                    UPDATE lots
                    SET brand = ?,
                        model = ?,
                        attributes_json = ?,
                        year_manufactured = ?,
                        condition_score = ?,
                        condition_description = ?,
                        serial_number = ?,
                        manufacturer = ?,
                        damage_description = ?
                    WHERE lot_id = ?
                """, (
                    all_attrs.get('brand', ''),
                    all_attrs.get('model', ''),
                    all_attrs.get('attributes_json', ''),
                    all_attrs.get('year_manufactured'),
                    all_attrs.get('condition_score'),
                    all_attrs.get('condition_description', ''),
                    all_attrs.get('serial_number', ''),
                    all_attrs.get('manufacturer', ''),
                    all_attrs.get('damage_description', ''),
                    lot_id,
                ))

                updated_count += 1
                # Commit in batches of 100 updates.
                if updated_count % 100 == 0:
                    print(f" Processed {updated_count} lots...")
                    conn.commit()

            except Exception as e:
                # Deliberately broad: a single unreadable page is reported
                # and skipped rather than aborting the migration.
                print(f" Error processing {url}: {e}")
                continue

        conn.commit()
        print(f"\n✓ Updated {updated_count} lots with enriched attributes")

        # Show stats
        stats = conn.execute("""
            SELECT
                COUNT(*) as total,
                SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year,
                SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition,
                SUM(CASE WHEN manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer,
                SUM(CASE WHEN brand != '' THEN 1 ELSE 0 END) as has_brand,
                SUM(CASE WHEN model != '' THEN 1 ELSE 0 END) as has_model
            FROM lots
        """).fetchone()

        total = stats[0]
        labels = (
            "Has year",
            "Has condition",
            "Has manufacturer",
            "Has brand",
            "Has model",
        )

        print("\nENRICHMENT STATISTICS:")
        print(f" Total lots: {total:,}")
        for label, raw_count in zip(labels, stats[1:]):
            # SUM() yields NULL when `lots` has no rows; report that as 0.
            count = raw_count or 0
            # Avoid dividing by zero on an empty table.
            share = 100 * count / total if total else 0.0
            print(f" {label}: {count:,} ({share:.1f}%)")
|
|
|
|
|
|
def main():
    """Run the lot-attribute migration, framed by start and completion banners.

    Prints the database path being migrated, calls
    ``migrate_lot_attributes()``, then prints a completion banner.
    """
    rule = '=' * 60

    print("\nStarting migration of existing data...")
    print(f"Database: {DB_PATH}\n")

    migrate_lot_attributes()

    # Blank line before the banner, and another after it.
    print("\n" + rule)
    print("MIGRATION COMPLETE")
    print(rule + "\n")
|
|
|
|
# Run the migration only when this file is executed as a script, so that
# importing it does not touch the database.
if __name__ == "__main__":
    main()
|