enrich data
This commit is contained in:
148
migrate_existing_data.py
Normal file
148
migrate_existing_data.py
Normal file
@@ -0,0 +1,148 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Migrate existing lot data to extract missing enriched fields
|
||||
"""
|
||||
import sqlite3
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime
|
||||
import sys
|
||||
sys.path.insert(0, 'src')
|
||||
|
||||
from graphql_client import extract_enriched_attributes, extract_attributes_from_lot_json
|
||||
|
||||
DB_PATH = "/mnt/okcomputer/output/cache.db"
|
||||
|
||||
# Compiled once at import time; all three are applied to every cached row.
_LOT_ID_AFTER_L_RE = re.compile(r'/l/.*?([A-Z]\d+-\d+-\d+)')
_LOT_ID_ANYWHERE_RE = re.compile(r'([A-Z]\d+-\d+-\d+)')
_NEXT_DATA_RE = re.compile(
    r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', re.DOTALL
)


def _lot_id_from_url(url):
    """Return the lot id found in *url*, or None when the URL carries none."""
    # Prefer an id that follows the '/l/' path segment; fall back to an id
    # anywhere in the URL.
    match = _LOT_ID_AFTER_L_RE.search(url) or _LOT_ID_ANYWHERE_RE.search(url)
    return match.group(1) if match else None


def _lot_json_from_blob(content_blob):
    """Return ``props.pageProps.lot`` from a zlib-compressed page, or None.

    None is returned when the page has no ``__NEXT_DATA__`` script tag or
    when the lot object is missing/empty. Decompression, decoding and JSON
    errors propagate to the caller.
    """
    import zlib  # local import: zlib is not imported at file level

    content = zlib.decompress(content_blob).decode('utf-8')
    match = _NEXT_DATA_RE.search(content)
    if not match:
        return None

    data = json.loads(match.group(1))
    return data.get('props', {}).get('pageProps', {}).get('lot', {}) or None


def _print_enrichment_stats(conn):
    """Print how many rows of ``lots`` have each enriched field populated."""
    stats = conn.execute("""
        SELECT
            COUNT(*) as total,
            SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year,
            SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition,
            SUM(CASE WHEN manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer,
            SUM(CASE WHEN brand != '' THEN 1 ELSE 0 END) as has_brand,
            SUM(CASE WHEN model != '' THEN 1 ELSE 0 END) as has_model
        FROM lots
    """).fetchone()

    total = stats[0] or 0
    print(f"\nENRICHMENT STATISTICS:")
    print(f" Total lots: {total:,}")

    labels = ("Has year", "Has condition", "Has manufacturer", "Has brand", "Has model")
    for label, raw_count in zip(labels, stats[1:]):
        # SUM() yields NULL (None) when the table is empty; report that as 0
        # and avoid dividing by a zero total.
        count = raw_count or 0
        percent = 100 * count / total if total else 0.0
        print(f" {label}: {count:,} ({percent:.1f}%)")


def migrate_lot_attributes():
    """Backfill enriched attribute columns of ``lots`` from cached lot pages.

    Reads every ``cache`` row whose URL contains ``/l/`` (newest first),
    derives the lot id from the URL, and for lots that exist in ``lots``
    parses the page's ``__NEXT_DATA__`` JSON. The results of
    ``extract_attributes_from_lot_json`` and ``extract_enriched_attributes``
    are merged (enriched values win on key clashes) and written back.

    Each lot is updated at most once, from the newest cached page that
    yields data, so an older snapshot cached under another URL cannot
    overwrite newer values. A failure on a single page is reported and
    skipped; the migration continues with the next row.

    Progress, the final update count and enrichment statistics are printed
    to stdout. The database connection is closed even if the run aborts.
    """
    print("="*60)
    print("MIGRATING EXISTING LOT DATA")
    print("="*60)

    conn = sqlite3.connect(DB_PATH)
    try:
        # Get cached lot pages
        cursor = conn.execute("""
            SELECT url, content, timestamp
            FROM cache
            WHERE url LIKE '%/l/%'
            ORDER BY timestamp DESC
        """)

        updated_count = 0
        migrated_lot_ids = set()

        for url, content_blob, _timestamp in cursor:
            try:
                lot_id = _lot_id_from_url(url)
                if lot_id is None or lot_id in migrated_lot_ids:
                    continue

                # Check if lot exists in database
                lot_row = conn.execute(
                    "SELECT lot_id, title, description FROM lots WHERE lot_id = ?",
                    (lot_id,),
                ).fetchone()
                if not lot_row:
                    continue

                _, title, description = lot_row

                # Decompress and parse __NEXT_DATA__
                lot_json = _lot_json_from_blob(content_blob)
                if not lot_json:
                    continue

                # Extract basic attributes
                attrs = extract_attributes_from_lot_json(lot_json)

                # Extract enriched attributes
                page_data = {
                    'title': title,
                    'description': description,
                    'brand': attrs.get('brand', ''),
                }
                enriched = extract_enriched_attributes(lot_json, page_data)

                # Merge: enriched values take precedence over basic ones
                all_attrs = {**attrs, **enriched}

                # Update database
                conn.execute("""
                    UPDATE lots
                    SET brand = ?,
                        model = ?,
                        attributes_json = ?,
                        year_manufactured = ?,
                        condition_score = ?,
                        condition_description = ?,
                        serial_number = ?,
                        manufacturer = ?,
                        damage_description = ?
                    WHERE lot_id = ?
                """, (
                    all_attrs.get('brand', ''),
                    all_attrs.get('model', ''),
                    all_attrs.get('attributes_json', ''),
                    all_attrs.get('year_manufactured'),
                    all_attrs.get('condition_score'),
                    all_attrs.get('condition_description', ''),
                    all_attrs.get('serial_number', ''),
                    all_attrs.get('manufacturer', ''),
                    all_attrs.get('damage_description', ''),
                    lot_id,
                ))

                # Mark only after a successful update so that a broken newest
                # page still lets an older page for the same lot be used.
                migrated_lot_ids.add(lot_id)
                updated_count += 1
                if updated_count % 100 == 0:
                    print(f" Processed {updated_count} lots...")
                    conn.commit()  # commit in batches of 100

            except Exception as e:
                # Best-effort migration: report the bad page and keep going.
                print(f" Error processing {url}: {e}")
                continue

        conn.commit()
        print(f"\n✓ Updated {updated_count} lots with enriched attributes")

        # Show stats
        _print_enrichment_stats(conn)
    finally:
        conn.close()
|
||||
|
||||
|
||||
def main():
    """Run the lot-data migration, framed by start and completion banners."""
    rule = "=" * 60

    print("\nStarting migration of existing data...")
    print(f"Database: {DB_PATH}\n")

    migrate_lot_attributes()

    # Closing banner: blank line, rule, message, rule, blank line.
    print("\n" + rule)
    print("MIGRATION COMPLETE")
    print(rule + "\n")
|
||||
|
||||
# Run the migration only when this file is executed as a script, so that
# importing the module has no side effects on the database.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user