#!/usr/bin/env python3
"""
Migration script to re-parse cached HTML pages and update database entries.
Fixes issues with incomplete data extraction from earlier scrapes.
"""

import sqlite3
import sys
import zlib
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from parse import DataParser
from config import CACHE_DB


def reparse_and_update_lots(db_path: str = CACHE_DB, dry_run: bool = False):
    """
    Re-parse cached HTML pages and update lot entries in the database.

    This extracts improved data from __NEXT_DATA__ JSON blobs that may have
    been missed in earlier scraping runs when validation was less strict.
    """
    parser = DataParser()

    with sqlite3.connect(db_path) as conn:
        # Get all cached lot pages, newest first
        cursor = conn.execute("""
            SELECT url, content FROM cache
            WHERE url LIKE '%/l/%'
            ORDER BY timestamp DESC
        """)
        cached_pages = cursor.fetchall()

        print(f"Found {len(cached_pages)} cached lot pages to re-parse")

        stats = {'processed': 0, 'updated': 0, 'skipped': 0, 'errors': 0}

        for url, compressed_content in cached_pages:
            try:
                # Cached page bodies are stored zlib-compressed
                content = zlib.decompress(compressed_content).decode('utf-8')

                # Re-parse using current parser logic
                parsed_data = parser.parse_page(content, url)

                if not parsed_data or parsed_data.get('type') != 'lot':
                    stats['skipped'] += 1
                    continue

                lot_id = parsed_data.get('lot_id', '')
                if not lot_id:
                    print(f"  ⚠️ No lot_id for {url}")
                    stats['skipped'] += 1
                    continue

                # Check whether the lot already exists
                existing = conn.execute(
                    "SELECT lot_id FROM lots WHERE lot_id = ?", (lot_id,)
                ).fetchone()

                if not existing:
                    print(f"  → New lot: {lot_id}")
                    # Insert new lot
                    if not dry_run:
                        conn.execute("""
                            INSERT INTO lots
                                (lot_id, auction_id, url, title, current_bid,
                                 bid_count, closing_time, viewing_time, pickup_date,
                                 location, description, category, scraped_at)
                            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                        """, (
                            lot_id,
                            parsed_data.get('auction_id', ''),
                            url,
                            parsed_data.get('title', ''),
                            parsed_data.get('current_bid', ''),
                            parsed_data.get('bid_count', 0),
                            parsed_data.get('closing_time', ''),
                            parsed_data.get('viewing_time', ''),
                            parsed_data.get('pickup_date', ''),
                            parsed_data.get('location', ''),
                            parsed_data.get('description', ''),
                            parsed_data.get('category', ''),
                            parsed_data.get('scraped_at', ''),
                        ))
                    stats['updated'] += 1
                else:
                    # Update the existing lot with newly parsed data.
                    # COALESCE(NULLIF(?, ''), col) only overwrites a column when
                    # the re-parse produced a non-empty value, so fields that are
                    # already populated are never clobbered with blanks.
                    if not dry_run:
                        conn.execute("""
                            UPDATE lots SET
                                auction_id   = COALESCE(NULLIF(?, ''), auction_id),
                                title        = COALESCE(NULLIF(?, ''), title),
                                current_bid  = COALESCE(NULLIF(?, ''), current_bid),
                                bid_count    = CASE WHEN ? > 0 THEN ? ELSE bid_count END,
                                closing_time = COALESCE(NULLIF(?, ''), closing_time),
                                viewing_time = COALESCE(NULLIF(?, ''), viewing_time),
                                pickup_date  = COALESCE(NULLIF(?, ''), pickup_date),
                                location     = COALESCE(NULLIF(?, ''), location),
                                description  = COALESCE(NULLIF(?, ''), description),
                                category     = COALESCE(NULLIF(?, ''), category)
                            WHERE lot_id = ?
                        """, (
                            parsed_data.get('auction_id', ''),
                            parsed_data.get('title', ''),
                            parsed_data.get('current_bid', ''),
                            parsed_data.get('bid_count', 0),
                            parsed_data.get('bid_count', 0),  # bound twice: CASE test and value
                            parsed_data.get('closing_time', ''),
                            parsed_data.get('viewing_time', ''),
                            parsed_data.get('pickup_date', ''),
                            parsed_data.get('location', ''),
                            parsed_data.get('description', ''),
                            parsed_data.get('category', ''),
                            lot_id,
                        ))
                    stats['updated'] += 1
                    print(f"  ✓ Updated: {lot_id[:20]}")

                # Record images if the page has any
                images = parsed_data.get('images', [])
                if images and not dry_run:
                    for img_url in images:
                        conn.execute("""
                            INSERT OR IGNORE INTO images (lot_id, url)
                            VALUES (?, ?)
                        """, (lot_id, img_url))

                stats['processed'] += 1
                if stats['processed'] % 100 == 0:
                    print(f"  Progress: {stats['processed']}/{len(cached_pages)}")
                    if not dry_run:
                        conn.commit()  # periodic commit so an abort loses little work

            except Exception as e:
                print(f"  ❌ Error processing {url}: {e}")
                stats['errors'] += 1
                continue

        if not dry_run:
            conn.commit()

    print("\n" + "=" * 60)
    print("MIGRATION COMPLETE")
    print("=" * 60)
    print(f"Processed: {stats['processed']}")
    print(f"Updated:   {stats['updated']}")
    print(f"Skipped:   {stats['skipped']}")
    print(f"Errors:    {stats['errors']}")

    if dry_run:
        print("\n⚠️ DRY RUN - No changes were made to the database")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Re-parse and update lot entries from cached HTML"
    )
    parser.add_argument('--db', default=CACHE_DB,
                        help='Path to cache database')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be done without making changes')
    args = parser.parse_args()

    print(f"Database: {args.db}")
    print(f"Dry run: {args.dry_run}")
    print()

    reparse_and_update_lots(args.db, args.dry_run)