#!/usr/bin/env python3
"""
Migration script to re-parse cached HTML pages and update database entries.
Fixes issues with incomplete data extraction from earlier scrapes.
"""
import sys
import sqlite3
import zlib
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from parse import DataParser
from config import CACHE_DB
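
# NOTE: assumed storage layout, inferred from the queries in this script
# (the real schema may contain additional columns):
#   cache(url, content, timestamp)   -- content holds zlib-compressed HTML
#   lots(lot_id, auction_id, url, title, current_bid, bid_count, closing_time,
#        viewing_time, pickup_date, location, description, category, scraped_at)
#   images(lot_id, url)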


def reparse_and_update_lots(db_path: str = CACHE_DB, dry_run: bool = False):
    """
    Re-parse cached HTML pages and update lot entries in the database.

    This extracts improved data from __NEXT_DATA__ JSON blobs that may have been
    missed in earlier scraping runs when validation was less strict.
    """
    parser = DataParser()

    with sqlite3.connect(db_path) as conn:
        # Get all cached lot pages
        cursor = conn.execute("""
            SELECT url, content
            FROM cache
            WHERE url LIKE '%/l/%'
            ORDER BY timestamp DESC
        """)

        cached_pages = cursor.fetchall()
        print(f"Found {len(cached_pages)} cached lot pages to re-parse")

        stats = {
            'processed': 0,
            'updated': 0,
            'skipped': 0,
            'errors': 0
        }

        for url, compressed_content in cached_pages:
            try:
                # Decompress cached content (stored as zlib-compressed UTF-8 HTML)
                content = zlib.decompress(compressed_content).decode('utf-8')

                # Re-parse using current parser logic
                parsed_data = parser.parse_page(content, url)

                if not parsed_data or parsed_data.get('type') != 'lot':
                    stats['skipped'] += 1
                    continue

                lot_id = parsed_data.get('lot_id', '')
                if not lot_id:
                    print(f"  ⚠️ No lot_id for {url}")
                    stats['skipped'] += 1
                    continue

                # Check if lot exists
                existing = conn.execute(
                    "SELECT lot_id FROM lots WHERE lot_id = ?",
                    (lot_id,)
                ).fetchone()

                if not existing:
                    print(f"  → New lot: {lot_id}")
                    # Insert new lot
                    if not dry_run:
                        conn.execute("""
                            INSERT INTO lots
                                (lot_id, auction_id, url, title, current_bid, bid_count,
                                 closing_time, viewing_time, pickup_date, location,
                                 description, category, scraped_at)
                            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                        """, (
                            lot_id,
                            parsed_data.get('auction_id', ''),
                            url,
                            parsed_data.get('title', ''),
                            parsed_data.get('current_bid', ''),
                            parsed_data.get('bid_count', 0),
                            parsed_data.get('closing_time', ''),
                            parsed_data.get('viewing_time', ''),
                            parsed_data.get('pickup_date', ''),
                            parsed_data.get('location', ''),
                            parsed_data.get('description', ''),
                            parsed_data.get('category', ''),
                            parsed_data.get('scraped_at', '')
                        ))
                    stats['updated'] += 1
                else:
                    # Update existing lot with newly parsed data.
                    # Overwrite a field only when the re-parsed value is non-empty;
                    # otherwise keep whatever is already stored.
                    if not dry_run:
                        conn.execute("""
                            UPDATE lots SET
                                auction_id = COALESCE(NULLIF(?, ''), auction_id),
                                title = COALESCE(NULLIF(?, ''), title),
                                current_bid = COALESCE(NULLIF(?, ''), current_bid),
                                bid_count = CASE WHEN ? > 0 THEN ? ELSE bid_count END,
                                closing_time = COALESCE(NULLIF(?, ''), closing_time),
                                viewing_time = COALESCE(NULLIF(?, ''), viewing_time),
                                pickup_date = COALESCE(NULLIF(?, ''), pickup_date),
                                location = COALESCE(NULLIF(?, ''), location),
                                description = COALESCE(NULLIF(?, ''), description),
                                category = COALESCE(NULLIF(?, ''), category)
                            WHERE lot_id = ?
                        """, (
                            parsed_data.get('auction_id', ''),
                            parsed_data.get('title', ''),
                            parsed_data.get('current_bid', ''),
                            parsed_data.get('bid_count', 0),
                            parsed_data.get('bid_count', 0),
                            parsed_data.get('closing_time', ''),
                            parsed_data.get('viewing_time', ''),
                            parsed_data.get('pickup_date', ''),
                            parsed_data.get('location', ''),
                            parsed_data.get('description', ''),
                            parsed_data.get('category', ''),
                            lot_id
                        ))
                    stats['updated'] += 1
                    print(f"  ✓ Updated: {lot_id[:20]}")

                # Update images if they exist
                images = parsed_data.get('images', [])
                if images and not dry_run:
                    for img_url in images:
                        conn.execute("""
                            INSERT OR IGNORE INTO images (lot_id, url)
                            VALUES (?, ?)
                        """, (lot_id, img_url))

                stats['processed'] += 1

                # Report progress and commit in batches of 100
                if stats['processed'] % 100 == 0:
                    print(f"  Progress: {stats['processed']}/{len(cached_pages)}")
                    if not dry_run:
                        conn.commit()

            except Exception as e:
                print(f"  ❌ Error processing {url}: {e}")
                stats['errors'] += 1
                continue

        if not dry_run:
            conn.commit()

    print("\n" + "="*60)
    print("MIGRATION COMPLETE")
    print("="*60)
    print(f"Processed: {stats['processed']}")
    print(f"Updated: {stats['updated']}")
    print(f"Skipped: {stats['skipped']}")
    print(f"Errors: {stats['errors']}")

    if dry_run:
        print("\n⚠️ DRY RUN - No changes were made to the database")
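

# Example invocations (the script filename is illustrative; --db and --dry-run
# are the flags defined below):
#   python reparse_lots.py --dry-run
#   python reparse_lots.py --db /path/to/cache.db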
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Re-parse and update lot entries from cached HTML")
    parser.add_argument('--db', default=CACHE_DB, help='Path to cache database')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')

    args = parser.parse_args()

    print(f"Database: {args.db}")
    print(f"Dry run: {args.dry_run}")
    print()

    reparse_and_update_lots(args.db, args.dry_run)