# Files
# scaev/fix_auctions_table.py
# 2025-12-07 02:20:14 +01:00
#
# 156 lines
# 4.8 KiB
# Python

"""
Fix auctions table by replacing with correct data from cached auction pages.
The auctions table currently has wrong auction_ids (numeric instead of displayId).
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
from cache import CacheManager
import sqlite3
import zlib
import json
import re
from datetime import datetime
# Compiled once: matches the Next.js bootstrap JSON embedded in a cached page.
_NEXT_DATA_RE = re.compile(
    r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', re.DOTALL
)

_INSERT_AUCTION_SQL = """
    INSERT OR REPLACE INTO auctions
    (auction_id, url, title, location, lots_count, first_lot_closing_time, scraped_at)
    VALUES (?, ?, ?, ?, ?, ?, ?)
"""

# How many per-page failures to echo in the final report.
_MAX_REPORTED_FAILURES = 5


def _format_location(viewing_days):
    """Build a "City, CC" string from the first entry of ``viewingDays``.

    Returns '' when the list is missing, empty, or its first entry is not a
    dict. ``None`` values for ``city`` / ``countryCode`` are treated as empty
    instead of raising, so one null field does not discard the whole auction.
    """
    if not isinstance(viewing_days, list) or not viewing_days:
        return ''
    first_day = viewing_days[0]
    if not isinstance(first_day, dict):
        return ''
    city = first_day.get('city') or ''
    country = (first_day.get('countryCode') or '').upper()
    if city and country:
        return f"{city}, {country}"
    return city or country


def _format_closing_time(min_end_date):
    """Format an ISO-8601 timestamp as 'YYYY-MM-DD HH:MM:SS'.

    Returns '' for a falsy input, and the input unchanged when it cannot be
    parsed as an ISO-8601 string.
    """
    if not min_end_date:
        return ''
    try:
        # Rewrite a trailing 'Z' as an explicit UTC offset before parsing.
        parsed = datetime.fromisoformat(min_end_date.replace('Z', '+00:00'))
    except (AttributeError, TypeError, ValueError):
        return min_end_date
    return parsed.strftime('%Y-%m-%d %H:%M:%S')


def _extract_auction_row(url, content_blob):
    """Turn one cached page into the parameter tuple for the auctions INSERT.

    Raises ValueError when the page has no ``__NEXT_DATA__`` script, no
    ``auction`` object, or no ``displayId``. Decompression, decoding and JSON
    errors propagate to the caller.
    """
    html = zlib.decompress(content_blob).decode('utf-8')
    match = _NEXT_DATA_RE.search(html)
    if not match:
        raise ValueError("no __NEXT_DATA__ script found")

    data = json.loads(match.group(1))
    auction = data.get('props', {}).get('pageProps', {}).get('auction', {})
    if not auction:
        raise ValueError("no auction object in pageProps")

    auction_id = auction.get('displayId')
    if not auction_id:
        raise ValueError("auction has no displayId")

    return (
        auction_id,
        url,
        auction.get('name', ''),
        _format_location(auction.get('viewingDays', [])),
        auction.get('lotCount', 0),
        _format_closing_time(auction.get('minEndDate', '')),
        datetime.now().isoformat(),
    )


def fix_auctions_table(db_path=None):
    """Rebuild the auctions table from cached auction pages.

    Deletes every row of ``auctions`` and re-inserts one row per cached page
    whose URL matches ``%/a/%``, keyed by the page's ``displayId``. The delete
    and the inserts run in a single transaction, so an unexpected failure or
    an interrupt rolls back and leaves the previous table contents in place.
    Pages that cannot be parsed are skipped and counted as errors; a sample
    of the reasons is printed. Afterwards the number of lots whose
    ``auction_id`` has no matching auction is reported.

    Args:
        db_path: Path of the SQLite database to repair. When ``None`` (the
            default) the path is taken from ``CacheManager().db_path``.

    Returns:
        None. Progress and results are printed to stdout.
    """
    if db_path is None:
        db_path = CacheManager().db_path

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        inserted = 0
        failures = []

        # `with conn` commits on success and rolls back on any exception, so
        # the table is never left cleared-but-not-rebuilt.
        with conn:
            print("Clearing auctions table...")
            cursor.execute("DELETE FROM auctions")

            cursor.execute("""
                SELECT url, content
                FROM cache
                WHERE url LIKE '%/a/%'
            """)
            auction_pages = cursor.fetchall()
            print(f"Found {len(auction_pages)} auction pages in cache")

            print("Extracting auction data from cached pages...")
            for index, (url, content_blob) in enumerate(auction_pages, start=1):
                if index % 10 == 0:
                    print(f"Processed {index}/{len(auction_pages)}...", end='\r')
                try:
                    cursor.execute(
                        _INSERT_AUCTION_SQL,
                        _extract_auction_row(url, content_blob),
                    )
                except Exception as exc:
                    # Best effort: one bad page must not abort the rebuild,
                    # but keep the reason so it can be reported below.
                    failures.append((url, f"{type(exc).__name__}: {exc}"))
                    continue
                inserted += 1

        print("\n\nComplete!")
        print(f"Total auction pages processed: {len(auction_pages)}")
        print(f"Auctions inserted: {inserted}")
        print(f"Errors: {len(failures)}")
        if failures:
            print("\nSample failures:")
            for url, reason in failures[:_MAX_REPORTED_FAILURES]:
                print(f"  {url}: {reason}")

        # Verify fix
        cursor.execute("SELECT COUNT(*) FROM auctions")
        total_auctions = cursor.fetchone()[0]
        print(f"\nTotal auctions in table: {total_auctions}")

        cursor.execute("""
            SELECT COUNT(*) FROM lots
            WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
            AND auction_id != ''
        """)
        orphaned = cursor.fetchone()[0]
        print(f"Orphaned lots remaining: {orphaned}")

        if orphaned == 0:
            print("\nSUCCESS! All lots now have matching auctions!")
        else:
            # Show sample of remaining orphans
            cursor.execute("""
                SELECT lot_id, auction_id FROM lots
                WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
                AND auction_id != ''
                LIMIT 5
            """)
            print("\nSample remaining orphaned lots:")
            for lot_id, auction_id in cursor.fetchall():
                print(f"  {lot_id} -> auction_id: {auction_id}")

            # Show what auction_ids we do have
            cursor.execute("SELECT auction_id FROM auctions LIMIT 10")
            print("\nSample auction_ids in auctions table:")
            for row in cursor.fetchall():
                print(f"  {row[0]}")
    finally:
        conn.close()
# Script entry point: run the rebuild only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    fix_auctions_table()