enrichment
This commit is contained in:
155
fix_auctions_table.py
Normal file
155
fix_auctions_table.py
Normal file
@@ -0,0 +1,155 @@
|
||||
"""
|
||||
Fix the auctions table by replacing its contents with correct data extracted from cached auction pages.
|
||||
The auctions table currently has wrong auction_ids (numeric instead of displayId).
|
||||
"""
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
|
||||
|
||||
from cache import CacheManager
|
||||
import sqlite3
|
||||
import zlib
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime
|
||||
|
||||
# Captures the JSON payload inside a page's `__NEXT_DATA__` script tag.
_NEXT_DATA_RE = re.compile(
    r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', re.DOTALL
)

# Upper bound on how many per-page failure reasons are echoed in the summary.
_MAX_REPORTED_FAILURES = 5


def _format_location(viewing_days):
    """Return "City, CC" built from the first viewing day, or '' if unavailable."""
    if not (isinstance(viewing_days, list) and viewing_days):
        return ''
    first_day = viewing_days[0]
    if not isinstance(first_day, dict):
        return ''
    # `or ''` also covers keys that are present but explicitly null in the JSON.
    city = first_day.get('city') or ''
    country = (first_day.get('countryCode') or '').upper()
    if city and country:
        return f"{city}, {country}"
    return city or country


def _format_closing_time(min_end_date):
    """Return an ISO timestamp as 'YYYY-MM-DD HH:MM:SS', or the raw value if unparseable."""
    if not min_end_date:
        return ''
    try:
        # fromisoformat() on older Pythons rejects a trailing 'Z', so spell out UTC.
        parsed = datetime.fromisoformat(min_end_date.replace('Z', '+00:00'))
    except (ValueError, TypeError, AttributeError):
        # Keep whatever the page supplied rather than losing the value.
        return min_end_date
    return parsed.strftime('%Y-%m-%d %H:%M:%S')


def _build_auction_row(url, content_blob):
    """Turn one cached page into the parameter tuple for the auctions INSERT.

    Raises:
        ValueError: if the blob cannot be decompressed/decoded, or the page
            carries no usable auction data (the message says which).
    """
    try:
        raw = zlib.decompress(content_blob)
    except (zlib.error, TypeError) as exc:
        raise ValueError(f"cannot decompress cached content: {exc}") from exc

    # UnicodeDecodeError and json.JSONDecodeError are both ValueError subclasses,
    # so they propagate to the caller in the documented form.
    html = raw.decode('utf-8')
    match = _NEXT_DATA_RE.search(html)
    if not match:
        raise ValueError("no __NEXT_DATA__ script tag found")

    data = json.loads(match.group(1))
    props = data.get('props') if isinstance(data, dict) else None
    page_props = props.get('pageProps') if isinstance(props, dict) else None
    auction = page_props.get('auction') if isinstance(page_props, dict) else None
    if not isinstance(auction, dict) or not auction:
        raise ValueError("no auction object in pageProps")

    auction_id = auction.get('displayId')
    if not auction_id:
        raise ValueError("auction has no displayId")

    return (
        auction_id,
        url,
        auction.get('name', ''),
        _format_location(auction.get('viewingDays', [])),
        auction.get('lotCount', 0),
        _format_closing_time(auction.get('minEndDate', '')),
        datetime.now().isoformat(),
    )


def fix_auctions_table():
    """Rebuild the auctions table from cached auction pages.

    Every cache row whose URL contains '/a/' is decompressed, its
    `__NEXT_DATA__` JSON is parsed, and one auctions row keyed by the
    auction's `displayId` is written. Pages that cannot be parsed are counted
    as errors and skipped.

    The DELETE and all INSERTs run in a single transaction: if the run fails,
    or not a single auction could be extracted, the transaction is rolled back
    and the previous table contents are kept. After a successful rebuild the
    function prints a summary and reports lots whose auction_id has no
    matching auctions row.
    """
    cache = CacheManager()
    conn = sqlite3.connect(cache.db_path)
    try:
        cursor = conn.cursor()

        # Clear existing auctions table. Deliberately NOT committed here, so
        # the old rows come back if anything below goes wrong.
        print("Clearing auctions table...")
        cursor.execute("DELETE FROM auctions")

        # Get all auction pages from cache
        cursor.execute("""
            SELECT url, content
            FROM cache
            WHERE url LIKE '%/a/%'
        """)
        auction_pages = cursor.fetchall()
        total = len(auction_pages)
        print(f"Found {total} auction pages in cache")

        inserted = 0
        errors = 0
        failure_samples = []

        print("Extracting auction data from cached pages...")

        for index, (url, content_blob) in enumerate(auction_pages, start=1):
            if index % 10 == 0:
                print(f"Processed {index}/{total}...", end='\r')

            try:
                row = _build_auction_row(url, content_blob)
                cursor.execute("""
                    INSERT OR REPLACE INTO auctions
                    (auction_id, url, title, location, lots_count, first_lot_closing_time, scraped_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                """, row)
            except (ValueError, sqlite3.Error) as exc:
                # Best effort: one bad page must not abort the whole rebuild.
                errors += 1
                if len(failure_samples) < _MAX_REPORTED_FAILURES:
                    failure_samples.append((url, str(exc)))
                continue

            inserted += 1

        if inserted == 0:
            # Committing now would leave the table empty and orphan every lot.
            conn.rollback()
            print("\n\nNo auctions could be extracted; auctions table left unchanged.")
            print(f"Total auction pages processed: {total}")
            print(f"Errors: {errors}")
            for failed_url, reason in failure_samples:
                print(f"  {failed_url}: {reason}")
            return

        conn.commit()

        print("\n\nComplete!")
        print(f"Total auction pages processed: {total}")
        print(f"Auctions inserted: {inserted}")
        print(f"Errors: {errors}")
        if failure_samples:
            print("Sample failures:")
            for failed_url, reason in failure_samples:
                print(f"  {failed_url}: {reason}")

        # Verify fix
        cursor.execute("SELECT COUNT(*) FROM auctions")
        total_auctions = cursor.fetchone()[0]
        print(f"\nTotal auctions in table: {total_auctions}")

        cursor.execute("""
            SELECT COUNT(*) FROM lots
            WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
            AND auction_id != ''
        """)
        orphaned = cursor.fetchone()[0]

        print(f"Orphaned lots remaining: {orphaned}")

        if orphaned == 0:
            print("\nSUCCESS! All lots now have matching auctions!")
        else:
            # Show sample of remaining orphans
            cursor.execute("""
                SELECT lot_id, auction_id FROM lots
                WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
                AND auction_id != ''
                LIMIT 5
            """)
            print("\nSample remaining orphaned lots:")
            for lot_id, auction_id in cursor.fetchall():
                print(f"  {lot_id} -> auction_id: {auction_id}")

            # Show what auction_ids we do have
            cursor.execute("SELECT auction_id FROM auctions LIMIT 10")
            print("\nSample auction_ids in auctions table:")
            for row in cursor.fetchall():
                print(f"  {row[0]}")
    except Exception:
        # Undo the uncommitted DELETE/INSERTs before surfacing the failure.
        conn.rollback()
        raise
    finally:
        conn.close()
|
||||
|
||||
# Script entry point: run the rebuild only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    fix_auctions_table()
|
||||
Reference in New Issue
Block a user