enrichment
This commit is contained in:
120
enrich_existing_lots.py
Normal file
120
enrich_existing_lots.py
Normal file
@@ -0,0 +1,120 @@
|
||||
"""
|
||||
Enrich existing lots with new intelligence fields:
|
||||
- followers_count
|
||||
- estimated_min_price / estimated_max_price
|
||||
- lot_condition
|
||||
- appearance
|
||||
|
||||
Lot IDs are read from the database at CacheManager().db_path; the field values are fetched per lot from the GraphQL API (graphql_client).
|
||||
"""
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
|
||||
|
||||
import asyncio
|
||||
from cache import CacheManager
|
||||
import sqlite3
|
||||
import zlib
|
||||
import json
|
||||
import re
|
||||
from graphql_client import fetch_lot_bidding_data, format_bid_data
|
||||
|
||||
async def enrich_existing_lots():
    """Enrich every row in the ``lots`` table with fields from the GraphQL API.

    For each ``lot_id`` found in the database at ``CacheManager().db_path``,
    awaits ``fetch_lot_bidding_data`` and, when it returns data, passes it
    through ``format_bid_data`` and writes ``followers_count``,
    ``estimated_min_price``, ``estimated_max_price``, ``lot_condition`` and
    ``appearance`` back to that row. Progress and a final summary are printed
    to stdout; per-lot failures are reported on stderr and counted, but do
    not stop the run.

    Pending updates are committed every 50 enriched lots and once more when
    the function exits (including on cancellation or an unexpected error),
    and the connection is always closed.
    """
    request_delay = 0.5  # seconds to wait after every API call (rate limit)

    cache = CacheManager()
    conn = sqlite3.connect(cache.db_path)
    try:
        cursor = conn.cursor()

        # Get all lot IDs
        cursor.execute("SELECT lot_id FROM lots")
        lot_ids = [row[0] for row in cursor.fetchall()]
        total = len(lot_ids)

        print(f"Found {total} lots to enrich")
        print("Fetching enrichment data from GraphQL API...")
        print(
            f"This will take ~{total * request_delay / 60:.1f} minutes "
            f"({request_delay}s rate limit)"
        )

        enriched = 0
        failed = 0
        no_data = 0

        for i, lot_id in enumerate(lot_ids):
            if (i + 1) % 10 == 0:
                print(
                    f"Progress: {i+1}/{total} ({enriched} enriched, "
                    f"{no_data} no data, {failed} failed)",
                    end='\r',
                )

            try:
                # Fetch from GraphQL API
                bidding_data = await fetch_lot_bidding_data(lot_id)

                if bidding_data:
                    formatted_data = format_bid_data(bidding_data)

                    # Update lot with new fields
                    cursor.execute(
                        """
                        UPDATE lots
                        SET followers_count = ?,
                            estimated_min_price = ?,
                            estimated_max_price = ?,
                            lot_condition = ?,
                            appearance = ?
                        WHERE lot_id = ?
                        """,
                        (
                            formatted_data.get('followers_count', 0),
                            formatted_data.get('estimated_min_price'),
                            formatted_data.get('estimated_max_price'),
                            formatted_data.get('lot_condition', ''),
                            formatted_data.get('appearance', ''),
                            lot_id,
                        ),
                    )
                    enriched += 1

                    # Commit every 50 lots
                    if enriched % 50 == 0:
                        conn.commit()
                else:
                    no_data += 1
            except Exception as exc:
                # Best effort: one bad lot must not abort a multi-hour run,
                # but the reason is reported instead of silently discarded.
                failed += 1
                print(f"\nLot {lot_id} failed: {exc!r}", file=sys.stderr)

            # Rate limit -- applied after failures too, so a run of errors
            # does not turn into back-to-back requests against the API.
            await asyncio.sleep(request_delay)

        conn.commit()

        print("\n\nComplete!")
        print(f"Total lots: {total}")
        print(f"Enriched: {enriched}")
        print(f"No data: {no_data}")
        print(f"Failed: {failed}")

        # Show statistics
        cursor.execute("SELECT COUNT(*) FROM lots WHERE followers_count > 0")
        with_followers = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM lots WHERE estimated_min_price IS NOT NULL")
        with_estimates = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM lots WHERE lot_condition IS NOT NULL AND lot_condition != ''")
        with_condition = cursor.fetchone()[0]

        print("\nEnrichment statistics:")
        print(f" Lots with followers_count: {with_followers} ({_percent_of(with_followers, total):.1f}%)")
        print(f" Lots with estimated prices: {with_estimates} ({_percent_of(with_estimates, total):.1f}%)")
        print(f" Lots with condition: {with_condition} ({_percent_of(with_condition, total):.1f}%)")
    finally:
        # Keep whatever was written since the last periodic commit, even when
        # the run is interrupted, and never leak the connection.
        try:
            conn.commit()
        finally:
            conn.close()


def _percent_of(part, total):
    """Return ``part`` as a percentage of ``total``; 0.0 when ``total`` is 0."""
    return part / total * 100 if total else 0.0
|
||||
|
||||
if __name__ == "__main__":
    import time

    # Announce the cost of the run and give the operator a short grace
    # period to abort before any API call is made.
    print("WARNING: This will make ~16,800 API calls at 0.5s intervals (~2.3 hours)")
    print("Press Ctrl+C to cancel, or wait 5 seconds to continue...")

    try:
        time.sleep(5)
    except KeyboardInterrupt:
        # Ctrl+C during the grace period: exit cleanly without running.
        print("\nCancelled")
        sys.exit(0)
    else:
        # Grace period elapsed without interruption: start the enrichment.
        asyncio.run(enrich_existing_lots())
|
||||
Reference in New Issue
Block a user