enrich data

This commit is contained in:
Tour
2025-12-07 01:59:45 +01:00
parent d09ee5574f
commit 08bf112c3f
9 changed files with 1750 additions and 32 deletions

306
validate_data.py Normal file
View File

@@ -0,0 +1,306 @@
"""
Validate data quality and completeness in the database.
Checks if scraped data matches expectations and API capabilities.
"""
import os
import sqlite3
import sys
from datetime import datetime
from typing import Dict, List, Optional, Tuple

# Make the project's ``src`` directory importable before loading local modules.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
from cache import CacheManager
# Cache manager instance created at import time; in this file only its
# ``db_path`` attribute is read.
cache = CacheManager()
# Path of the SQLite database that the validation queries below connect to.
DB_PATH = cache.db_path
def _counts(keys, row):
    """Pair result labels with one aggregate row, mapping SQL NULL to 0.

    SQLite's SUM() returns NULL when there are no input rows, which would
    otherwise surface as ``None`` and break numeric formatting of the stats.
    """
    return {key: (value or 0) for key, value in zip(keys, row)}


def get_db_stats(db_path: Optional[str] = None) -> Dict:
    """Collect row counts and field-completeness statistics from the database.

    Args:
        db_path: SQLite database file to inspect. Defaults to the module-level
            ``DB_PATH`` when omitted.

    Returns:
        A dict with the keys ``total_auctions``, ``total_lots``,
        ``total_images`` and ``total_bid_history`` (plain row counts) plus the
        nested dicts ``auctions``, ``lots_core``, ``lots_enriched``,
        ``lots_bid_intelligence``, ``bid_history`` and ``images``. Every value
        in the nested dicts is an integer count; empty tables yield 0, never
        ``None``.

    Raises:
        sqlite3.Error: if a table or column is missing or the database cannot
            be read. The connection is closed before the error propagates.
    """
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        cursor = conn.cursor()
        stats = {}

        # Total counts
        stats['total_auctions'] = cursor.execute(
            "SELECT COUNT(*) FROM auctions").fetchone()[0]
        stats['total_lots'] = cursor.execute(
            "SELECT COUNT(*) FROM lots").fetchone()[0]
        stats['total_images'] = cursor.execute(
            "SELECT COUNT(*) FROM images").fetchone()[0]
        stats['total_bid_history'] = cursor.execute(
            "SELECT COUNT(*) FROM bid_history").fetchone()[0]

        # Auctions completeness
        cursor.execute("""
            SELECT
                COUNT(*) as total,
                SUM(CASE WHEN title IS NOT NULL AND title != '' THEN 1 ELSE 0 END) as has_title,
                SUM(CASE WHEN lots_count IS NOT NULL THEN 1 ELSE 0 END) as has_lots_count,
                SUM(CASE WHEN closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_closing_time,
                SUM(CASE WHEN first_lot_closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_first_lot_closing
            FROM auctions
        """)
        stats['auctions'] = _counts(
            ('total', 'has_title', 'has_lots_count', 'has_closing_time',
             'has_first_lot_closing'),
            cursor.fetchone(),
        )

        # Lots completeness - Core fields
        cursor.execute("""
            SELECT
                COUNT(*) as total,
                SUM(CASE WHEN title IS NOT NULL AND title != '' THEN 1 ELSE 0 END) as has_title,
                SUM(CASE WHEN current_bid IS NOT NULL THEN 1 ELSE 0 END) as has_current_bid,
                SUM(CASE WHEN starting_bid IS NOT NULL THEN 1 ELSE 0 END) as has_starting_bid,
                SUM(CASE WHEN minimum_bid IS NOT NULL THEN 1 ELSE 0 END) as has_minimum_bid,
                SUM(CASE WHEN bid_count IS NOT NULL AND bid_count > 0 THEN 1 ELSE 0 END) as has_bids,
                SUM(CASE WHEN closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_closing_time,
                SUM(CASE WHEN status IS NOT NULL AND status != '' THEN 1 ELSE 0 END) as has_status
            FROM lots
        """)
        stats['lots_core'] = _counts(
            ('total', 'has_title', 'has_current_bid', 'has_starting_bid',
             'has_minimum_bid', 'has_bids', 'has_closing_time', 'has_status'),
            cursor.fetchone(),
        )

        # Lots completeness - Enriched fields
        cursor.execute("""
            SELECT
                COUNT(*) as total,
                SUM(CASE WHEN brand IS NOT NULL AND brand != '' THEN 1 ELSE 0 END) as has_brand,
                SUM(CASE WHEN model IS NOT NULL AND model != '' THEN 1 ELSE 0 END) as has_model,
                SUM(CASE WHEN manufacturer IS NOT NULL AND manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer,
                SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year,
                SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition_score,
                SUM(CASE WHEN condition_description IS NOT NULL AND condition_description != '' THEN 1 ELSE 0 END) as has_condition_desc,
                SUM(CASE WHEN serial_number IS NOT NULL AND serial_number != '' THEN 1 ELSE 0 END) as has_serial,
                SUM(CASE WHEN damage_description IS NOT NULL AND damage_description != '' THEN 1 ELSE 0 END) as has_damage
            FROM lots
        """)
        stats['lots_enriched'] = _counts(
            ('total', 'has_brand', 'has_model', 'has_manufacturer', 'has_year',
             'has_condition_score', 'has_condition_desc', 'has_serial',
             'has_damage'),
            cursor.fetchone(),
        )

        # Lots completeness - Bid intelligence
        cursor.execute("""
            SELECT
                COUNT(*) as total,
                SUM(CASE WHEN first_bid_time IS NOT NULL THEN 1 ELSE 0 END) as has_first_bid_time,
                SUM(CASE WHEN last_bid_time IS NOT NULL THEN 1 ELSE 0 END) as has_last_bid_time,
                SUM(CASE WHEN bid_velocity IS NOT NULL THEN 1 ELSE 0 END) as has_bid_velocity,
                SUM(CASE WHEN bid_increment IS NOT NULL THEN 1 ELSE 0 END) as has_bid_increment
            FROM lots
        """)
        stats['lots_bid_intelligence'] = _counts(
            ('total', 'has_first_bid_time', 'has_last_bid_time',
             'has_bid_velocity', 'has_bid_increment'),
            cursor.fetchone(),
        )

        # Bid history stats
        cursor.execute("""
            SELECT
                COUNT(DISTINCT lot_id) as lots_with_history,
                COUNT(*) as total_bids,
                SUM(CASE WHEN is_autobid = 1 THEN 1 ELSE 0 END) as autobids,
                SUM(CASE WHEN bidder_id IS NOT NULL THEN 1 ELSE 0 END) as has_bidder_id
            FROM bid_history
        """)
        stats['bid_history'] = _counts(
            ('lots_with_history', 'total_bids', 'autobids', 'has_bidder_id'),
            cursor.fetchone(),
        )

        # Image stats
        cursor.execute("""
            SELECT
                COUNT(DISTINCT lot_id) as lots_with_images,
                COUNT(*) as total_images,
                SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded_images,
                SUM(CASE WHEN local_path IS NOT NULL THEN 1 ELSE 0 END) as has_local_path
            FROM images
        """)
        stats['images'] = _counts(
            ('lots_with_images', 'total_images', 'downloaded_images',
             'has_local_path'),
            cursor.fetchone(),
        )

        return stats
    finally:
        # Release the connection even when one of the queries raises.
        conn.close()
def check_data_quality(db_path: Optional[str] = None) -> List[Tuple[str, str, str]]:
    """Run consistency checks against the database and report what they find.

    Args:
        db_path: SQLite database file to inspect. Defaults to the module-level
            ``DB_PATH`` when omitted.

    Returns:
        A list of ``(severity, category, message)`` tuples, in check order.
        Severity is one of ``"ERROR"``, ``"WARNING"`` or ``"INFO"``. The list
        is empty when no check fires.

    Raises:
        sqlite3.Error: if a table or column is missing or the database cannot
            be read. The connection is closed before the error propagates.
    """
    issues = []
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        cursor = conn.cursor()

        # Lots without a matching auction. NOT EXISTS is used instead of
        # NOT IN because NOT IN evaluates to NULL (never true) as soon as the
        # subquery contains a NULL, and also for rows whose own key is NULL,
        # which would silently hide orphans.
        cursor.execute("""
            SELECT COUNT(*) FROM lots
            WHERE NOT EXISTS (
                SELECT 1 FROM auctions
                WHERE auctions.auction_id = lots.auction_id
            )
        """)
        orphaned_lots = cursor.fetchone()[0]
        if orphaned_lots > 0:
            issues.append(("ERROR", "Orphaned Lots", f"{orphaned_lots} lots without matching auction"))

        # Lots that claim to have bids but have no bid history rows.
        cursor.execute("""
            SELECT COUNT(*) FROM lots
            WHERE bid_count > 0
            AND NOT EXISTS (
                SELECT 1 FROM bid_history
                WHERE bid_history.lot_id = lots.lot_id
            )
        """)
        missing_history = cursor.fetchone()[0]
        if missing_history > 0:
            issues.append(("WARNING", "Missing Bid History", f"{missing_history} lots have bids but no bid history records"))

        # Lots whose closing time has passed but whose status is not closed.
        # NOTE(review): this compares closing_time with datetime('now') as
        # stored; it assumes closing_time uses a format that orders the same
        # way as 'YYYY-MM-DD HH:MM:SS' (UTC) - confirm against the scraper.
        # NOTE(review): rows with a NULL status are not counted, because
        # NULL NOT LIKE ... evaluates to NULL - confirm that is intended.
        cursor.execute("""
            SELECT COUNT(*) FROM lots
            WHERE closing_time IS NOT NULL
            AND closing_time < datetime('now')
            AND status NOT LIKE '%gesloten%'
        """)
        past_closing = cursor.fetchone()[0]
        if past_closing > 0:
            issues.append(("INFO", "Past Closing Time", f"{past_closing} lots have closing time in past"))

        # lot_id values that occur on more than one row.
        cursor.execute("""
            SELECT lot_id, COUNT(*) FROM lots
            GROUP BY lot_id
            HAVING COUNT(*) > 1
        """)
        duplicates = cursor.fetchall()
        if duplicates:
            issues.append(("ERROR", "Duplicate Lot IDs", f"{len(duplicates)} duplicate lot_id values found"))

        # Lots that have no image rows at all (NOT EXISTS for the same
        # NULL-safety reason as above).
        cursor.execute("""
            SELECT COUNT(*) FROM lots
            WHERE NOT EXISTS (
                SELECT 1 FROM images
                WHERE images.lot_id = lots.lot_id
            )
        """)
        no_images = cursor.fetchone()[0]
        if no_images > 0:
            issues.append(("WARNING", "No Images", f"{no_images} lots have no images"))

        return issues
    finally:
        # Release the connection even when one of the checks raises.
        conn.close()
# (section heading, key in the stats dict, ((printed label, count key), ...))
# for every section that is reported as "count / total (percent)".
_COMPLETENESS_SECTIONS = (
    ("AUCTIONS COMPLETENESS:", "auctions", (
        ("Title", "has_title"),
        ("Lots Count", "has_lots_count"),
        ("Closing Time", "has_closing_time"),
        ("First Lot Closing", "has_first_lot_closing"),
    )),
    ("LOTS CORE FIELDS:", "lots_core", (
        ("Title", "has_title"),
        ("Current Bid", "has_current_bid"),
        ("Starting Bid", "has_starting_bid"),
        ("Minimum Bid", "has_minimum_bid"),
        ("Has Bids (>0)", "has_bids"),
        ("Closing Time", "has_closing_time"),
        ("Status", "has_status"),
    )),
    ("LOTS ENRICHED FIELDS:", "lots_enriched", (
        ("Brand", "has_brand"),
        ("Model", "has_model"),
        ("Manufacturer", "has_manufacturer"),
        ("Year", "has_year"),
        ("Condition Score", "has_condition_score"),
        ("Condition Desc", "has_condition_desc"),
        ("Serial Number", "has_serial"),
        ("Damage Desc", "has_damage"),
    )),
    ("LOTS BID INTELLIGENCE:", "lots_bid_intelligence", (
        ("First Bid Time", "has_first_bid_time"),
        ("Last Bid Time", "has_last_bid_time"),
        ("Bid Velocity", "has_bid_velocity"),
        ("Bid Increment", "has_bid_increment"),
    )),
)


def _ratio_line(label, part, total, show_total=True):
    """Format one report line: a count and its percentage of ``total``.

    A ``None`` count (SQL NULL from SUM() over an empty table) is treated as
    0, and a zero ``total`` yields 0.0% instead of raising ZeroDivisionError.
    """
    part = part or 0
    total = total or 0
    pct = part / total * 100 if total else 0.0
    if show_total:
        return f" {label}: {part:,} / {total:,} ({pct:.1f}%)"
    return f" {label}: {part:,} ({pct:.1f}%)"


def print_validation_report(stats: Optional[Dict] = None,
                            issues: Optional[List[Tuple[str, str, str]]] = None):
    """Print the database validation report to standard output.

    Args:
        stats: Statistics in the shape returned by ``get_db_stats()``. When
            omitted they are fetched by calling ``get_db_stats()``.
        issues: ``(severity, category, message)`` tuples in the shape returned
            by ``check_data_quality()``. When omitted they are fetched by
            calling ``check_data_quality()``; pass an empty list to report
            that there are no issues.

    The report is safe to print for an empty database: percentages of an
    empty table are shown as 0.0%.
    """
    print("=" * 80)
    print("DATABASE VALIDATION REPORT")
    print("=" * 80)
    print()

    if stats is None:
        stats = get_db_stats()

    # Overall counts
    print("OVERALL COUNTS:")
    print(f" Auctions: {stats['total_auctions']:,}")
    print(f" Lots: {stats['total_lots']:,}")
    print(f" Images: {stats['total_images']:,}")
    print(f" Bid History Records: {stats['total_bid_history']:,}")
    print()

    # Field completeness: auctions, lots core, lots enriched, bid intelligence
    for heading, section_key, fields in _COMPLETENESS_SECTIONS:
        print(heading)
        section = stats[section_key]
        for label, count_key in fields:
            print(_ratio_line(label, section[count_key], section['total']))
        print()

    # Bid history
    print("BID HISTORY:")
    history = stats['bid_history']
    print(f" Lots with History: {history['lots_with_history']:,}")
    print(f" Total Bid Records: {history['total_bids']:,}")
    print(_ratio_line("Autobids", history['autobids'],
                      history['total_bids'], show_total=False))
    print(_ratio_line("Has Bidder ID", history['has_bidder_id'],
                      history['total_bids'], show_total=False))
    print()

    # Images
    print("IMAGES:")
    images = stats['images']
    print(f" Lots with Images: {images['lots_with_images']:,}")
    print(f" Total Images: {images['total_images']:,}")
    print(_ratio_line("Downloaded", images['downloaded_images'],
                      images['total_images'], show_total=False))
    print(_ratio_line("Has Local Path", images['has_local_path'],
                      images['total_images'], show_total=False))
    print()

    # Data quality issues
    print("=" * 80)
    print("DATA QUALITY ISSUES:")
    print("=" * 80)
    if issues is None:
        issues = check_data_quality()
    if issues:
        for severity, category, message in issues:
            print(f" [{severity}] {category}: {message}")
    else:
        print(" No issues found!")
    print()


if __name__ == "__main__":
    print_validation_report()