""" Validate data quality and completeness in the database. Checks if scraped data matches expectations and API capabilities. """ import sys import os sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) import sqlite3 from datetime import datetime from typing import Dict, List, Tuple from cache import CacheManager cache = CacheManager() DB_PATH = cache.db_path def get_db_stats() -> Dict: """Get comprehensive database statistics""" conn = sqlite3.connect(DB_PATH) cursor = conn.cursor() stats = {} # Total counts stats['total_auctions'] = cursor.execute("SELECT COUNT(*) FROM auctions").fetchone()[0] stats['total_lots'] = cursor.execute("SELECT COUNT(*) FROM lots").fetchone()[0] stats['total_images'] = cursor.execute("SELECT COUNT(*) FROM images").fetchone()[0] stats['total_bid_history'] = cursor.execute("SELECT COUNT(*) FROM bid_history").fetchone()[0] # Auctions completeness cursor.execute(""" SELECT COUNT(*) as total, SUM(CASE WHEN title IS NOT NULL AND title != '' THEN 1 ELSE 0 END) as has_title, SUM(CASE WHEN lots_count IS NOT NULL THEN 1 ELSE 0 END) as has_lots_count, SUM(CASE WHEN closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_closing_time, SUM(CASE WHEN first_lot_closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_first_lot_closing FROM auctions """) row = cursor.fetchone() stats['auctions'] = { 'total': row[0], 'has_title': row[1], 'has_lots_count': row[2], 'has_closing_time': row[3], 'has_first_lot_closing': row[4] } # Lots completeness - Core fields cursor.execute(""" SELECT COUNT(*) as total, SUM(CASE WHEN title IS NOT NULL AND title != '' THEN 1 ELSE 0 END) as has_title, SUM(CASE WHEN current_bid IS NOT NULL THEN 1 ELSE 0 END) as has_current_bid, SUM(CASE WHEN starting_bid IS NOT NULL THEN 1 ELSE 0 END) as has_starting_bid, SUM(CASE WHEN minimum_bid IS NOT NULL THEN 1 ELSE 0 END) as has_minimum_bid, SUM(CASE WHEN bid_count IS NOT NULL AND bid_count > 0 THEN 1 ELSE 0 END) as has_bids, SUM(CASE WHEN closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_closing_time, SUM(CASE WHEN status IS NOT NULL AND status != '' THEN 1 ELSE 0 END) as has_status FROM lots """) row = cursor.fetchone() stats['lots_core'] = { 'total': row[0], 'has_title': row[1], 'has_current_bid': row[2], 'has_starting_bid': row[3], 'has_minimum_bid': row[4], 'has_bids': row[5], 'has_closing_time': row[6], 'has_status': row[7] } # Lots completeness - Enriched fields cursor.execute(""" SELECT COUNT(*) as total, SUM(CASE WHEN brand IS NOT NULL AND brand != '' THEN 1 ELSE 0 END) as has_brand, SUM(CASE WHEN model IS NOT NULL AND model != '' THEN 1 ELSE 0 END) as has_model, SUM(CASE WHEN manufacturer IS NOT NULL AND manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer, SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year, SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition_score, SUM(CASE WHEN condition_description IS NOT NULL AND condition_description != '' THEN 1 ELSE 0 END) as has_condition_desc, SUM(CASE WHEN serial_number IS NOT NULL AND serial_number != '' THEN 1 ELSE 0 END) as has_serial, SUM(CASE WHEN damage_description IS NOT NULL AND damage_description != '' THEN 1 ELSE 0 END) as has_damage FROM lots """) row = cursor.fetchone() stats['lots_enriched'] = { 'total': row[0], 'has_brand': row[1], 'has_model': row[2], 'has_manufacturer': row[3], 'has_year': row[4], 'has_condition_score': row[5], 'has_condition_desc': row[6], 'has_serial': row[7], 'has_damage': row[8] } # Lots completeness - Bid intelligence cursor.execute(""" SELECT COUNT(*) as total, SUM(CASE WHEN first_bid_time IS NOT NULL THEN 1 ELSE 0 END) as has_first_bid_time, SUM(CASE WHEN last_bid_time IS NOT NULL THEN 1 ELSE 0 END) as has_last_bid_time, SUM(CASE WHEN bid_velocity IS NOT NULL THEN 1 ELSE 0 END) as has_bid_velocity, SUM(CASE WHEN bid_increment IS NOT NULL THEN 1 ELSE 0 END) as has_bid_increment FROM lots """) row = cursor.fetchone() stats['lots_bid_intelligence'] = { 'total': row[0], 'has_first_bid_time': row[1], 'has_last_bid_time': row[2], 'has_bid_velocity': row[3], 'has_bid_increment': row[4] } # Bid history stats cursor.execute(""" SELECT COUNT(DISTINCT lot_id) as lots_with_history, COUNT(*) as total_bids, SUM(CASE WHEN is_autobid = 1 THEN 1 ELSE 0 END) as autobids, SUM(CASE WHEN bidder_id IS NOT NULL THEN 1 ELSE 0 END) as has_bidder_id FROM bid_history """) row = cursor.fetchone() stats['bid_history'] = { 'lots_with_history': row[0], 'total_bids': row[1], 'autobids': row[2], 'has_bidder_id': row[3] } # Image stats cursor.execute(""" SELECT COUNT(DISTINCT lot_id) as lots_with_images, COUNT(*) as total_images, SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded_images, SUM(CASE WHEN local_path IS NOT NULL THEN 1 ELSE 0 END) as has_local_path FROM images """) row = cursor.fetchone() stats['images'] = { 'lots_with_images': row[0], 'total_images': row[1], 'downloaded_images': row[2], 'has_local_path': row[3] } conn.close() return stats def check_data_quality() -> List[Tuple[str, str, str]]: """Check for data quality issues""" issues = [] conn = sqlite3.connect(DB_PATH) cursor = conn.cursor() # Check for lots without auction cursor.execute(""" SELECT COUNT(*) FROM lots WHERE auction_id NOT IN (SELECT auction_id FROM auctions) """) orphaned_lots = cursor.fetchone()[0] if orphaned_lots > 0: issues.append(("ERROR", "Orphaned Lots", f"{orphaned_lots} lots without matching auction")) # Check for lots with bids but no bid history cursor.execute(""" SELECT COUNT(*) FROM lots WHERE bid_count > 0 AND lot_id NOT IN (SELECT DISTINCT lot_id FROM bid_history) """) missing_history = cursor.fetchone()[0] if missing_history > 0: issues.append(("WARNING", "Missing Bid History", f"{missing_history} lots have bids but no bid history records")) # Check for lots with closing time in past but still active cursor.execute(""" SELECT COUNT(*) FROM lots WHERE closing_time IS NOT NULL AND closing_time < datetime('now') AND status NOT LIKE '%gesloten%' """) past_closing = cursor.fetchone()[0] if past_closing > 0: issues.append(("INFO", "Past Closing Time", f"{past_closing} lots have closing time in past")) # Check for duplicate lot_ids cursor.execute(""" SELECT lot_id, COUNT(*) FROM lots GROUP BY lot_id HAVING COUNT(*) > 1 """) duplicates = cursor.fetchall() if duplicates: issues.append(("ERROR", "Duplicate Lot IDs", f"{len(duplicates)} duplicate lot_id values found")) # Check for lots without images cursor.execute(""" SELECT COUNT(*) FROM lots WHERE lot_id NOT IN (SELECT DISTINCT lot_id FROM images) """) no_images = cursor.fetchone()[0] if no_images > 0: issues.append(("WARNING", "No Images", f"{no_images} lots have no images")) conn.close() return issues def print_validation_report(): """Print comprehensive validation report""" print("=" * 80) print("DATABASE VALIDATION REPORT") print("=" * 80) print() stats = get_db_stats() # Overall counts print("OVERALL COUNTS:") print(f" Auctions: {stats['total_auctions']:,}") print(f" Lots: {stats['total_lots']:,}") print(f" Images: {stats['total_images']:,}") print(f" Bid History Records: {stats['total_bid_history']:,}") print() # Auctions completeness print("AUCTIONS COMPLETENESS:") a = stats['auctions'] print(f" Title: {a['has_title']:,} / {a['total']:,} ({a['has_title']/a['total']*100:.1f}%)") print(f" Lots Count: {a['has_lots_count']:,} / {a['total']:,} ({a['has_lots_count']/a['total']*100:.1f}%)") print(f" Closing Time: {a['has_closing_time']:,} / {a['total']:,} ({a['has_closing_time']/a['total']*100:.1f}%)") print(f" First Lot Closing: {a['has_first_lot_closing']:,} / {a['total']:,} ({a['has_first_lot_closing']/a['total']*100:.1f}%)") print() # Lots core completeness print("LOTS CORE FIELDS:") l = stats['lots_core'] print(f" Title: {l['has_title']:,} / {l['total']:,} ({l['has_title']/l['total']*100:.1f}%)") print(f" Current Bid: {l['has_current_bid']:,} / {l['total']:,} ({l['has_current_bid']/l['total']*100:.1f}%)") print(f" Starting Bid: {l['has_starting_bid']:,} / {l['total']:,} ({l['has_starting_bid']/l['total']*100:.1f}%)") print(f" Minimum Bid: {l['has_minimum_bid']:,} / {l['total']:,} ({l['has_minimum_bid']/l['total']*100:.1f}%)") print(f" Has Bids (>0): {l['has_bids']:,} / {l['total']:,} ({l['has_bids']/l['total']*100:.1f}%)") print(f" Closing Time: {l['has_closing_time']:,} / {l['total']:,} ({l['has_closing_time']/l['total']*100:.1f}%)") print(f" Status: {l['has_status']:,} / {l['total']:,} ({l['has_status']/l['total']*100:.1f}%)") print() # Lots enriched fields print("LOTS ENRICHED FIELDS:") e = stats['lots_enriched'] print(f" Brand: {e['has_brand']:,} / {e['total']:,} ({e['has_brand']/e['total']*100:.1f}%)") print(f" Model: {e['has_model']:,} / {e['total']:,} ({e['has_model']/e['total']*100:.1f}%)") print(f" Manufacturer: {e['has_manufacturer']:,} / {e['total']:,} ({e['has_manufacturer']/e['total']*100:.1f}%)") print(f" Year: {e['has_year']:,} / {e['total']:,} ({e['has_year']/e['total']*100:.1f}%)") print(f" Condition Score: {e['has_condition_score']:,} / {e['total']:,} ({e['has_condition_score']/e['total']*100:.1f}%)") print(f" Condition Desc: {e['has_condition_desc']:,} / {e['total']:,} ({e['has_condition_desc']/e['total']*100:.1f}%)") print(f" Serial Number: {e['has_serial']:,} / {e['total']:,} ({e['has_serial']/e['total']*100:.1f}%)") print(f" Damage Desc: {e['has_damage']:,} / {e['total']:,} ({e['has_damage']/e['total']*100:.1f}%)") print() # Bid intelligence print("LOTS BID INTELLIGENCE:") b = stats['lots_bid_intelligence'] print(f" First Bid Time: {b['has_first_bid_time']:,} / {b['total']:,} ({b['has_first_bid_time']/b['total']*100:.1f}%)") print(f" Last Bid Time: {b['has_last_bid_time']:,} / {b['total']:,} ({b['has_last_bid_time']/b['total']*100:.1f}%)") print(f" Bid Velocity: {b['has_bid_velocity']:,} / {b['total']:,} ({b['has_bid_velocity']/b['total']*100:.1f}%)") print(f" Bid Increment: {b['has_bid_increment']:,} / {b['total']:,} ({b['has_bid_increment']/b['total']*100:.1f}%)") print() # Bid history print("BID HISTORY:") h = stats['bid_history'] print(f" Lots with History: {h['lots_with_history']:,}") print(f" Total Bid Records: {h['total_bids']:,}") print(f" Autobids: {h['autobids']:,} ({h['autobids']/max(h['total_bids'],1)*100:.1f}%)") print(f" Has Bidder ID: {h['has_bidder_id']:,} ({h['has_bidder_id']/max(h['total_bids'],1)*100:.1f}%)") print() # Images print("IMAGES:") i = stats['images'] print(f" Lots with Images: {i['lots_with_images']:,}") print(f" Total Images: {i['total_images']:,}") print(f" Downloaded: {i['downloaded_images']:,} ({i['downloaded_images']/max(i['total_images'],1)*100:.1f}%)") print(f" Has Local Path: {i['has_local_path']:,} ({i['has_local_path']/max(i['total_images'],1)*100:.1f}%)") print() # Data quality issues print("=" * 80) print("DATA QUALITY ISSUES:") print("=" * 80) issues = check_data_quality() if issues: for severity, category, message in issues: print(f" [{severity}] {category}: {message}") else: print(" No issues found!") print() if __name__ == "__main__": print_validation_report()