307 lines
12 KiB
Python
307 lines
12 KiB
Python
"""
|
|
Validate data quality and completeness in the database.
|
|
Checks if scraped data matches expectations and API capabilities.
|
|
"""
|
|
import sys
import os

# Put the ``src`` directory that sits next to this file at the front of the
# import path. This must run before the ``cache`` import below
# (presumably ``cache`` lives in ``src`` -- confirm against the repo layout).
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

import sqlite3
from datetime import datetime
from typing import Dict, List, Tuple
from cache import CacheManager

# NOTE: a CacheManager is instantiated at import time; its ``db_path``
# attribute supplies the SQLite file opened by the functions in this module.
cache = CacheManager()
DB_PATH = cache.db_path
|
|
|
|
def _aggregate_row(cursor, sql, keys) -> Dict:
    """Run a single-row aggregate query and map its columns onto *keys*.

    SQLite's ``SUM()`` returns NULL when it sees no rows (e.g. an empty
    table), which would surface as ``None`` and break numeric formatting
    downstream. Such values are normalised to ``0`` so every entry in the
    returned dict is a number.
    """
    row = cursor.execute(sql).fetchone()
    return {key: (value or 0) for key, value in zip(keys, row)}


def get_db_stats(db_path=None) -> Dict:
    """Get comprehensive database statistics.

    Args:
        db_path: Optional path of the SQLite database to inspect. When
            omitted (the default) the module-level ``DB_PATH`` is used.

    Returns:
        A dict with four flat row counts (``total_auctions``,
        ``total_lots``, ``total_images``, ``total_bid_history``) and six
        nested dicts of completeness counters: ``auctions``, ``lots_core``,
        ``lots_enriched``, ``lots_bid_intelligence``, ``bid_history`` and
        ``images``. All counters are integers; empty tables yield ``0``.

    Raises:
        sqlite3.Error: If the database cannot be opened or a query fails
            (for instance because an expected table or column is missing).
    """
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        cursor = conn.cursor()
        stats = {}

        # Total counts
        stats['total_auctions'] = cursor.execute("SELECT COUNT(*) FROM auctions").fetchone()[0]
        stats['total_lots'] = cursor.execute("SELECT COUNT(*) FROM lots").fetchone()[0]
        stats['total_images'] = cursor.execute("SELECT COUNT(*) FROM images").fetchone()[0]
        stats['total_bid_history'] = cursor.execute("SELECT COUNT(*) FROM bid_history").fetchone()[0]

        # Auctions completeness
        stats['auctions'] = _aggregate_row(
            cursor,
            """
            SELECT
                COUNT(*) as total,
                SUM(CASE WHEN title IS NOT NULL AND title != '' THEN 1 ELSE 0 END) as has_title,
                SUM(CASE WHEN lots_count IS NOT NULL THEN 1 ELSE 0 END) as has_lots_count,
                SUM(CASE WHEN closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_closing_time,
                SUM(CASE WHEN first_lot_closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_first_lot_closing
            FROM auctions
            """,
            (
                'total',
                'has_title',
                'has_lots_count',
                'has_closing_time',
                'has_first_lot_closing',
            ),
        )

        # Lots completeness - Core fields
        stats['lots_core'] = _aggregate_row(
            cursor,
            """
            SELECT
                COUNT(*) as total,
                SUM(CASE WHEN title IS NOT NULL AND title != '' THEN 1 ELSE 0 END) as has_title,
                SUM(CASE WHEN current_bid IS NOT NULL THEN 1 ELSE 0 END) as has_current_bid,
                SUM(CASE WHEN starting_bid IS NOT NULL THEN 1 ELSE 0 END) as has_starting_bid,
                SUM(CASE WHEN minimum_bid IS NOT NULL THEN 1 ELSE 0 END) as has_minimum_bid,
                SUM(CASE WHEN bid_count IS NOT NULL AND bid_count > 0 THEN 1 ELSE 0 END) as has_bids,
                SUM(CASE WHEN closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_closing_time,
                SUM(CASE WHEN status IS NOT NULL AND status != '' THEN 1 ELSE 0 END) as has_status
            FROM lots
            """,
            (
                'total',
                'has_title',
                'has_current_bid',
                'has_starting_bid',
                'has_minimum_bid',
                'has_bids',
                'has_closing_time',
                'has_status',
            ),
        )

        # Lots completeness - Enriched fields
        stats['lots_enriched'] = _aggregate_row(
            cursor,
            """
            SELECT
                COUNT(*) as total,
                SUM(CASE WHEN brand IS NOT NULL AND brand != '' THEN 1 ELSE 0 END) as has_brand,
                SUM(CASE WHEN model IS NOT NULL AND model != '' THEN 1 ELSE 0 END) as has_model,
                SUM(CASE WHEN manufacturer IS NOT NULL AND manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer,
                SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year,
                SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition_score,
                SUM(CASE WHEN condition_description IS NOT NULL AND condition_description != '' THEN 1 ELSE 0 END) as has_condition_desc,
                SUM(CASE WHEN serial_number IS NOT NULL AND serial_number != '' THEN 1 ELSE 0 END) as has_serial,
                SUM(CASE WHEN damage_description IS NOT NULL AND damage_description != '' THEN 1 ELSE 0 END) as has_damage
            FROM lots
            """,
            (
                'total',
                'has_brand',
                'has_model',
                'has_manufacturer',
                'has_year',
                'has_condition_score',
                'has_condition_desc',
                'has_serial',
                'has_damage',
            ),
        )

        # Lots completeness - Bid intelligence
        stats['lots_bid_intelligence'] = _aggregate_row(
            cursor,
            """
            SELECT
                COUNT(*) as total,
                SUM(CASE WHEN first_bid_time IS NOT NULL THEN 1 ELSE 0 END) as has_first_bid_time,
                SUM(CASE WHEN last_bid_time IS NOT NULL THEN 1 ELSE 0 END) as has_last_bid_time,
                SUM(CASE WHEN bid_velocity IS NOT NULL THEN 1 ELSE 0 END) as has_bid_velocity,
                SUM(CASE WHEN bid_increment IS NOT NULL THEN 1 ELSE 0 END) as has_bid_increment
            FROM lots
            """,
            (
                'total',
                'has_first_bid_time',
                'has_last_bid_time',
                'has_bid_velocity',
                'has_bid_increment',
            ),
        )

        # Bid history stats
        stats['bid_history'] = _aggregate_row(
            cursor,
            """
            SELECT
                COUNT(DISTINCT lot_id) as lots_with_history,
                COUNT(*) as total_bids,
                SUM(CASE WHEN is_autobid = 1 THEN 1 ELSE 0 END) as autobids,
                SUM(CASE WHEN bidder_id IS NOT NULL THEN 1 ELSE 0 END) as has_bidder_id
            FROM bid_history
            """,
            (
                'lots_with_history',
                'total_bids',
                'autobids',
                'has_bidder_id',
            ),
        )

        # Image stats
        stats['images'] = _aggregate_row(
            cursor,
            """
            SELECT
                COUNT(DISTINCT lot_id) as lots_with_images,
                COUNT(*) as total_images,
                SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded_images,
                SUM(CASE WHEN local_path IS NOT NULL THEN 1 ELSE 0 END) as has_local_path
            FROM images
            """,
            (
                'lots_with_images',
                'total_images',
                'downloaded_images',
                'has_local_path',
            ),
        )

        return stats
    finally:
        # Release the connection even when one of the queries raises.
        conn.close()
|
|
|
|
def check_data_quality(db_path=None) -> List[Tuple[str, str, str]]:
    """Check for data quality issues.

    Each check is a query that returns a single count; a non-zero count
    is reported as one issue.

    Args:
        db_path: Optional path of the SQLite database to inspect. When
            omitted (the default) the module-level ``DB_PATH`` is used.

    Returns:
        A list of ``(severity, category, message)`` tuples in check order.
        The list is empty when no check finds anything.

    Raises:
        sqlite3.Error: If the database cannot be opened or a query fails.
    """
    # The cross-table checks use NOT EXISTS rather than NOT IN: with NOT IN,
    # a single NULL key in the sub-select makes the predicate NULL for every
    # row, so the check would silently report zero problems. NOT EXISTS also
    # flags lots whose own key is NULL, which can never match a parent row.
    checks = (
        (
            "ERROR",
            "Orphaned Lots",
            "{count} lots without matching auction",
            """
            SELECT COUNT(*) FROM lots AS l
            WHERE NOT EXISTS (
                SELECT 1 FROM auctions AS a WHERE a.auction_id = l.auction_id
            )
            """,
        ),
        (
            "WARNING",
            "Missing Bid History",
            "{count} lots have bids but no bid history records",
            """
            SELECT COUNT(*) FROM lots AS l
            WHERE l.bid_count > 0
            AND NOT EXISTS (
                SELECT 1 FROM bid_history AS h WHERE h.lot_id = l.lot_id
            )
            """,
        ),
        (
            # NOTE(review): this compares closing_time to SQLite's UTC
            # 'YYYY-MM-DD HH:MM:SS' text; confirm the stored format matches.
            # Rows whose status is NULL are not counted (NOT LIKE yields NULL).
            "INFO",
            "Past Closing Time",
            "{count} lots have closing time in past",
            """
            SELECT COUNT(*) FROM lots
            WHERE closing_time IS NOT NULL
            AND closing_time < datetime('now')
            AND status NOT LIKE '%gesloten%'
            """,
        ),
        (
            # Counts distinct lot_id values that occur more than once.
            "ERROR",
            "Duplicate Lot IDs",
            "{count} duplicate lot_id values found",
            """
            SELECT COUNT(*) FROM (
                SELECT lot_id FROM lots
                GROUP BY lot_id
                HAVING COUNT(*) > 1
            )
            """,
        ),
        (
            "WARNING",
            "No Images",
            "{count} lots have no images",
            """
            SELECT COUNT(*) FROM lots AS l
            WHERE NOT EXISTS (
                SELECT 1 FROM images AS i WHERE i.lot_id = l.lot_id
            )
            """,
        ),
    )

    issues = []
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        cursor = conn.cursor()
        for severity, category, template, sql in checks:
            count = cursor.execute(sql).fetchone()[0]
            if count > 0:
                issues.append((severity, category, template.format(count=count)))
    finally:
        # Release the connection even when one of the queries raises.
        conn.close()
    return issues
|
|
|
|
def _percentage(count, total) -> float:
    """Return *count* as a percentage of *total*; 0.0 when *total* is 0 or None."""
    if not total:
        return 0.0
    return (count or 0) / total * 100


def _print_completeness(label, count, total) -> None:
    """Print one ``label: count / total (pct%)`` line, treating None as 0."""
    count = count or 0
    total = total or 0
    print(f" {label}: {count:,} / {total:,} ({_percentage(count, total):.1f}%)")


def _print_share(label, count, total) -> None:
    """Print one ``label: count (pct%)`` line, treating None as 0."""
    count = count or 0
    print(f" {label}: {count:,} ({_percentage(count, total):.1f}%)")


def print_validation_report(stats=None, issues=None) -> None:
    """Print comprehensive validation report.

    Writes overall counts, per-table completeness percentages, bid-history
    and image statistics, and finally the list of data quality issues to
    standard output.

    Empty tables are reported as ``0 / 0 (0.0%)`` instead of raising
    ``ZeroDivisionError``, and ``None`` counters are printed as ``0``.

    Args:
        stats: Optional pre-computed statistics in the shape returned by
            ``get_db_stats()``. When omitted, ``get_db_stats()`` is called.
        issues: Optional pre-computed list of ``(severity, category,
            message)`` tuples. When omitted, ``check_data_quality()`` is
            called.
    """
    print("=" * 80)
    print("DATABASE VALIDATION REPORT")
    print("=" * 80)
    print()

    if stats is None:
        stats = get_db_stats()

    # Overall counts
    print("OVERALL COUNTS:")
    print(f" Auctions: {stats['total_auctions']:,}")
    print(f" Lots: {stats['total_lots']:,}")
    print(f" Images: {stats['total_images']:,}")
    print(f" Bid History Records: {stats['total_bid_history']:,}")
    print()

    # Sections whose lines all read "count / total (pct%)" against the
    # section's own 'total' entry: (heading, stats key, (label, field) pairs).
    completeness_sections = (
        ("AUCTIONS COMPLETENESS:", 'auctions', (
            ("Title", 'has_title'),
            ("Lots Count", 'has_lots_count'),
            ("Closing Time", 'has_closing_time'),
            ("First Lot Closing", 'has_first_lot_closing'),
        )),
        ("LOTS CORE FIELDS:", 'lots_core', (
            ("Title", 'has_title'),
            ("Current Bid", 'has_current_bid'),
            ("Starting Bid", 'has_starting_bid'),
            ("Minimum Bid", 'has_minimum_bid'),
            ("Has Bids (>0)", 'has_bids'),
            ("Closing Time", 'has_closing_time'),
            ("Status", 'has_status'),
        )),
        ("LOTS ENRICHED FIELDS:", 'lots_enriched', (
            ("Brand", 'has_brand'),
            ("Model", 'has_model'),
            ("Manufacturer", 'has_manufacturer'),
            ("Year", 'has_year'),
            ("Condition Score", 'has_condition_score'),
            ("Condition Desc", 'has_condition_desc'),
            ("Serial Number", 'has_serial'),
            ("Damage Desc", 'has_damage'),
        )),
        ("LOTS BID INTELLIGENCE:", 'lots_bid_intelligence', (
            ("First Bid Time", 'has_first_bid_time'),
            ("Last Bid Time", 'has_last_bid_time'),
            ("Bid Velocity", 'has_bid_velocity'),
            ("Bid Increment", 'has_bid_increment'),
        )),
    )
    for heading, section_key, fields in completeness_sections:
        print(heading)
        section = stats[section_key]
        for label, field in fields:
            _print_completeness(label, section[field], section['total'])
        print()

    # Bid history
    print("BID HISTORY:")
    history = stats['bid_history']
    print(f" Lots with History: {history['lots_with_history']:,}")
    print(f" Total Bid Records: {history['total_bids']:,}")
    _print_share("Autobids", history['autobids'], history['total_bids'])
    _print_share("Has Bidder ID", history['has_bidder_id'], history['total_bids'])
    print()

    # Images
    print("IMAGES:")
    images = stats['images']
    print(f" Lots with Images: {images['lots_with_images']:,}")
    print(f" Total Images: {images['total_images']:,}")
    _print_share("Downloaded", images['downloaded_images'], images['total_images'])
    _print_share("Has Local Path", images['has_local_path'], images['total_images'])
    print()

    # Data quality issues
    print("=" * 80)
    print("DATA QUALITY ISSUES:")
    print("=" * 80)
    if issues is None:
        issues = check_data_quality()
    if issues:
        for severity, category, message in issues:
            print(f" [{severity}] {category}: {message}")
    else:
        print(" No issues found!")
    print()
|
|
|
|
# Script entry point: print the validation report when run directly.
if __name__ == "__main__":
    print_validation_report()
|