#!/usr/bin/env python3
"""
Script to detect and fix malformed/incomplete database entries.

Identifies entries with:
- Missing auction_id for auction pages
- Missing title
- Invalid bid values like "€Huidig ​​bod"
- "gap" in closing_time
- Empty or invalid critical fields

Then re-parses from cache and updates.
"""
import sys
import sqlite3
import zlib
from contextlib import closing
from pathlib import Path
from typing import Dict, List, Optional, Tuple

sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from parse import DataParser
from config import CACHE_DB


class MalformedEntryFixer:
    """Detects and fixes malformed database entries.

    Rows in the ``auctions`` and ``lots`` tables that look incomplete are
    re-parsed from the compressed HTML stored in the ``cache`` table and,
    when the fresh parse is valid, written back keyed by their ``url``.
    """

    def __init__(self, db_path: str):
        """Remember the database path and create the page parser."""
        self.db_path = db_path
        self.parser = DataParser()

    def _connect(self):
        """Return a context manager yielding a connection that gets closed.

        ``with sqlite3.connect(...)`` on its own only commits or rolls back
        the transaction; it leaves the connection open. Wrapping it in
        ``closing`` guarantees ``close()`` is called on exit.
        """
        return closing(sqlite3.connect(self.db_path))

    def detect_malformed_auctions(self) -> List[Tuple]:
        """Find auctions with missing or invalid data.

        Returns:
            Rows of ``(auction_id, url, title, first_lot_closing_time)``.
        """
        with self._connect() as conn:
            cursor = conn.execute("""
                SELECT auction_id, url, title, first_lot_closing_time
                FROM auctions
                WHERE auction_id = '' OR auction_id IS NULL
                   OR title = '' OR title IS NULL
                   OR first_lot_closing_time = 'gap'
                   OR first_lot_closing_time LIKE '%wegens vereffening%'
            """)
            return cursor.fetchall()

    def detect_malformed_lots(self) -> List[Tuple]:
        """Find lots with missing or invalid data.

        Returns:
            Rows of ``(lot_id, url, title, current_bid, closing_time)``.
        """
        # NOTE(review): the '€Huidig ​​bod' literal appears to contain
        # invisible characters copied from the scraped page; keep it
        # byte-for-byte. The LIKE pattern before it matches the same rows.
        with self._connect() as conn:
            cursor = conn.execute("""
                SELECT lot_id, url, title, current_bid, closing_time
                FROM lots
                WHERE auction_id = '' OR auction_id IS NULL
                   OR title = '' OR title IS NULL
                   OR current_bid LIKE '%Huidig%bod%'
                   OR current_bid = '€Huidig ​​bod'
                   OR closing_time = 'gap'
                   OR closing_time = ''
                   OR closing_time LIKE '%wegens vereffening%'
            """)
            return cursor.fetchall()

    def get_cached_content(self, url: str) -> Optional[str]:
        """Retrieve and decompress cached HTML for a URL.

        Returns:
            The decoded page text, or ``None`` when there is no cache row,
            the stored content is empty, or it cannot be decompressed and
            decoded as UTF-8.
        """
        with self._connect() as conn:
            row = conn.execute(
                "SELECT content FROM cache WHERE url = ?", (url,)
            ).fetchone()

        if not row or not row[0]:
            return None

        try:
            return zlib.decompress(row[0]).decode('utf-8')
        except (zlib.error, UnicodeDecodeError, TypeError) as e:
            # TypeError covers content that was stored as text, not bytes.
            print(f" ❌ Failed to decompress: {e}")
            return None

    def reparse_and_fix_auction(self, auction_id: str, url: str, dry_run: bool = False) -> bool:
        """Re-parse auction page from cache and update database.

        Args:
            auction_id: Identifier used only for the progress output.
            url: Page URL; used as the cache key and the UPDATE key.
            dry_run: When true, report the parsed values but do not write.

        Returns:
            ``True`` when the page re-parsed into a complete auction record
            (and was written unless ``dry_run``), ``False`` otherwise.
        """
        print(f"\n Fixing auction: {auction_id}")
        print(f" URL: {url}")

        content = self.get_cached_content(url)
        if not content:
            print(" ❌ No cached content found")
            return False

        # Re-parse using current parser
        parsed = self.parser.parse_page(content, url)
        if not parsed or parsed.get('type') != 'auction':
            print(" ❌ Could not parse as auction")
            return False

        # A key that is present but None must not break the preview slice.
        title_preview = (parsed.get('title') or '')[:50]

        # Validate parsed data
        if not parsed.get('auction_id') or not parsed.get('title'):
            print(" ⚠️ Re-parsed data still incomplete:")
            print(f" auction_id: {parsed.get('auction_id')}")
            print(f" title: {title_preview}")
            return False

        print(" ✓ Parsed successfully:")
        print(f" auction_id: {parsed.get('auction_id')}")
        print(f" title: {title_preview}")
        print(f" location: {parsed.get('location', 'N/A')}")
        print(f" lots: {parsed.get('lots_count', 0)}")

        if not dry_run:
            with self._connect() as conn:
                # Inner block commits on success and rolls back on error.
                with conn:
                    conn.execute("""
                        UPDATE auctions
                        SET auction_id = ?,
                            title = ?,
                            location = ?,
                            lots_count = ?,
                            first_lot_closing_time = ?
                        WHERE url = ?
                    """, (
                        parsed['auction_id'],
                        parsed['title'],
                        parsed.get('location', ''),
                        parsed.get('lots_count', 0),
                        parsed.get('first_lot_closing_time', ''),
                        url,
                    ))
            print(" ✓ Database updated")

        return True

    def reparse_and_fix_lot(self, lot_id: str, url: str, dry_run: bool = False) -> bool:
        """Re-parse lot page from cache and update database.

        Args:
            lot_id: Identifier used only for the progress output.
            url: Page URL; used as the cache key and the UPDATE key.
            dry_run: When true, report the parsed values but do not write.

        Returns:
            ``True`` when the page re-parsed into a valid lot record (and
            was written unless ``dry_run``), ``False`` otherwise.
        """
        print(f"\n Fixing lot: {lot_id}")
        print(f" URL: {url}")

        content = self.get_cached_content(url)
        if not content:
            print(" ❌ No cached content found")
            return False

        # Re-parse using current parser
        parsed = self.parser.parse_page(content, url)
        if not parsed or parsed.get('type') != 'lot':
            print(" ❌ Could not parse as lot")
            return False

        # Tolerate None or non-string values in the parsed record.
        title_preview = (parsed.get('title') or '')[:50]
        bid_text = str(parsed.get('current_bid') or '')

        # Validate parsed data
        issues = []
        if not parsed.get('lot_id'):
            issues.append("missing lot_id")
        if not parsed.get('title'):
            issues.append("missing title")
        if bid_text.lower().startswith('€huidig'):
            issues.append("invalid bid format")

        if issues:
            print(f" ⚠️ Re-parsed data still has issues: {', '.join(issues)}")
            print(f" lot_id: {parsed.get('lot_id')}")
            print(f" title: {title_preview}")
            print(f" bid: {parsed.get('current_bid')}")
            return False

        print(" ✓ Parsed successfully:")
        print(f" lot_id: {parsed.get('lot_id')}")
        print(f" auction_id: {parsed.get('auction_id')}")
        print(f" title: {title_preview}")
        print(f" bid: {parsed.get('current_bid')}")
        print(f" closing: {parsed.get('closing_time', 'N/A')}")

        if not dry_run:
            with self._connect() as conn:
                # Inner block commits on success and rolls back on error.
                with conn:
                    conn.execute("""
                        UPDATE lots
                        SET lot_id = ?,
                            auction_id = ?,
                            title = ?,
                            current_bid = ?,
                            bid_count = ?,
                            closing_time = ?,
                            viewing_time = ?,
                            pickup_date = ?,
                            location = ?,
                            description = ?,
                            category = ?
                        WHERE url = ?
                    """, (
                        parsed['lot_id'],
                        parsed.get('auction_id', ''),
                        parsed['title'],
                        parsed.get('current_bid', ''),
                        parsed.get('bid_count', 0),
                        parsed.get('closing_time', ''),
                        parsed.get('viewing_time', ''),
                        parsed.get('pickup_date', ''),
                        parsed.get('location', ''),
                        parsed.get('description', ''),
                        parsed.get('category', ''),
                        url,
                    ))
            print(" ✓ Database updated")

        return True

    def run(self, dry_run: bool = False):
        """Main execution - detect and fix all malformed entries.

        Args:
            dry_run: When true, nothing is written to the database; entries
                that would have been repaired are still counted as fixed.
        """
        print("=" * 70)
        print("MALFORMED ENTRY DETECTION AND REPAIR")
        print("=" * 70)

        # Check for auctions
        print("\n1. CHECKING AUCTIONS...")
        malformed_auctions = self.detect_malformed_auctions()
        print(f" Found {len(malformed_auctions)} malformed auction entries")

        stats = {'auctions_fixed': 0, 'auctions_failed': 0}
        for auction_id, url, _title, _closing_time in malformed_auctions:
            try:
                # Fall back to the last URL segment as a display label.
                label = auction_id or url.split('/')[-1]
                if self.reparse_and_fix_auction(label, url, dry_run):
                    stats['auctions_fixed'] += 1
                else:
                    stats['auctions_failed'] += 1
            except Exception as e:
                # One bad row must not abort the whole repair run.
                print(f" ❌ Error: {e}")
                stats['auctions_failed'] += 1

        # Check for lots
        print("\n2. CHECKING LOTS...")
        malformed_lots = self.detect_malformed_lots()
        print(f" Found {len(malformed_lots)} malformed lot entries")

        stats['lots_fixed'] = 0
        stats['lots_failed'] = 0
        for lot_id, url, _title, _bid, _closing_time in malformed_lots:
            try:
                label = lot_id or url.split('/')[-1]
                if self.reparse_and_fix_lot(label, url, dry_run):
                    stats['lots_fixed'] += 1
                else:
                    stats['lots_failed'] += 1
            except Exception as e:
                print(f" ❌ Error: {e}")
                stats['lots_failed'] += 1

        # Summary
        print("\n" + "=" * 70)
        print("SUMMARY")
        print("=" * 70)
        print("Auctions:")
        print(f" - Found: {len(malformed_auctions)}")
        print(f" - Fixed: {stats['auctions_fixed']}")
        print(f" - Failed: {stats['auctions_failed']}")
        print("\nLots:")
        print(f" - Found: {len(malformed_lots)}")
        print(f" - Fixed: {stats['lots_fixed']}")
        print(f" - Failed: {stats['lots_failed']}")

        if dry_run:
            print("\n⚠️ DRY RUN - No changes were made to the database")


def main():
    """Parse command-line options and run the fixer."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Detect and fix malformed database entries"
    )
    parser.add_argument(
        '--db',
        default=CACHE_DB,
        help='Path to cache database'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without making changes'
    )
    args = parser.parse_args()

    print(f"Database: {args.db}")
    print(f"Dry run: {args.dry_run}\n")

    fixer = MalformedEntryFixer(args.db)
    fixer.run(dry_run=args.dry_run)


if __name__ == "__main__":
    main()