Files
scaev/script/fix_malformed_entries.py
2025-12-05 06:48:08 +01:00

291 lines
10 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Script to detect and fix malformed/incomplete database entries.
Identifies entries with:
- Missing auction_id for auction pages
- Missing title
- Invalid bid values like "€Huidig bod"
- "gap" in closing_time
- Empty or invalid critical fields
Then re-parses from cache and updates.
"""
import sqlite3
import sys
import zlib
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Make the src/ directory next to this script's folder importable before the
# project imports below.
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from parse import DataParser
from config import CACHE_DB
class MalformedEntryFixer:
    """Detect malformed auction/lot rows and repair them from cached HTML.

    Rows are selected with SQL filters on the ``auctions`` and ``lots``
    tables. For each hit the compressed page stored in the ``cache`` table is
    decompressed, re-parsed with the current parser, validated, and written
    back to the row identified by its URL.
    """

    def __init__(self, db_path: str):
        """Store the database path and create the page parser.

        Args:
            db_path: Path to the SQLite database file.
        """
        self.db_path = db_path
        self.parser = DataParser()

    # ------------------------------------------------------------------
    # Small database helpers
    # ------------------------------------------------------------------
    def _fetch_all(self, sql: str, params: tuple = ()) -> List[Tuple]:
        """Run a read query and return every row, then close the connection."""
        conn = sqlite3.connect(self.db_path)
        try:
            return conn.execute(sql, params).fetchall()
        finally:
            # ``with sqlite3.connect(...)`` only commits/rolls back; it never
            # closes the connection, so close it explicitly.
            conn.close()

    def _execute_write(self, sql: str, params: tuple) -> None:
        """Run one write statement in its own transaction and close the connection."""
        conn = sqlite3.connect(self.db_path)
        try:
            with conn:  # commit on success, roll back on error
                conn.execute(sql, params)
        finally:
            conn.close()

    # ------------------------------------------------------------------
    # Detection
    # ------------------------------------------------------------------
    def detect_malformed_auctions(self) -> List[Tuple]:
        """Find auctions with missing or invalid data.

        Returns:
            Rows of ``(auction_id, url, title, first_lot_closing_time)`` for
            auctions with an empty/NULL id or title, or a closing time that
            is ``'gap'`` or contains ``'wegens vereffening'``.
        """
        return self._fetch_all("""
            SELECT auction_id, url, title, first_lot_closing_time
            FROM auctions
            WHERE
                auction_id = '' OR auction_id IS NULL
                OR title = '' OR title IS NULL
                OR first_lot_closing_time = 'gap'
                OR first_lot_closing_time LIKE '%wegens vereffening%'
        """)

    def detect_malformed_lots(self) -> List[Tuple]:
        """Find lots with missing or invalid data.

        Returns:
            Rows of ``(lot_id, url, title, current_bid, closing_time)`` for
            lots with an empty/NULL auction id or title, a placeholder bid
            text, or an empty/placeholder closing time.
        """
        return self._fetch_all("""
            SELECT lot_id, url, title, current_bid, closing_time
            FROM lots
            WHERE
                auction_id = '' OR auction_id IS NULL
                OR title = '' OR title IS NULL
                OR current_bid LIKE '%Huidig%bod%'
                OR current_bid = '€Huidig bod'
                OR closing_time = 'gap'
                OR closing_time = ''
                OR closing_time LIKE '%wegens vereffening%'
        """)

    # ------------------------------------------------------------------
    # Cache access
    # ------------------------------------------------------------------
    def get_cached_content(self, url: str) -> Optional[str]:
        """Retrieve and decompress cached HTML for a URL.

        Args:
            url: URL used as the lookup key in the ``cache`` table.

        Returns:
            The decompressed page decoded as UTF-8, or ``None`` when there is
            no cached row, the content is empty, or it cannot be
            decompressed/decoded.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            row = conn.execute(
                "SELECT content FROM cache WHERE url = ?",
                (url,)
            ).fetchone()
        finally:
            conn.close()

        if not row or not row[0]:
            return None
        try:
            return zlib.decompress(row[0]).decode('utf-8')
        except (zlib.error, UnicodeDecodeError, TypeError) as e:
            # Corrupt or unexpectedly-typed cache entries are reported and
            # treated as a cache miss.
            print(f" ❌ Failed to decompress: {e}")
            return None

    # ------------------------------------------------------------------
    # Repair
    # ------------------------------------------------------------------
    def reparse_and_fix_auction(self, auction_id: str, url: str, dry_run: bool = False) -> bool:
        """Re-parse an auction page from cache and update its database row.

        Args:
            auction_id: Identifier used only for progress output.
            url: Page URL; key for both the cache lookup and the row update.
            dry_run: When true, report what was parsed but write nothing.

        Returns:
            ``True`` when the page was re-parsed into a complete auction
            (and, unless ``dry_run``, the update was issued); ``False``
            otherwise.
        """
        print(f"\n Fixing auction: {auction_id}")
        print(f" URL: {url}")

        content = self.get_cached_content(url)
        if not content:
            print(" ❌ No cached content found")
            return False

        # Re-parse using the current parser
        parsed = self.parser.parse_page(content, url)
        if not parsed or parsed.get('type') != 'auction':
            print(" ❌ Could not parse as auction")
            return False

        # ``or ''`` guards against a parser that returns title=None, which
        # would otherwise make the slice below raise TypeError.
        title_preview = (parsed.get('title') or '')[:50]

        # Validate parsed data
        if not parsed.get('auction_id') or not parsed.get('title'):
            print(" ⚠️ Re-parsed data still incomplete:")
            print(f" auction_id: {parsed.get('auction_id')}")
            print(f" title: {title_preview}")
            return False

        print(" ✓ Parsed successfully:")
        print(f" auction_id: {parsed.get('auction_id')}")
        print(f" title: {title_preview}")
        print(f" location: {parsed.get('location', 'N/A')}")
        print(f" lots: {parsed.get('lots_count', 0)}")

        if not dry_run:
            self._execute_write("""
                UPDATE auctions SET
                    auction_id = ?,
                    title = ?,
                    location = ?,
                    lots_count = ?,
                    first_lot_closing_time = ?
                WHERE url = ?
            """, (
                parsed['auction_id'],
                parsed['title'],
                parsed.get('location', ''),
                parsed.get('lots_count', 0),
                parsed.get('first_lot_closing_time', ''),
                url,
            ))
            print(" ✓ Database updated")
        return True

    def reparse_and_fix_lot(self, lot_id: str, url: str, dry_run: bool = False) -> bool:
        """Re-parse a lot page from cache and update its database row.

        Args:
            lot_id: Identifier used only for progress output.
            url: Page URL; key for both the cache lookup and the row update.
            dry_run: When true, report what was parsed but write nothing.

        Returns:
            ``True`` when the page was re-parsed into a valid lot (and,
            unless ``dry_run``, the update was issued); ``False`` otherwise.
        """
        print(f"\n Fixing lot: {lot_id}")
        print(f" URL: {url}")

        content = self.get_cached_content(url)
        if not content:
            print(" ❌ No cached content found")
            return False

        # Re-parse using the current parser
        parsed = self.parser.parse_page(content, url)
        if not parsed or parsed.get('type') != 'lot':
            print(" ❌ Could not parse as lot")
            return False

        # None-safe / type-safe views used for validation and output only;
        # the values written to the database are the parser's own.
        title_preview = (parsed.get('title') or '')[:50]
        bid_text = str(parsed.get('current_bid') or '')

        # Validate parsed data
        issues = []
        if not parsed.get('lot_id'):
            issues.append("missing lot_id")
        if not parsed.get('title'):
            issues.append("missing title")
        if bid_text.lower().startswith('€huidig'):
            issues.append("invalid bid format")

        if issues:
            print(f" ⚠️ Re-parsed data still has issues: {', '.join(issues)}")
            print(f" lot_id: {parsed.get('lot_id')}")
            print(f" title: {title_preview}")
            print(f" bid: {parsed.get('current_bid')}")
            return False

        print(" ✓ Parsed successfully:")
        print(f" lot_id: {parsed.get('lot_id')}")
        print(f" auction_id: {parsed.get('auction_id')}")
        print(f" title: {title_preview}")
        print(f" bid: {parsed.get('current_bid')}")
        print(f" closing: {parsed.get('closing_time', 'N/A')}")

        if not dry_run:
            self._execute_write("""
                UPDATE lots SET
                    lot_id = ?,
                    auction_id = ?,
                    title = ?,
                    current_bid = ?,
                    bid_count = ?,
                    closing_time = ?,
                    viewing_time = ?,
                    pickup_date = ?,
                    location = ?,
                    description = ?,
                    category = ?
                WHERE url = ?
            """, (
                parsed['lot_id'],
                parsed.get('auction_id', ''),
                parsed['title'],
                parsed.get('current_bid', ''),
                parsed.get('bid_count', 0),
                parsed.get('closing_time', ''),
                parsed.get('viewing_time', ''),
                parsed.get('pickup_date', ''),
                parsed.get('location', ''),
                parsed.get('description', ''),
                parsed.get('category', ''),
                url,
            ))
            print(" ✓ Database updated")
        return True

    def _repair_rows(self, rows: List[Tuple], repair, dry_run: bool) -> Tuple[int, int]:
        """Apply ``repair`` to each detected row and count the outcomes.

        Each row's first two columns are taken as ``(id, url)``; when the id
        is empty the last URL path segment is used for display instead.

        Returns:
            ``(fixed, failed)`` counts.
        """
        fixed = failed = 0
        for row in rows:
            entry_id, url = row[0], row[1]
            try:
                if repair(entry_id or url.split('/')[-1], url, dry_run):
                    fixed += 1
                else:
                    failed += 1
            except Exception as e:
                # Per-entry boundary: one bad row must not abort the batch.
                print(f" ❌ Error: {e}")
                failed += 1
        return fixed, failed

    def run(self, dry_run: bool = False):
        """Main execution - detect and fix all malformed entries.

        Args:
            dry_run: When true, nothing is written to the database.
        """
        print("="*70)
        print("MALFORMED ENTRY DETECTION AND REPAIR")
        print("="*70)

        # Check for auctions
        print("\n1. CHECKING AUCTIONS...")
        malformed_auctions = self.detect_malformed_auctions()
        print(f" Found {len(malformed_auctions)} malformed auction entries")
        auctions_fixed, auctions_failed = self._repair_rows(
            malformed_auctions, self.reparse_and_fix_auction, dry_run
        )

        # Check for lots
        print("\n2. CHECKING LOTS...")
        malformed_lots = self.detect_malformed_lots()
        print(f" Found {len(malformed_lots)} malformed lot entries")
        lots_fixed, lots_failed = self._repair_rows(
            malformed_lots, self.reparse_and_fix_lot, dry_run
        )

        # Summary
        print("\n" + "="*70)
        print("SUMMARY")
        print("="*70)
        print("Auctions:")
        print(f" - Found: {len(malformed_auctions)}")
        print(f" - Fixed: {auctions_fixed}")
        print(f" - Failed: {auctions_failed}")
        print("\nLots:")
        print(f" - Found: {len(malformed_lots)}")
        print(f" - Fixed: {lots_fixed}")
        print(f" - Failed: {lots_failed}")
        if dry_run:
            print("\n⚠️ DRY RUN - No changes were made to the database")
def main():
    """Command-line entry point.

    Reads ``--db`` (defaults to ``CACHE_DB``) and ``--dry-run`` from the
    command line, echoes both settings, then runs the fixer against the
    chosen database.
    """
    import argparse

    arg_parser = argparse.ArgumentParser(description="Detect and fix malformed database entries")
    arg_parser.add_argument('--db', default=CACHE_DB, help='Path to cache database')
    arg_parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without making changes',
    )
    options = arg_parser.parse_args()

    # Echo the effective settings before doing any work.
    print(f"Database: {options.db}")
    print(f"Dry run: {options.dry_run}\n")

    MalformedEntryFixer(options.db).run(dry_run=options.dry_run)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()