integrating with monitor app
script/fix_malformed_entries.py (new file, 290 lines)
@@ -0,0 +1,290 @@
#!/usr/bin/env python3
"""
Script to detect and fix malformed/incomplete database entries.

Identifies entries with:
- Missing auction_id for auction pages
- Missing title
- Invalid bid values like "€Huidig bod"
- "gap" in closing_time
- Empty or invalid critical fields

Then re-parses from cache and updates.
"""
import sys
import sqlite3
import zlib
from pathlib import Path
from typing import List, Optional, Tuple

sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from parse import DataParser
from config import CACHE_DB


class MalformedEntryFixer:
    """Detects and fixes malformed database entries"""

    def __init__(self, db_path: str):
        self.db_path = db_path
        self.parser = DataParser()

    def detect_malformed_auctions(self) -> List[Tuple]:
        """Find auctions with missing or invalid data"""
        with sqlite3.connect(self.db_path) as conn:
            # Auctions with issues
            cursor = conn.execute("""
                SELECT auction_id, url, title, first_lot_closing_time
                FROM auctions
                WHERE
                    auction_id = '' OR auction_id IS NULL
                    OR title = '' OR title IS NULL
                    OR first_lot_closing_time = 'gap'
                    OR first_lot_closing_time LIKE '%wegens vereffening%'
            """)
            return cursor.fetchall()

    def detect_malformed_lots(self) -> List[Tuple]:
        """Find lots with missing or invalid data"""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute("""
                SELECT lot_id, url, title, current_bid, closing_time
                FROM lots
                WHERE
                    auction_id = '' OR auction_id IS NULL
                    OR title = '' OR title IS NULL
                    OR current_bid LIKE '%Huidig%bod%'
                    OR current_bid = '€Huidig bod'
                    OR closing_time = 'gap'
                    OR closing_time = ''
                    OR closing_time LIKE '%wegens vereffening%'
            """)
            return cursor.fetchall()

    def get_cached_content(self, url: str) -> Optional[str]:
        """Retrieve and decompress cached HTML for a URL"""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                "SELECT content FROM cache WHERE url = ?",
                (url,)
            )
            row = cursor.fetchone()
            if row and row[0]:
                try:
                    return zlib.decompress(row[0]).decode('utf-8')
                except Exception as e:
                    print(f" ❌ Failed to decompress: {e}")
                    return None
            return None

    def reparse_and_fix_auction(self, auction_id: str, url: str, dry_run: bool = False) -> bool:
        """Re-parse auction page from cache and update database"""
        print(f"\n Fixing auction: {auction_id}")
        print(f" URL: {url}")

        content = self.get_cached_content(url)
        if not content:
            print(" ❌ No cached content found")
            return False

        # Re-parse using current parser
        parsed = self.parser.parse_page(content, url)
        if not parsed or parsed.get('type') != 'auction':
            print(" ❌ Could not parse as auction")
            return False

        # Validate parsed data
        if not parsed.get('auction_id') or not parsed.get('title'):
            print(" ⚠️ Re-parsed data still incomplete:")
            print(f"   auction_id: {parsed.get('auction_id')}")
            print(f"   title: {parsed.get('title', '')[:50]}")
            return False

        print(" ✓ Parsed successfully:")
        print(f"   auction_id: {parsed.get('auction_id')}")
        print(f"   title: {parsed.get('title', '')[:50]}")
        print(f"   location: {parsed.get('location', 'N/A')}")
        print(f"   lots: {parsed.get('lots_count', 0)}")

        if not dry_run:
            with sqlite3.connect(self.db_path) as conn:
                conn.execute("""
                    UPDATE auctions SET
                        auction_id = ?,
                        title = ?,
                        location = ?,
                        lots_count = ?,
                        first_lot_closing_time = ?
                    WHERE url = ?
                """, (
                    parsed['auction_id'],
                    parsed['title'],
                    parsed.get('location', ''),
                    parsed.get('lots_count', 0),
                    parsed.get('first_lot_closing_time', ''),
                    url
                ))
                conn.commit()
            print(" ✓ Database updated")

        return True

    def reparse_and_fix_lot(self, lot_id: str, url: str, dry_run: bool = False) -> bool:
        """Re-parse lot page from cache and update database"""
        print(f"\n Fixing lot: {lot_id}")
        print(f" URL: {url}")

        content = self.get_cached_content(url)
        if not content:
            print(" ❌ No cached content found")
            return False

        # Re-parse using current parser
        parsed = self.parser.parse_page(content, url)
        if not parsed or parsed.get('type') != 'lot':
            print(" ❌ Could not parse as lot")
            return False

        # Validate parsed data
        issues = []
        if not parsed.get('lot_id'):
            issues.append("missing lot_id")
        if not parsed.get('title'):
            issues.append("missing title")
        if parsed.get('current_bid', '').lower().startswith('€huidig'):
            issues.append("invalid bid format")

        if issues:
            print(f" ⚠️ Re-parsed data still has issues: {', '.join(issues)}")
            print(f"   lot_id: {parsed.get('lot_id')}")
            print(f"   title: {parsed.get('title', '')[:50]}")
            print(f"   bid: {parsed.get('current_bid')}")
            return False

        print(" ✓ Parsed successfully:")
        print(f"   lot_id: {parsed.get('lot_id')}")
        print(f"   auction_id: {parsed.get('auction_id')}")
        print(f"   title: {parsed.get('title', '')[:50]}")
        print(f"   bid: {parsed.get('current_bid')}")
        print(f"   closing: {parsed.get('closing_time', 'N/A')}")

        if not dry_run:
            with sqlite3.connect(self.db_path) as conn:
                conn.execute("""
                    UPDATE lots SET
                        lot_id = ?,
                        auction_id = ?,
                        title = ?,
                        current_bid = ?,
                        bid_count = ?,
                        closing_time = ?,
                        viewing_time = ?,
                        pickup_date = ?,
                        location = ?,
                        description = ?,
                        category = ?
                    WHERE url = ?
                """, (
                    parsed['lot_id'],
                    parsed.get('auction_id', ''),
                    parsed['title'],
                    parsed.get('current_bid', ''),
                    parsed.get('bid_count', 0),
                    parsed.get('closing_time', ''),
                    parsed.get('viewing_time', ''),
                    parsed.get('pickup_date', ''),
                    parsed.get('location', ''),
                    parsed.get('description', ''),
                    parsed.get('category', ''),
                    url
                ))
                conn.commit()
            print(" ✓ Database updated")

        return True

    def run(self, dry_run: bool = False):
        """Main execution - detect and fix all malformed entries"""
        print("="*70)
        print("MALFORMED ENTRY DETECTION AND REPAIR")
        print("="*70)

        # Check auctions
        print("\n1. CHECKING AUCTIONS...")
        malformed_auctions = self.detect_malformed_auctions()
        print(f" Found {len(malformed_auctions)} malformed auction entries")

        stats = {'auctions_fixed': 0, 'auctions_failed': 0}
        for auction_id, url, title, closing_time in malformed_auctions:
            try:
                if self.reparse_and_fix_auction(auction_id or url.split('/')[-1], url, dry_run):
                    stats['auctions_fixed'] += 1
                else:
                    stats['auctions_failed'] += 1
            except Exception as e:
                print(f" ❌ Error: {e}")
                stats['auctions_failed'] += 1

        # Check lots
        print("\n2. CHECKING LOTS...")
        malformed_lots = self.detect_malformed_lots()
        print(f" Found {len(malformed_lots)} malformed lot entries")

        stats['lots_fixed'] = 0
        stats['lots_failed'] = 0
        for lot_id, url, title, bid, closing_time in malformed_lots:
            try:
                if self.reparse_and_fix_lot(lot_id or url.split('/')[-1], url, dry_run):
                    stats['lots_fixed'] += 1
                else:
                    stats['lots_failed'] += 1
            except Exception as e:
                print(f" ❌ Error: {e}")
                stats['lots_failed'] += 1

        # Summary
        print("\n" + "="*70)
        print("SUMMARY")
        print("="*70)
        print("Auctions:")
        print(f" - Found: {len(malformed_auctions)}")
        print(f" - Fixed: {stats['auctions_fixed']}")
        print(f" - Failed: {stats['auctions_failed']}")
        print("\nLots:")
        print(f" - Found: {len(malformed_lots)}")
        print(f" - Fixed: {stats['lots_fixed']}")
        print(f" - Failed: {stats['lots_failed']}")

        if dry_run:
            print("\n⚠️ DRY RUN - No changes were made to the database")


def main():
    import argparse

    parser = argparse.ArgumentParser(
        description="Detect and fix malformed database entries"
    )
    parser.add_argument(
        '--db',
        default=CACHE_DB,
        help='Path to cache database'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without making changes'
    )

    args = parser.parse_args()

    print(f"Database: {args.db}")
    print(f"Dry run: {args.dry_run}\n")

    fixer = MalformedEntryFixer(args.db)
    fixer.run(dry_run=args.dry_run)


if __name__ == "__main__":
    main()
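Usage note: the intended workflow is to run the script with --dry-run first and only then let it write. As a minimal sketch, the fixer can also be driven programmatically instead of through the CLI; the snippet below assumes it is executed from the repository root so that script/ and src/ resolve, and the database path is illustrative (the CLI default is config.CACHE_DB):

import sys

sys.path.insert(0, "script")   # hypothetical layout: make fix_malformed_entries importable

from fix_malformed_entries import MalformedEntryFixer  # the script adds src/ to sys.path itself

fixer = MalformedEntryFixer("data/cache.db")  # illustrative path, not necessarily the project default
fixer.run(dry_run=True)                       # report what would be fixed without writing anything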
script/migrate_reparse_lots.py (new file, 180 lines)
@@ -0,0 +1,180 @@
#!/usr/bin/env python3
"""
Migration script to re-parse cached HTML pages and update database entries.
Fixes issues with incomplete data extraction from earlier scrapes.
"""
import sys
import sqlite3
import zlib
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from parse import DataParser
from config import CACHE_DB


def reparse_and_update_lots(db_path: str = CACHE_DB, dry_run: bool = False):
    """
    Re-parse cached HTML pages and update lot entries in the database.

    This extracts improved data from __NEXT_DATA__ JSON blobs that may have been
    missed in earlier scraping runs when validation was less strict.
    """
    parser = DataParser()

    with sqlite3.connect(db_path) as conn:
        # Get all cached lot pages
        cursor = conn.execute("""
            SELECT url, content
            FROM cache
            WHERE url LIKE '%/l/%'
            ORDER BY timestamp DESC
        """)

        cached_pages = cursor.fetchall()
        print(f"Found {len(cached_pages)} cached lot pages to re-parse")

        stats = {
            'processed': 0,
            'updated': 0,
            'skipped': 0,
            'errors': 0
        }

        for url, compressed_content in cached_pages:
            try:
                # Decompress content
                content = zlib.decompress(compressed_content).decode('utf-8')

                # Re-parse using current parser logic
                parsed_data = parser.parse_page(content, url)

                if not parsed_data or parsed_data.get('type') != 'lot':
                    stats['skipped'] += 1
                    continue

                lot_id = parsed_data.get('lot_id', '')
                if not lot_id:
                    print(f" ⚠️ No lot_id for {url}")
                    stats['skipped'] += 1
                    continue

                # Check if lot exists
                existing = conn.execute(
                    "SELECT lot_id FROM lots WHERE lot_id = ?",
                    (lot_id,)
                ).fetchone()

                if not existing:
                    print(f" → New lot: {lot_id}")
                    # Insert new lot
                    if not dry_run:
                        conn.execute("""
                            INSERT INTO lots
                            (lot_id, auction_id, url, title, current_bid, bid_count,
                             closing_time, viewing_time, pickup_date, location,
                             description, category, scraped_at)
                            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                        """, (
                            lot_id,
                            parsed_data.get('auction_id', ''),
                            url,
                            parsed_data.get('title', ''),
                            parsed_data.get('current_bid', ''),
                            parsed_data.get('bid_count', 0),
                            parsed_data.get('closing_time', ''),
                            parsed_data.get('viewing_time', ''),
                            parsed_data.get('pickup_date', ''),
                            parsed_data.get('location', ''),
                            parsed_data.get('description', ''),
                            parsed_data.get('category', ''),
                            parsed_data.get('scraped_at', '')
                        ))
                    stats['updated'] += 1
                else:
                    # Update existing lot with newly parsed data
                    # Only update fields that are now populated but weren't before
                    if not dry_run:
                        conn.execute("""
                            UPDATE lots SET
                                auction_id = COALESCE(NULLIF(?, ''), auction_id),
                                title = COALESCE(NULLIF(?, ''), title),
                                current_bid = COALESCE(NULLIF(?, ''), current_bid),
                                bid_count = CASE WHEN ? > 0 THEN ? ELSE bid_count END,
                                closing_time = COALESCE(NULLIF(?, ''), closing_time),
                                viewing_time = COALESCE(NULLIF(?, ''), viewing_time),
                                pickup_date = COALESCE(NULLIF(?, ''), pickup_date),
                                location = COALESCE(NULLIF(?, ''), location),
                                description = COALESCE(NULLIF(?, ''), description),
                                category = COALESCE(NULLIF(?, ''), category)
                            WHERE lot_id = ?
                        """, (
                            parsed_data.get('auction_id', ''),
                            parsed_data.get('title', ''),
                            parsed_data.get('current_bid', ''),
                            parsed_data.get('bid_count', 0),
                            parsed_data.get('bid_count', 0),
                            parsed_data.get('closing_time', ''),
                            parsed_data.get('viewing_time', ''),
                            parsed_data.get('pickup_date', ''),
                            parsed_data.get('location', ''),
                            parsed_data.get('description', ''),
                            parsed_data.get('category', ''),
                            lot_id
                        ))
                    stats['updated'] += 1

                    print(f" ✓ Updated: {lot_id[:20]}")

                # Update images if they exist
                images = parsed_data.get('images', [])
                if images and not dry_run:
                    for img_url in images:
                        conn.execute("""
                            INSERT OR IGNORE INTO images (lot_id, url)
                            VALUES (?, ?)
                        """, (lot_id, img_url))

                stats['processed'] += 1

                if stats['processed'] % 100 == 0:
                    print(f" Progress: {stats['processed']}/{len(cached_pages)}")
                    if not dry_run:
                        conn.commit()

            except Exception as e:
                print(f" ❌ Error processing {url}: {e}")
                stats['errors'] += 1
                continue

        if not dry_run:
            conn.commit()

    print("\n" + "="*60)
    print("MIGRATION COMPLETE")
    print("="*60)
    print(f"Processed: {stats['processed']}")
    print(f"Updated: {stats['updated']}")
    print(f"Skipped: {stats['skipped']}")
    print(f"Errors: {stats['errors']}")

    if dry_run:
        print("\n⚠️ DRY RUN - No changes were made to the database")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Re-parse and update lot entries from cached HTML")
    parser.add_argument('--db', default=CACHE_DB, help='Path to cache database')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')

    args = parser.parse_args()

    print(f"Database: {args.db}")
    print(f"Dry run: {args.dry_run}")
    print()

    reparse_and_update_lots(args.db, args.dry_run)
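The update path above relies on SQLite's COALESCE(NULLIF(?, ''), column) idiom so that an empty re-parsed value never overwrites data that is already stored, while a non-empty value does replace it. A small self-contained illustration of that behaviour against a throwaway in-memory table (nothing here touches the project's real schema):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE lots (lot_id TEXT PRIMARY KEY, title TEXT)")
conn.execute("INSERT INTO lots VALUES ('L1', 'Original title')")

sql = "UPDATE lots SET title = COALESCE(NULLIF(?, ''), title) WHERE lot_id = ?"

conn.execute(sql, ("", "L1"))              # empty re-parsed title: the existing value is kept
conn.execute(sql, ("Better title", "L1"))  # non-empty value: it replaces the old one

print(conn.execute("SELECT title FROM lots WHERE lot_id = 'L1'").fetchone()[0])
# prints: Better title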