This commit is contained in: Tour
2025-12-09 08:04:16 +01:00
commit e69563d4d6
37 changed files with 7262 additions and 0 deletions


@@ -0,0 +1,290 @@
#!/usr/bin/env python3
"""
Detect and fix malformed or incomplete database entries.

Identifies entries with:
- Missing auction_id for auction pages
- Missing title
- Invalid bid values such as "€Huidig bod"
- "gap" in closing_time
- Empty or invalid critical fields

Matching entries are then re-parsed from the cached HTML and updated in place.
"""
import sys
import sqlite3
import zlib
from pathlib import Path
from typing import List, Optional, Tuple

sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from parse import DataParser
from config import CACHE_DB
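# Example invocation (the script filename below is an assumption, since the commit view does
# not show file paths; --db defaults to CACHE_DB from config and --dry-run previews repairs):
#
#     python fix_malformed_entries.py --dry-run
#     python fix_malformed_entries.py --db /mnt/okcomputer/output/cache.db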
class MalformedEntryFixer:
    """Detects and fixes malformed database entries"""

    def __init__(self, db_path: str):
        self.db_path = db_path
        self.parser = DataParser()

    def detect_malformed_auctions(self) -> List[Tuple]:
        """Find auctions with missing or invalid data"""
        with sqlite3.connect(self.db_path) as conn:
            # Auctions with issues
            cursor = conn.execute("""
                SELECT auction_id, url, title, first_lot_closing_time
                FROM auctions
                WHERE
                    auction_id = '' OR auction_id IS NULL
                    OR title = '' OR title IS NULL
                    OR first_lot_closing_time = 'gap'
                    OR first_lot_closing_time LIKE '%wegens vereffening%'
            """)
            return cursor.fetchall()
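    # Note on the Dutch marker strings used in these queries (kept verbatim because they are
    # matched against scraped data): "Huidig bod" means "current bid" and "wegens vereffening"
    # means "due to liquidation"; both are page text that leaked into data fields.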
    def detect_malformed_lots(self) -> List[Tuple]:
        """Find lots with missing or invalid data"""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute("""
                SELECT lot_id, url, title, current_bid, closing_time
                FROM lots
                WHERE
                    auction_id = '' OR auction_id IS NULL
                    OR title = '' OR title IS NULL
                    OR current_bid LIKE '%Huidig%bod%'
                    OR current_bid = '€Huidig bod'
                    OR closing_time = 'gap'
                    OR closing_time = ''
                    OR closing_time LIKE '%wegens vereffening%'
            """)
            return cursor.fetchall()
    def get_cached_content(self, url: str) -> Optional[str]:
        """Retrieve and decompress cached HTML for a URL"""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                "SELECT content FROM cache WHERE url = ?",
                (url,)
            )
            row = cursor.fetchone()
            if row and row[0]:
                try:
                    return zlib.decompress(row[0]).decode('utf-8')
                except Exception as e:
                    print(f"   ❌ Failed to decompress: {e}")
                    return None
            return None
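    # Cached pages are stored zlib-compressed (see the cache-compression migration script in
    # this commit), so a decompress failure here usually means the entry was written before
    # that migration ran, or that the blob is corrupted.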
    def reparse_and_fix_auction(self, auction_id: str, url: str, dry_run: bool = False) -> bool:
        """Re-parse auction page from cache and update database"""
        print(f"\n  Fixing auction: {auction_id}")
        print(f"    URL: {url}")

        content = self.get_cached_content(url)
        if not content:
            print(f"    ❌ No cached content found")
            return False

        # Re-parse using current parser
        parsed = self.parser.parse_page(content, url)
        if not parsed or parsed.get('type') != 'auction':
            print(f"    ❌ Could not parse as auction")
            return False

        # Validate parsed data
        if not parsed.get('auction_id') or not parsed.get('title'):
            print(f"    ⚠️ Re-parsed data still incomplete:")
            print(f"       auction_id: {parsed.get('auction_id')}")
            print(f"       title: {parsed.get('title', '')[:50]}")
            return False

        print(f"    ✓ Parsed successfully:")
        print(f"       auction_id: {parsed.get('auction_id')}")
        print(f"       title: {parsed.get('title', '')[:50]}")
        print(f"       location: {parsed.get('location', 'N/A')}")
        print(f"       lots: {parsed.get('lots_count', 0)}")

        if not dry_run:
            with sqlite3.connect(self.db_path) as conn:
                conn.execute("""
                    UPDATE auctions SET
                        auction_id = ?,
                        title = ?,
                        location = ?,
                        lots_count = ?,
                        first_lot_closing_time = ?
                    WHERE url = ?
                """, (
                    parsed['auction_id'],
                    parsed['title'],
                    parsed.get('location', ''),
                    parsed.get('lots_count', 0),
                    parsed.get('first_lot_closing_time', ''),
                    url
                ))
                conn.commit()
            print(f"    ✓ Database updated")

        return True
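    # The UPDATE above keys on url rather than auction_id, which matters here: auction_id may
    # be empty in the broken row, while the url is the value the page was fetched and cached by.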
    def reparse_and_fix_lot(self, lot_id: str, url: str, dry_run: bool = False) -> bool:
        """Re-parse lot page from cache and update database"""
        print(f"\n  Fixing lot: {lot_id}")
        print(f"    URL: {url}")

        content = self.get_cached_content(url)
        if not content:
            print(f"    ❌ No cached content found")
            return False

        # Re-parse using current parser
        parsed = self.parser.parse_page(content, url)
        if not parsed or parsed.get('type') != 'lot':
            print(f"    ❌ Could not parse as lot")
            return False

        # Validate parsed data
        issues = []
        if not parsed.get('lot_id'):
            issues.append("missing lot_id")
        if not parsed.get('title'):
            issues.append("missing title")
        if parsed.get('current_bid', '').lower().startswith('€huidig'):
            issues.append("invalid bid format")

        if issues:
            print(f"    ⚠️ Re-parsed data still has issues: {', '.join(issues)}")
            print(f"       lot_id: {parsed.get('lot_id')}")
            print(f"       title: {parsed.get('title', '')[:50]}")
            print(f"       bid: {parsed.get('current_bid')}")
            return False

        print(f"    ✓ Parsed successfully:")
        print(f"       lot_id: {parsed.get('lot_id')}")
        print(f"       auction_id: {parsed.get('auction_id')}")
        print(f"       title: {parsed.get('title', '')[:50]}")
        print(f"       bid: {parsed.get('current_bid')}")
        print(f"       closing: {parsed.get('closing_time', 'N/A')}")

        if not dry_run:
            with sqlite3.connect(self.db_path) as conn:
                conn.execute("""
                    UPDATE lots SET
                        lot_id = ?,
                        auction_id = ?,
                        title = ?,
                        current_bid = ?,
                        bid_count = ?,
                        closing_time = ?,
                        viewing_time = ?,
                        pickup_date = ?,
                        location = ?,
                        description = ?,
                        category = ?
                    WHERE url = ?
                """, (
                    parsed['lot_id'],
                    parsed.get('auction_id', ''),
                    parsed['title'],
                    parsed.get('current_bid', ''),
                    parsed.get('bid_count', 0),
                    parsed.get('closing_time', ''),
                    parsed.get('viewing_time', ''),
                    parsed.get('pickup_date', ''),
                    parsed.get('location', ''),
                    parsed.get('description', ''),
                    parsed.get('category', ''),
                    url
                ))
                conn.commit()
            print(f"    ✓ Database updated")

        return True
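    # Unlike the cache re-parse migration script in this commit (which uses
    # COALESCE(NULLIF(?, ''), column) to preserve existing values), this UPDATE overwrites
    # every field with the freshly parsed values: the row was already flagged as malformed,
    # so the re-parsed data is preferred wholesale.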
    def run(self, dry_run: bool = False):
        """Main execution - detect and fix all malformed entries"""
        print("="*70)
        print("MALFORMED ENTRY DETECTION AND REPAIR")
        print("="*70)

        # Check for auctions
        print("\n1. CHECKING AUCTIONS...")
        malformed_auctions = self.detect_malformed_auctions()
        print(f"   Found {len(malformed_auctions)} malformed auction entries")

        stats = {'auctions_fixed': 0, 'auctions_failed': 0}
        for auction_id, url, title, closing_time in malformed_auctions:
            try:
                if self.reparse_and_fix_auction(auction_id or url.split('/')[-1], url, dry_run):
                    stats['auctions_fixed'] += 1
                else:
                    stats['auctions_failed'] += 1
            except Exception as e:
                print(f"    ❌ Error: {e}")
                stats['auctions_failed'] += 1

        # Check for lots
        print("\n2. CHECKING LOTS...")
        malformed_lots = self.detect_malformed_lots()
        print(f"   Found {len(malformed_lots)} malformed lot entries")

        stats['lots_fixed'] = 0
        stats['lots_failed'] = 0
        for lot_id, url, title, bid, closing_time in malformed_lots:
            try:
                if self.reparse_and_fix_lot(lot_id or url.split('/')[-1], url, dry_run):
                    stats['lots_fixed'] += 1
                else:
                    stats['lots_failed'] += 1
            except Exception as e:
                print(f"    ❌ Error: {e}")
                stats['lots_failed'] += 1

        # Summary
        print("\n" + "="*70)
        print("SUMMARY")
        print("="*70)
        print(f"Auctions:")
        print(f"  - Found:  {len(malformed_auctions)}")
        print(f"  - Fixed:  {stats['auctions_fixed']}")
        print(f"  - Failed: {stats['auctions_failed']}")
        print(f"\nLots:")
        print(f"  - Found:  {len(malformed_lots)}")
        print(f"  - Fixed:  {stats['lots_fixed']}")
        print(f"  - Failed: {stats['lots_failed']}")

        if dry_run:
            print("\n⚠️ DRY RUN - No changes were made to the database")
def main():
    import argparse

    parser = argparse.ArgumentParser(
        description="Detect and fix malformed database entries"
    )
    parser.add_argument(
        '--db',
        default=CACHE_DB,
        help='Path to cache database'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without making changes'
    )
    args = parser.parse_args()

    print(f"Database: {args.db}")
    print(f"Dry run: {args.dry_run}\n")

    fixer = MalformedEntryFixer(args.db)
    fixer.run(dry_run=args.dry_run)


if __name__ == "__main__":
    main()


@@ -0,0 +1,139 @@
#!/usr/bin/env python3
"""
Migrate uncompressed cache entries to the compressed format.

Compresses every cache entry where compressed = 0 (or NULL).
"""
import sqlite3
import zlib
import time

CACHE_DB = "/mnt/okcomputer/output/cache.db"
def migrate_cache():
    """Compress all uncompressed cache entries"""
    with sqlite3.connect(CACHE_DB) as conn:
        # Get uncompressed entries
        cursor = conn.execute(
            "SELECT url, content FROM cache WHERE compressed = 0 OR compressed IS NULL"
        )
        uncompressed = cursor.fetchall()

        if not uncompressed:
            print("✓ No uncompressed entries found. All cache is already compressed!")
            return

        print(f"Found {len(uncompressed)} uncompressed cache entries")
        print("Starting compression...")

        total_original_size = 0
        total_compressed_size = 0
        compressed_count = 0

        for url, content in uncompressed:
            try:
                # Handle both text and bytes
                if isinstance(content, str):
                    content_bytes = content.encode('utf-8')
                else:
                    content_bytes = content
                original_size = len(content_bytes)

                # Compress
                compressed_content = zlib.compress(content_bytes, level=9)
                compressed_size = len(compressed_content)

                # Update in database
                conn.execute(
                    "UPDATE cache SET content = ?, compressed = 1 WHERE url = ?",
                    (compressed_content, url)
                )

                total_original_size += original_size
                total_compressed_size += compressed_size
                compressed_count += 1

                if compressed_count % 100 == 0:
                    conn.commit()
                    ratio = (1 - total_compressed_size / total_original_size) * 100
                    print(f"  Compressed {compressed_count}/{len(uncompressed)} entries... "
                          f"({ratio:.1f}% reduction so far)")
            except Exception as e:
                print(f"  ERROR compressing {url}: {e}")
                continue

        # Final commit
        conn.commit()

        # Calculate final statistics
        ratio = (1 - total_compressed_size / total_original_size) * 100 if total_original_size > 0 else 0
        size_saved_mb = (total_original_size - total_compressed_size) / (1024 * 1024)

        print("\n" + "="*60)
        print("MIGRATION COMPLETE")
        print("="*60)
        print(f"Entries compressed: {compressed_count}")
        print(f"Original size: {total_original_size / (1024*1024):.2f} MB")
        print(f"Compressed size: {total_compressed_size / (1024*1024):.2f} MB")
        print(f"Space saved: {size_saved_mb:.2f} MB")
        print(f"Compression ratio: {ratio:.1f}%")
        print("="*60)
def verify_migration():
    """Verify all entries are compressed"""
    with sqlite3.connect(CACHE_DB) as conn:
        cursor = conn.execute(
            "SELECT COUNT(*) FROM cache WHERE compressed = 0 OR compressed IS NULL"
        )
        uncompressed_count = cursor.fetchone()[0]

        cursor = conn.execute("SELECT COUNT(*) FROM cache WHERE compressed = 1")
        compressed_count = cursor.fetchone()[0]

    print("\nVERIFICATION:")
    print(f"  Compressed entries: {compressed_count}")
    print(f"  Uncompressed entries: {uncompressed_count}")

    if uncompressed_count == 0:
        print("  ✓ All cache entries are compressed!")
        return True
    else:
        print("  ✗ Some entries are still uncompressed")
        return False
def get_db_size():
    """Get current database file size in MB"""
    import os
    if os.path.exists(CACHE_DB):
        size_mb = os.path.getsize(CACHE_DB) / (1024 * 1024)
        return size_mb
    return 0
if __name__ == "__main__":
    print("Cache Compression Migration Tool")
    print("="*60)

    # Show initial DB size
    initial_size = get_db_size()
    print(f"Initial database size: {initial_size:.2f} MB\n")

    # Run migration
    start_time = time.time()
    migrate_cache()
    elapsed = time.time() - start_time
    print(f"\nTime taken: {elapsed:.2f} seconds")

    # Verify
    verify_migration()

    # Show final DB size
    final_size = get_db_size()
    print(f"\nFinal database size: {final_size:.2f} MB")
    print(f"Database size reduced by: {initial_size - final_size:.2f} MB")

    print("\n✓ Migration complete! You can now run VACUUM to reclaim disk space:")
    print("  sqlite3 /mnt/okcomputer/output/cache.db 'VACUUM;'")


@@ -0,0 +1,180 @@
#!/usr/bin/env python3
"""
Migration script to re-parse cached HTML pages and update database entries.
Fixes issues with incomplete data extraction from earlier scrapes.
"""
import sys
import sqlite3
import zlib
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from parse import DataParser
from config import CACHE_DB
def reparse_and_update_lots(db_path: str = CACHE_DB, dry_run: bool = False):
    """
    Re-parse cached HTML pages and update lot entries in the database.

    This extracts improved data from __NEXT_DATA__ JSON blobs that may have been
    missed in earlier scraping runs when validation was less strict.
    """
    parser = DataParser()

    with sqlite3.connect(db_path) as conn:
        # Get all cached lot pages
        cursor = conn.execute("""
            SELECT url, content
            FROM cache
            WHERE url LIKE '%/l/%'
            ORDER BY timestamp DESC
        """)
        cached_pages = cursor.fetchall()
        print(f"Found {len(cached_pages)} cached lot pages to re-parse")

        stats = {
            'processed': 0,
            'updated': 0,
            'skipped': 0,
            'errors': 0
        }
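        # For each cached page: decompress, re-parse, then either INSERT a lot that is not yet
        # in the database or UPDATE the existing row (only filling fields the re-parse actually
        # produced), and finally attach any image URLs.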
        for url, compressed_content in cached_pages:
            try:
                # Decompress content (stored zlib-compressed)
                content = zlib.decompress(compressed_content).decode('utf-8')

                # Re-parse using current parser logic
                parsed_data = parser.parse_page(content, url)
                if not parsed_data or parsed_data.get('type') != 'lot':
                    stats['skipped'] += 1
                    continue

                lot_id = parsed_data.get('lot_id', '')
                if not lot_id:
                    print(f"  ⚠️ No lot_id for {url}")
                    stats['skipped'] += 1
                    continue

                # Check if lot exists
                existing = conn.execute(
                    "SELECT lot_id FROM lots WHERE lot_id = ?",
                    (lot_id,)
                ).fetchone()
                if not existing:
                    print(f"  → New lot: {lot_id}")
                    # Insert new lot
                    if not dry_run:
                        conn.execute("""
                            INSERT INTO lots
                                (lot_id, auction_id, url, title, current_bid, bid_count,
                                 closing_time, viewing_time, pickup_date, location,
                                 description, category, scraped_at)
                            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                        """, (
                            lot_id,
                            parsed_data.get('auction_id', ''),
                            url,
                            parsed_data.get('title', ''),
                            parsed_data.get('current_bid', ''),
                            parsed_data.get('bid_count', 0),
                            parsed_data.get('closing_time', ''),
                            parsed_data.get('viewing_time', ''),
                            parsed_data.get('pickup_date', ''),
                            parsed_data.get('location', ''),
                            parsed_data.get('description', ''),
                            parsed_data.get('category', ''),
                            parsed_data.get('scraped_at', '')
                        ))
                    stats['updated'] += 1
                else:
                    # Update the existing lot with newly parsed data; only overwrite a field
                    # when the re-parsed value is non-empty.
                    if not dry_run:
                        conn.execute("""
                            UPDATE lots SET
                                auction_id = COALESCE(NULLIF(?, ''), auction_id),
                                title = COALESCE(NULLIF(?, ''), title),
                                current_bid = COALESCE(NULLIF(?, ''), current_bid),
                                bid_count = CASE WHEN ? > 0 THEN ? ELSE bid_count END,
                                closing_time = COALESCE(NULLIF(?, ''), closing_time),
                                viewing_time = COALESCE(NULLIF(?, ''), viewing_time),
                                pickup_date = COALESCE(NULLIF(?, ''), pickup_date),
                                location = COALESCE(NULLIF(?, ''), location),
                                description = COALESCE(NULLIF(?, ''), description),
                                category = COALESCE(NULLIF(?, ''), category)
                            WHERE lot_id = ?
                        """, (
                            parsed_data.get('auction_id', ''),
                            parsed_data.get('title', ''),
                            parsed_data.get('current_bid', ''),
                            parsed_data.get('bid_count', 0),
                            parsed_data.get('bid_count', 0),
                            parsed_data.get('closing_time', ''),
                            parsed_data.get('viewing_time', ''),
                            parsed_data.get('pickup_date', ''),
                            parsed_data.get('location', ''),
                            parsed_data.get('description', ''),
                            parsed_data.get('category', ''),
                            lot_id
                        ))
                    stats['updated'] += 1
                    print(f"  ✓ Updated: {lot_id[:20]}")

                # Update images if they exist
                images = parsed_data.get('images', [])
                if images and not dry_run:
                    for img_url in images:
                        conn.execute("""
                            INSERT OR IGNORE INTO images (lot_id, url)
                            VALUES (?, ?)
                        """, (lot_id, img_url))
                stats['processed'] += 1
                if stats['processed'] % 100 == 0:
                    print(f"  Progress: {stats['processed']}/{len(cached_pages)}")
                    if not dry_run:
                        conn.commit()

            except Exception as e:
                print(f"  ❌ Error processing {url}: {e}")
                stats['errors'] += 1
                continue

        if not dry_run:
            conn.commit()

    print("\n" + "="*60)
    print("MIGRATION COMPLETE")
    print("="*60)
    print(f"Processed: {stats['processed']}")
    print(f"Updated: {stats['updated']}")
    print(f"Skipped: {stats['skipped']}")
    print(f"Errors: {stats['errors']}")

    if dry_run:
        print("\n⚠️ DRY RUN - No changes were made to the database")
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Re-parse and update lot entries from cached HTML")
    parser.add_argument('--db', default=CACHE_DB, help='Path to cache database')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    args = parser.parse_args()

    print(f"Database: {args.db}")
    print(f"Dry run: {args.dry_run}")
    print()

    reparse_and_update_lots(args.db, args.dry_run)
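    # This sweep re-parses every cached lot page, whereas the malformed-entry fixer in this
    # commit only touches rows its detection queries flag; running with --dry-run first is a
    # cheap way to see how many lots would be inserted or updated before committing anything.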