From aea188699f431dd0e97d7bc8635f03ff549cd39c Mon Sep 17 00:00:00 2001
From: Tour
Date: Fri, 5 Dec 2025 06:48:08 +0100
Subject: [PATCH] integrating with monitor app

---
 _wiki/FIXING_MALFORMED_ENTRIES.md | 262 +++++++++++++++++++++++++++
 _wiki/JAVA_FIXES_NEEDED.md        | 170 ++++++++++++++++++
 _wiki/REFACTORING_SUMMARY.md      | 118 ++++++++++++
 _wiki/RUN_INSTRUCTIONS.md         | 164 +++++++++++++++++
 script/fix_malformed_entries.py   | 290 ++++++++++++++++++++++++++++++
 script/migrate_reparse_lots.py    | 180 +++++++++++++++++++
 src/parse.py                      |  57 +++++-
 7 files changed, 1234 insertions(+), 7 deletions(-)
 create mode 100644 _wiki/FIXING_MALFORMED_ENTRIES.md
 create mode 100644 _wiki/JAVA_FIXES_NEEDED.md
 create mode 100644 _wiki/REFACTORING_SUMMARY.md
 create mode 100644 _wiki/RUN_INSTRUCTIONS.md
 create mode 100644 script/fix_malformed_entries.py
 create mode 100644 script/migrate_reparse_lots.py

diff --git a/_wiki/FIXING_MALFORMED_ENTRIES.md b/_wiki/FIXING_MALFORMED_ENTRIES.md
new file mode 100644
index 0000000..aa76d2e
--- /dev/null
+++ b/_wiki/FIXING_MALFORMED_ENTRIES.md
@@ -0,0 +1,262 @@
+# Fixing Malformed Database Entries
+
+## Problem
+
+After the initial scrape run with less strict validation, the database contains entries with incomplete or incorrect data:
+
+### Examples of Malformed Data
+
+```csv
+A1-34327,"",https://...,"",€Huidig ​​bod,0,gap,"","","",...
+A1-39577,"",https://...,"",€Huidig ​​bod,0,gap,"","","",...
+```
+
+**Issues identified:**
+1. ❌ Missing `auction_id` (empty string)
+2. ❌ Missing `title` (empty string)
+3. ❌ Invalid bid value: `€Huidig ​​bod` (Dutch for "Current bid" - placeholder text)
+4. ❌ Invalid timestamp: `gap` (should be empty or a valid date)
+5. ❌ Missing `viewing_time`, `pickup_date`, and other fields
+
+## Root Cause
+
+Earlier scraping runs:
+- Used less strict validation
+- Fell back to HTML parsing when `__NEXT_DATA__` JSON extraction failed
+- The HTML parser extracted placeholder text as actual values
+- Continued on errors instead of flagging incomplete data
+
+## Solution
+
+### Step 1: Parser Improvements ✅
+
+**Fixed in `src/parse.py`:**
+
+1. **Timestamp parsing** (lines 37-70):
+   - Filters invalid strings like "gap" and "materieel wegens vereffening"
+   - Returns an empty string instead of the invalid value
+   - Handles Unix timestamps in both seconds and milliseconds
+
+2. **Bid extraction** (lines 246-280):
+   - Rejects placeholder text like "€Huidig ​​bod" / "€Huidig bod"
+   - Removes zero-width Unicode spaces
+   - Returns "No bids" instead of the invalid placeholder text
+
+### Step 2: Detection and Repair Scripts ✅
+
+Created two scripts to fix the existing data:
+
+#### A. `script/migrate_reparse_lots.py`
+**Purpose:** Re-parse ALL cached entries with the improved JSON extraction
+
+```bash
+# Preview what would be changed
+python script/migrate_reparse_lots.py --dry-run
+
+# Apply changes
+python script/migrate_reparse_lots.py
+
+# Use a custom database path (e.g. C:/mnt/okcomputer/output/cache.db)
+python script/migrate_reparse_lots.py --db /path/to/cache.db
+```
+
+**What it does:**
+- Reads all cached HTML pages from the `cache` table
+- Re-parses them using the improved `__NEXT_DATA__` JSON extraction
+- Updates existing database entries with newly extracted fields (see the sketch below)
+- Populates missing `auction_id`, `viewing_time`, `pickup_date`, etc.
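+
+The update step only overwrites a column when the re-parsed value is non-empty, using SQLite's `COALESCE(NULLIF(new, ''), old)` idiom. A trimmed-down sketch of the statement the migration issues (the real statement covers more columns; `update_lot` is just an illustrative helper name):
+
+```python
+import sqlite3
+
+def update_lot(conn: sqlite3.Connection, lot_id: str, title: str, closing_time: str) -> None:
+    # NULLIF(?, '') turns an empty re-parsed value into NULL,
+    # and COALESCE then keeps whatever is already stored.
+    conn.execute("""
+        UPDATE lots SET
+            title        = COALESCE(NULLIF(?, ''), title),
+            closing_time = COALESCE(NULLIF(?, ''), closing_time)
+        WHERE lot_id = ?
+    """, (title, closing_time, lot_id))
+```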
+
+#### B. `script/fix_malformed_entries.py` ⭐ **RECOMMENDED**
+**Purpose:** Detect and fix ONLY the malformed entries
+
+```bash
+# Preview malformed entries and fixes
+python script/fix_malformed_entries.py --dry-run
+
+# Fix malformed entries
+python script/fix_malformed_entries.py
+
+# Use custom database path
+python script/fix_malformed_entries.py --db /path/to/cache.db
+```
+
+**What it detects:**
+```sql
+-- Auctions with issues
+SELECT * FROM auctions WHERE
+    auction_id = '' OR auction_id IS NULL
+    OR title = '' OR title IS NULL
+    OR first_lot_closing_time = 'gap'
+
+-- Lots with issues
+SELECT * FROM lots WHERE
+    auction_id = '' OR auction_id IS NULL
+    OR title = '' OR title IS NULL
+    OR current_bid LIKE '%Huidig%bod%'
+    OR closing_time = 'gap' OR closing_time = ''
+```
+
+**Example output:**
+```
+=================================================================
+MALFORMED ENTRY DETECTION AND REPAIR
+=================================================================
+
+1. CHECKING AUCTIONS...
+   Found 23 malformed auction entries
+
+  Fixing auction: A1-39577
+    URL: https://www.troostwijkauctions.com/a/...-A1-39577
+    ✓ Parsed successfully:
+      auction_id: A1-39577
+      title: Bootveiling Rotterdam - Console boten, RIB, speedboten...
+      location: Rotterdam, NL
+      lots: 45
+    ✓ Database updated
+
+2. CHECKING LOTS...
+   Found 127 malformed lot entries
+
+  Fixing lot: A1-39529-10
+    URL: https://www.troostwijkauctions.com/l/...-A1-39529-10
+    ✓ Parsed successfully:
+      lot_id: A1-39529-10
+      auction_id: A1-39529
+      title: Audi A7 Sportback Personenauto
+      bid: No bids
+      closing: 2024-12-08 15:30:00
+    ✓ Database updated
+
+=================================================================
+SUMMARY
+=================================================================
+Auctions:
+  - Found: 23
+  - Fixed: 21
+  - Failed: 2
+
+Lots:
+  - Found: 127
+  - Fixed: 124
+  - Failed: 3
+```
+
+### Step 3: Verification
+
+After running the fix script, verify the data:
+
+```bash
+# Check if malformed entries still exist
+python -c "
+import sqlite3
+conn = sqlite3.connect('path/to/cache.db')
+
+print('Auctions with empty auction_id:')
+print(conn.execute('SELECT COUNT(*) FROM auctions WHERE auction_id = \"\" OR auction_id IS NULL').fetchone()[0])
+
+print('Lots with invalid bids:')
+print(conn.execute('SELECT COUNT(*) FROM lots WHERE current_bid LIKE \"%Huidig%bod%\"').fetchone()[0])
+
+print('Lots with \"gap\" timestamps:')
+print(conn.execute('SELECT COUNT(*) FROM lots WHERE closing_time = \"gap\"').fetchone()[0])
+"
+```
+
+Expected result after the fix: **all counts should be 0**.
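+
+If you prefer a standalone script over the inline one-liner, a minimal sketch that runs the same three checks (`check_malformed.py` is a hypothetical file name; pass your database path as the first argument):
+
+```python
+# check_malformed.py - counts the same malformed-entry patterns as above (illustrative only).
+import sqlite3
+import sys
+
+db_path = sys.argv[1] if len(sys.argv) > 1 else "path/to/cache.db"
+
+checks = {
+    "Auctions with empty auction_id":
+        "SELECT COUNT(*) FROM auctions WHERE auction_id = '' OR auction_id IS NULL",
+    "Lots with invalid bids":
+        "SELECT COUNT(*) FROM lots WHERE current_bid LIKE '%Huidig%bod%'",
+    "Lots with 'gap' timestamps":
+        "SELECT COUNT(*) FROM lots WHERE closing_time = 'gap'",
+}
+
+with sqlite3.connect(db_path) as conn:
+    for label, query in checks.items():
+        print(f"{label}: {conn.execute(query).fetchone()[0]}")
+```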
+
+### Step 4: Prevention
+
+To prevent future occurrences:
+
+1. **Validation in the scraper** - Add validation before saving to the database:
+
+```python
+def validate_lot_data(lot_data: Dict) -> bool:
+    """Validate lot data before saving"""
+    required_fields = ['lot_id', 'title', 'url']
+    invalid_values = ['gap', '€Huidig bod', '€Huidig ​​bod', '']
+
+    for field in required_fields:
+        value = lot_data.get(field, '')
+        if not value or value in invalid_values:
+            print(f"  ⚠️ Invalid {field}: {value}")
+            return False
+
+    return True
+
+# In save_lot method:
+if not validate_lot_data(lot_data):
+    print(f"  ❌ Skipping invalid lot: {lot_data.get('url')}")
+    return
+```
+
+2. **Prefer JSON over HTML** - Ensure `__NEXT_DATA__` parsing is tried first (already implemented)
+
+3. **Logging** - Add logging for the fallback to HTML parsing:
+
+```python
+if next_data:
+    return next_data
+else:
+    print(f"  ⚠️ No __NEXT_DATA__ found, falling back to HTML parsing: {url}")
+    # HTML parsing...
+```
+
+## Recommended Workflow
+
+```bash
+# 1. First, run a dry-run to see what will be fixed
+python script/fix_malformed_entries.py --dry-run
+
+# 2. Review the output - check whether the fixes look correct
+
+# 3. Run the actual fix
+python script/fix_malformed_entries.py
+
+# 4. Verify the results
+python script/fix_malformed_entries.py --dry-run
+# Should show "Found 0 malformed auction entries" and "Found 0 malformed lot entries"
+
+# 5. (Optional) Run the full migration to ensure all fields are populated
+python script/migrate_reparse_lots.py
+```
+
+## Files Modified/Created
+
+### Modified:
+- ✅ `src/parse.py` - Improved timestamp and bid parsing with validation
+
+### Created:
+- ✅ `script/fix_malformed_entries.py` - Targeted fix for malformed entries
+- ✅ `script/migrate_reparse_lots.py` - Full re-parse migration
+- ✅ `_wiki/JAVA_FIXES_NEEDED.md` - Java-side fixes documentation
+- ✅ `_wiki/FIXING_MALFORMED_ENTRIES.md` - This file
+
+## Database Location
+
+If you get "no such table" errors, find your actual database:
+
+```bash
+# Find all .db files
+find . -name "*.db"
+
+# Check which one has data
+sqlite3 path/to/cache.db "SELECT COUNT(*) FROM lots"
+
+# Use that path with the --db flag
+python script/fix_malformed_entries.py --db /actual/path/to/cache.db
+```
+
+## Next Steps
+
+After fixing the malformed entries:
+1. ✅ Run `fix_malformed_entries.py` to repair bad data
+2. ⏳ Apply the Java-side fixes (see `_wiki/JAVA_FIXES_NEEDED.md`)
+3. ⏳ Re-run the Java monitoring process
+4. ✅ Add validation to prevent future issues
diff --git a/_wiki/JAVA_FIXES_NEEDED.md b/_wiki/JAVA_FIXES_NEEDED.md
new file mode 100644
index 0000000..555c439
--- /dev/null
+++ b/_wiki/JAVA_FIXES_NEEDED.md
@@ -0,0 +1,170 @@
+# Java Monitoring Process Fixes
+
+## Issues Identified
+
+Based on the error logs from the Java monitoring process, the following bugs need to be fixed:
+
+### 1. Integer Overflow - `extractNumericId()` method
+
+**Error:**
+```
+For input string: "239144949705335"
+at java.lang.Integer.parseInt(Integer.java:565)
+at auctiora.ScraperDataAdapter.extractNumericId(ScraperDataAdapter.java:81)
+```
+
+**Problem:**
+- Lot IDs are being parsed as `int` (32-bit, max value: 2,147,483,647)
+- Actual lot IDs can exceed this limit (e.g., "239144949705335")
+
+**Solution:**
+Change from `Integer.parseInt()` to `Long.parseLong()`:
+
+```java
+// BEFORE (ScraperDataAdapter.java:81)
+int numericId = Integer.parseInt(lotId);
+
+// AFTER
+long numericId = Long.parseLong(lotId);
+```
+
+**Additional changes needed:**
+- Update all related fields/variables from `int` to `long`
+- Update the database schema if the numeric ID is stored (change INTEGER to BIGINT)
+- Update any method signatures that return/accept `int` for lot IDs
+
+---
+
+### 2. UNIQUE Constraint Failures
+
+**Error:**
+```
+Failed to import lot: [SQLITE_CONSTRAINT_UNIQUE] A UNIQUE constraint failed (UNIQUE constraint failed: lots.url)
+```
+
+**Problem:**
+- Lots that already exist are re-inserted
+- No graceful handling of duplicate entries
+
+**Solution:**
+Use `INSERT OR REPLACE` or `INSERT OR IGNORE`:
+
+```java
+// BEFORE
+String sql = "INSERT INTO lots (lot_id, url, ...) VALUES (?, ?, ...)";
+
+// AFTER - Option 1: Update existing records
+String sql = "INSERT OR REPLACE INTO lots (lot_id, url, ...) 
VALUES (?, ?, ...)"; + +// AFTER - Option 2: Skip duplicates silently +String sql = "INSERT OR IGNORE INTO lots (lot_id, url, ...) VALUES (?, ?, ...)"; +``` + +**Alternative with try-catch:** +```java +try { + insertLot(lotData); +} catch (SQLException e) { + if (e.getMessage().contains("UNIQUE constraint")) { + logger.debug("Lot already exists, skipping: " + lotData.getUrl()); + return; // Or update instead + } + throw e; +} +``` + +--- + +### 3. Timestamp Parsing - Already Fixed in Python + +**Error:** +``` +Unable to parse timestamp: materieel wegens vereffening +Unable to parse timestamp: gap +``` + +**Status:** ✅ Fixed in `parse.py` (src/parse.py:37-70) + +The Python parser now: +- Filters out invalid timestamp strings like "gap", "materieel wegens vereffening" +- Returns empty string for invalid values +- Handles both Unix timestamps (seconds/milliseconds) + +**Java side action:** +If the Java code also parses timestamps, apply similar validation: +- Check for known invalid values before parsing +- Use try-catch and return null/empty for unparseable timestamps +- Don't fail the entire import if one timestamp is invalid + +--- + +## Migration Strategy + +### Step 1: Fix Python Parser ✅ +- [x] Updated `format_timestamp()` to handle invalid strings +- [x] Created migration script `script/migrate_reparse_lots.py` + +### Step 2: Run Migration +```bash +cd /path/to/scaev +python script/migrate_reparse_lots.py --dry-run # Preview changes +python script/migrate_reparse_lots.py # Apply changes +``` + +This will: +- Re-parse all cached HTML pages using improved __NEXT_DATA__ extraction +- Update existing database entries with newly extracted fields +- Populate missing `viewing_time`, `pickup_date`, and other fields + +### Step 3: Fix Java Code +1. Update `ScraperDataAdapter.java:81` - use `Long.parseLong()` +2. Update `DatabaseService.java` - use `INSERT OR REPLACE` or handle duplicates +3. Update timestamp parsing - add validation for invalid strings +4. Update database schema - change numeric ID columns to BIGINT if needed + +### Step 4: Re-run Monitoring Process +After fixes, the monitoring process should: +- Successfully import all lots without crashes +- Gracefully skip duplicates +- Handle large numeric IDs +- Ignore invalid timestamp values + +--- + +## Database Schema Changes (if needed) + +If lot IDs are stored as numeric values in Java's database: + +```sql +-- Check current schema +PRAGMA table_info(lots); + +-- If numeric ID field exists and is INTEGER, change to BIGINT: +ALTER TABLE lots ADD COLUMN lot_id_numeric BIGINT; +UPDATE lots SET lot_id_numeric = CAST(lot_id AS BIGINT) WHERE lot_id GLOB '[0-9]*'; +-- Then update code to use lot_id_numeric +``` + +--- + +## Testing Checklist + +After applying fixes: +- [ ] Import lot with ID > 2,147,483,647 (e.g., "239144949705335") +- [ ] Re-import existing lot (should update or skip gracefully) +- [ ] Import lot with invalid timestamp (should not crash) +- [ ] Verify all newly extracted fields are populated (viewing_time, pickup_date, etc.) 
+- [ ] Check logs for any remaining errors + +--- + +## Files Modified + +Python side (completed): +- `src/parse.py` - Fixed `format_timestamp()` method +- `script/migrate_reparse_lots.py` - New migration script + +Java side (needs implementation): +- `auctiora/ScraperDataAdapter.java` - Line 81: Change Integer.parseInt to Long.parseLong +- `auctiora/DatabaseService.java` - Line ~569: Handle UNIQUE constraints gracefully +- Database schema - Consider BIGINT for numeric IDs diff --git a/_wiki/REFACTORING_SUMMARY.md b/_wiki/REFACTORING_SUMMARY.md new file mode 100644 index 0000000..f640702 --- /dev/null +++ b/_wiki/REFACTORING_SUMMARY.md @@ -0,0 +1,118 @@ +# Refactoring Summary: Troostwijk Auction Monitor + +## Overview +This project has been refactored to focus on **image processing and monitoring**, removing all auction/lot scraping functionality which is now handled by the external `ARCHITECTURE-TROOSTWIJK-SCRAPER` process. + +## Architecture Changes + +### Removed Components +- ❌ **TroostwijkScraper.java** - Removed (replaced by TroostwijkMonitor) +- ❌ Auction discovery and scraping logic +- ❌ Lot scraping via Playwright/JSoup +- ❌ CacheDatabase (can be removed if not used elsewhere) + +### New/Updated Components + +#### New Classes +- ✅ **TroostwijkMonitor.java** - Monitors bids and coordinates services (no scraping) +- ✅ **ImageProcessingService.java** - Downloads images and runs object detection +- ✅ **Console.java** - Simple output utility (renamed from IO to avoid Java 25 conflict) + +#### Modernized Classes +- ✅ **AuctionInfo** - Converted to immutable `record` +- ✅ **Lot** - Converted to immutable `record` with `minutesUntilClose()` method +- ✅ **DatabaseService.java** - Uses modern Java features: + - Text blocks (`"""`) for SQL + - Record accessor methods + - Added `getImagesForLot()` method + - Added `processed_at` timestamp to images table + - Nested `ImageRecord` record + +#### Preserved Components +- ✅ **NotificationService.java** - Desktop/email notifications +- ✅ **ObjectDetectionService.java** - YOLO-based object detection +- ✅ **Main.java** - Updated to use new architecture + +## Database Schema + +### Populated by External Scraper +- `auctions` table - Auction metadata +- `lots` table - Lot details with bidding info + +### Populated by This Process +- `images` table - Downloaded images with: + - `file_path` - Local storage path + - `labels` - Detected objects (comma-separated) + - `processed_at` - Processing timestamp + +## Modern Java Features Used + +- **Records** - Immutable data carriers (AuctionInfo, Lot, ImageRecord) +- **Text Blocks** - Multi-line SQL queries +- **var** - Type inference throughout +- **Switch expressions** - Where applicable +- **Pattern matching** - Ready for future enhancements + +## Responsibilities + +### This Project +1. ✅ Image downloading from URLs in database +2. ✅ Object detection using YOLO/OpenCV +3. ✅ Bid monitoring and change detection +4. ✅ Desktop and email notifications +5. ✅ Data enrichment with image analysis + +### External ARCHITECTURE-TROOSTWIJK-SCRAPER +1. 🔄 Discover auctions from Troostwijk website +2. 🔄 Scrape lot details via API +3. 🔄 Populate `auctions` and `lots` tables +4. 
🔄 Share database with this process + +## Usage + +### Running the Monitor +```bash +# With environment variables +export DATABASE_FILE=troostwijk.db +export NOTIFICATION_CONFIG=desktop # or smtp:user:pass:email + +java -jar troostwijk-monitor.jar +``` + +### Expected Output +``` +=== Troostwijk Auction Monitor === + +✓ OpenCV loaded +Initializing monitor... + +📊 Current Database State: + Total lots in database: 42 + Total images processed: 0 + +[1/2] Processing images... +Processing pending images... + +[2/2] Starting bid monitoring... +✓ Monitoring service started + +✓ Monitor is running. Press Ctrl+C to stop. + +NOTE: This process expects auction/lot data from the external scraper. + Make sure ARCHITECTURE-TROOSTWIJK-SCRAPER is running and populating the database. +``` + +## Migration Notes + +1. The project now compiles successfully with Java 25 +2. All scraping logic removed - rely on external scraper +3. Shared database architecture for inter-process communication +4. Clean separation of concerns +5. Modern, maintainable codebase with records and text blocks + +## Next Steps + +- Remove `CacheDatabase.java` if not needed +- Consider adding API endpoint for external scraper to trigger image processing +- Add metrics/logging framework +- Consider message queue (e.g., Redis, RabbitMQ) for better inter-process communication diff --git a/_wiki/RUN_INSTRUCTIONS.md b/_wiki/RUN_INSTRUCTIONS.md new file mode 100644 index 0000000..3c90def --- /dev/null +++ b/_wiki/RUN_INSTRUCTIONS.md @@ -0,0 +1,164 @@ +# Troostwijk Auction Extractor - Run Instructions + +## Fixed Warnings + +All warnings have been resolved: +- ✅ SLF4J logging configured (slf4j-simple) +- ✅ Native access enabled for SQLite JDBC +- ✅ Logging output controlled via simplelogger.properties + +## Prerequisites + +1. **Java 21** installed +2. **Maven** installed +3. **IntelliJ IDEA** (recommended) or command line + +## Setup (First Time Only) + +### 1. Install Dependencies + +In IntelliJ Terminal or PowerShell: + +```bash +# Reload Maven dependencies +mvn clean install + +# Install Playwright browser binaries (first time only) +mvn exec:java -e -Dexec.mainClass=com.microsoft.playwright.CLI -Dexec.args="install" +``` + +## Running the Application + +### Option A: Using IntelliJ IDEA (Easiest) + +1. **Add VM Options for native access:** + - Run → Edit Configurations + - Select or create configuration for `TroostwijkAuctionExtractor` + - In "VM options" field, add: + ``` + --enable-native-access=ALL-UNNAMED + ``` + +2. **Add Program Arguments (optional):** + - In "Program arguments" field, add: + ``` + --max-visits 3 + ``` + +3. 
**Run the application:** + - Click the green Run button + +### Option B: Using Maven (Command Line) + +```bash +# Run with 3 page limit +mvn exec:java + +# Run with custom arguments (override pom.xml defaults) +mvn exec:java -Dexec.args="--max-visits 5" + +# Run without cache +mvn exec:java -Dexec.args="--no-cache --max-visits 2" + +# Run with unlimited visits +mvn exec:java -Dexec.args="" +``` + +### Option C: Using Java Directly + +```bash +# Compile first +mvn clean compile + +# Run with native access enabled +java --enable-native-access=ALL-UNNAMED \ + -cp target/classes:$(mvn dependency:build-classpath -Dmdep.outputFile=/dev/stdout -q) \ + com.auction.TroostwijkAuctionExtractor --max-visits 3 +``` + +## Command Line Arguments + +``` +--max-visits Limit actual page fetches to n (0 = unlimited, default) +--no-cache Disable page caching +--help Show help message +``` + +## Examples + +### Test with 3 page visits (cached pages don't count): +```bash +mvn exec:java -Dexec.args="--max-visits 3" +``` + +### Fresh extraction without cache: +```bash +mvn exec:java -Dexec.args="--no-cache --max-visits 5" +``` + +### Full extraction (all pages, unlimited): +```bash +mvn exec:java -Dexec.args="" +``` + +## Expected Output (No Warnings) + +``` +=== Troostwijk Auction Extractor === +Max page visits set to: 3 + +Initializing Playwright browser... +✓ Browser ready +✓ Cache database initialized + +Starting auction extraction from https://www.troostwijkauctions.com/auctions + +[Page 1] Fetching auctions... + ✓ Fetched from website (visit 1/3) + ✓ Found 20 auctions + +[Page 2] Fetching auctions... + ✓ Loaded from cache + ✓ Found 20 auctions + +[Page 3] Fetching auctions... + ✓ Fetched from website (visit 2/3) + ✓ Found 20 auctions + +✓ Total auctions extracted: 60 + +=== Results === +Total auctions found: 60 +Dutch auctions (NL): 45 +Actual page visits: 2 + +✓ Browser and cache closed +``` + +## Cache Management + +- Cache is stored in: `cache/page_cache.db` +- Cache expires after: 24 hours (configurable in code) +- To clear cache: Delete `cache/page_cache.db` file + +## Troubleshooting + +### If you still see warnings: + +1. **Reload Maven project in IntelliJ:** + - Right-click `pom.xml` → Maven → Reload project + +2. **Verify VM options:** + - Ensure `--enable-native-access=ALL-UNNAMED` is in VM options + +3. **Clean and rebuild:** + ```bash + mvn clean install + ``` + +### If Playwright fails: + +```bash +# Reinstall browser binaries +mvn exec:java -e -Dexec.mainClass=com.microsoft.playwright.CLI -Dexec.args="install chromium" +``` diff --git a/script/fix_malformed_entries.py b/script/fix_malformed_entries.py new file mode 100644 index 0000000..4e440cd --- /dev/null +++ b/script/fix_malformed_entries.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python3 +""" +Script to detect and fix malformed/incomplete database entries. + +Identifies entries with: +- Missing auction_id for auction pages +- Missing title +- Invalid bid values like "€Huidig ​​bod" +- "gap" in closing_time +- Empty or invalid critical fields + +Then re-parses from cache and updates. 
+""" +import sys +import sqlite3 +import zlib +from pathlib import Path +from typing import List, Dict, Tuple + +sys.path.insert(0, str(Path(__file__).parent.parent / 'src')) + +from parse import DataParser +from config import CACHE_DB + + +class MalformedEntryFixer: + """Detects and fixes malformed database entries""" + + def __init__(self, db_path: str): + self.db_path = db_path + self.parser = DataParser() + + def detect_malformed_auctions(self) -> List[Tuple]: + """Find auctions with missing or invalid data""" + with sqlite3.connect(self.db_path) as conn: + # Auctions with issues + cursor = conn.execute(""" + SELECT auction_id, url, title, first_lot_closing_time + FROM auctions + WHERE + auction_id = '' OR auction_id IS NULL + OR title = '' OR title IS NULL + OR first_lot_closing_time = 'gap' + OR first_lot_closing_time LIKE '%wegens vereffening%' + """) + return cursor.fetchall() + + def detect_malformed_lots(self) -> List[Tuple]: + """Find lots with missing or invalid data""" + with sqlite3.connect(self.db_path) as conn: + cursor = conn.execute(""" + SELECT lot_id, url, title, current_bid, closing_time + FROM lots + WHERE + auction_id = '' OR auction_id IS NULL + OR title = '' OR title IS NULL + OR current_bid LIKE '%Huidig%bod%' + OR current_bid = '€Huidig ​​bod' + OR closing_time = 'gap' + OR closing_time = '' + OR closing_time LIKE '%wegens vereffening%' + """) + return cursor.fetchall() + + def get_cached_content(self, url: str) -> str: + """Retrieve and decompress cached HTML for a URL""" + with sqlite3.connect(self.db_path) as conn: + cursor = conn.execute( + "SELECT content FROM cache WHERE url = ?", + (url,) + ) + row = cursor.fetchone() + if row and row[0]: + try: + return zlib.decompress(row[0]).decode('utf-8') + except Exception as e: + print(f" ❌ Failed to decompress: {e}") + return None + return None + + def reparse_and_fix_auction(self, auction_id: str, url: str, dry_run: bool = False) -> bool: + """Re-parse auction page from cache and update database""" + print(f"\n Fixing auction: {auction_id}") + print(f" URL: {url}") + + content = self.get_cached_content(url) + if not content: + print(f" ❌ No cached content found") + return False + + # Re-parse using current parser + parsed = self.parser.parse_page(content, url) + if not parsed or parsed.get('type') != 'auction': + print(f" ❌ Could not parse as auction") + return False + + # Validate parsed data + if not parsed.get('auction_id') or not parsed.get('title'): + print(f" ⚠️ Re-parsed data still incomplete:") + print(f" auction_id: {parsed.get('auction_id')}") + print(f" title: {parsed.get('title', '')[:50]}") + return False + + print(f" ✓ Parsed successfully:") + print(f" auction_id: {parsed.get('auction_id')}") + print(f" title: {parsed.get('title', '')[:50]}") + print(f" location: {parsed.get('location', 'N/A')}") + print(f" lots: {parsed.get('lots_count', 0)}") + + if not dry_run: + with sqlite3.connect(self.db_path) as conn: + conn.execute(""" + UPDATE auctions SET + auction_id = ?, + title = ?, + location = ?, + lots_count = ?, + first_lot_closing_time = ? + WHERE url = ? 
+ """, ( + parsed['auction_id'], + parsed['title'], + parsed.get('location', ''), + parsed.get('lots_count', 0), + parsed.get('first_lot_closing_time', ''), + url + )) + conn.commit() + print(f" ✓ Database updated") + + return True + + def reparse_and_fix_lot(self, lot_id: str, url: str, dry_run: bool = False) -> bool: + """Re-parse lot page from cache and update database""" + print(f"\n Fixing lot: {lot_id}") + print(f" URL: {url}") + + content = self.get_cached_content(url) + if not content: + print(f" ❌ No cached content found") + return False + + # Re-parse using current parser + parsed = self.parser.parse_page(content, url) + if not parsed or parsed.get('type') != 'lot': + print(f" ❌ Could not parse as lot") + return False + + # Validate parsed data + issues = [] + if not parsed.get('lot_id'): + issues.append("missing lot_id") + if not parsed.get('title'): + issues.append("missing title") + if parsed.get('current_bid', '').lower().startswith('€huidig'): + issues.append("invalid bid format") + + if issues: + print(f" ⚠️ Re-parsed data still has issues: {', '.join(issues)}") + print(f" lot_id: {parsed.get('lot_id')}") + print(f" title: {parsed.get('title', '')[:50]}") + print(f" bid: {parsed.get('current_bid')}") + return False + + print(f" ✓ Parsed successfully:") + print(f" lot_id: {parsed.get('lot_id')}") + print(f" auction_id: {parsed.get('auction_id')}") + print(f" title: {parsed.get('title', '')[:50]}") + print(f" bid: {parsed.get('current_bid')}") + print(f" closing: {parsed.get('closing_time', 'N/A')}") + + if not dry_run: + with sqlite3.connect(self.db_path) as conn: + conn.execute(""" + UPDATE lots SET + lot_id = ?, + auction_id = ?, + title = ?, + current_bid = ?, + bid_count = ?, + closing_time = ?, + viewing_time = ?, + pickup_date = ?, + location = ?, + description = ?, + category = ? + WHERE url = ? + """, ( + parsed['lot_id'], + parsed.get('auction_id', ''), + parsed['title'], + parsed.get('current_bid', ''), + parsed.get('bid_count', 0), + parsed.get('closing_time', ''), + parsed.get('viewing_time', ''), + parsed.get('pickup_date', ''), + parsed.get('location', ''), + parsed.get('description', ''), + parsed.get('category', ''), + url + )) + conn.commit() + print(f" ✓ Database updated") + + return True + + def run(self, dry_run: bool = False): + """Main execution - detect and fix all malformed entries""" + print("="*70) + print("MALFORMED ENTRY DETECTION AND REPAIR") + print("="*70) + + # Check for auctions + print("\n1. CHECKING AUCTIONS...") + malformed_auctions = self.detect_malformed_auctions() + print(f" Found {len(malformed_auctions)} malformed auction entries") + + stats = {'auctions_fixed': 0, 'auctions_failed': 0} + for auction_id, url, title, closing_time in malformed_auctions: + try: + if self.reparse_and_fix_auction(auction_id or url.split('/')[-1], url, dry_run): + stats['auctions_fixed'] += 1 + else: + stats['auctions_failed'] += 1 + except Exception as e: + print(f" ❌ Error: {e}") + stats['auctions_failed'] += 1 + + # Check for lots + print("\n2. 
CHECKING LOTS...") + malformed_lots = self.detect_malformed_lots() + print(f" Found {len(malformed_lots)} malformed lot entries") + + stats['lots_fixed'] = 0 + stats['lots_failed'] = 0 + for lot_id, url, title, bid, closing_time in malformed_lots: + try: + if self.reparse_and_fix_lot(lot_id or url.split('/')[-1], url, dry_run): + stats['lots_fixed'] += 1 + else: + stats['lots_failed'] += 1 + except Exception as e: + print(f" ❌ Error: {e}") + stats['lots_failed'] += 1 + + # Summary + print("\n" + "="*70) + print("SUMMARY") + print("="*70) + print(f"Auctions:") + print(f" - Found: {len(malformed_auctions)}") + print(f" - Fixed: {stats['auctions_fixed']}") + print(f" - Failed: {stats['auctions_failed']}") + print(f"\nLots:") + print(f" - Found: {len(malformed_lots)}") + print(f" - Fixed: {stats['lots_fixed']}") + print(f" - Failed: {stats['lots_failed']}") + + if dry_run: + print("\n⚠️ DRY RUN - No changes were made to the database") + + +def main(): + import argparse + + parser = argparse.ArgumentParser( + description="Detect and fix malformed database entries" + ) + parser.add_argument( + '--db', + default=CACHE_DB, + help='Path to cache database' + ) + parser.add_argument( + '--dry-run', + action='store_true', + help='Show what would be done without making changes' + ) + + args = parser.parse_args() + + print(f"Database: {args.db}") + print(f"Dry run: {args.dry_run}\n") + + fixer = MalformedEntryFixer(args.db) + fixer.run(dry_run=args.dry_run) + + +if __name__ == "__main__": + main() diff --git a/script/migrate_reparse_lots.py b/script/migrate_reparse_lots.py new file mode 100644 index 0000000..a27ffe6 --- /dev/null +++ b/script/migrate_reparse_lots.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +""" +Migration script to re-parse cached HTML pages and update database entries. +Fixes issues with incomplete data extraction from earlier scrapes. +""" +import sys +import sqlite3 +from pathlib import Path + +# Add src to path +sys.path.insert(0, str(Path(__file__).parent.parent / 'src')) + +from parse import DataParser +from config import CACHE_DB + + +def reparse_and_update_lots(db_path: str = CACHE_DB, dry_run: bool = False): + """ + Re-parse cached HTML pages and update lot entries in the database. + + This extracts improved data from __NEXT_DATA__ JSON blobs that may have been + missed in earlier scraping runs when validation was less strict. 
+ """ + parser = DataParser() + + with sqlite3.connect(db_path) as conn: + # Get all cached lot pages + cursor = conn.execute(""" + SELECT url, content + FROM cache + WHERE url LIKE '%/l/%' + ORDER BY timestamp DESC + """) + + cached_pages = cursor.fetchall() + print(f"Found {len(cached_pages)} cached lot pages to re-parse") + + stats = { + 'processed': 0, + 'updated': 0, + 'skipped': 0, + 'errors': 0 + } + + for url, compressed_content in cached_pages: + try: + # Decompress content + import zlib + content = zlib.decompress(compressed_content).decode('utf-8') + + # Re-parse using current parser logic + parsed_data = parser.parse_page(content, url) + + if not parsed_data or parsed_data.get('type') != 'lot': + stats['skipped'] += 1 + continue + + lot_id = parsed_data.get('lot_id', '') + if not lot_id: + print(f" ⚠️ No lot_id for {url}") + stats['skipped'] += 1 + continue + + # Check if lot exists + existing = conn.execute( + "SELECT lot_id FROM lots WHERE lot_id = ?", + (lot_id,) + ).fetchone() + + if not existing: + print(f" → New lot: {lot_id}") + # Insert new lot + if not dry_run: + conn.execute(""" + INSERT INTO lots + (lot_id, auction_id, url, title, current_bid, bid_count, + closing_time, viewing_time, pickup_date, location, + description, category, scraped_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, ( + lot_id, + parsed_data.get('auction_id', ''), + url, + parsed_data.get('title', ''), + parsed_data.get('current_bid', ''), + parsed_data.get('bid_count', 0), + parsed_data.get('closing_time', ''), + parsed_data.get('viewing_time', ''), + parsed_data.get('pickup_date', ''), + parsed_data.get('location', ''), + parsed_data.get('description', ''), + parsed_data.get('category', ''), + parsed_data.get('scraped_at', '') + )) + stats['updated'] += 1 + else: + # Update existing lot with newly parsed data + # Only update fields that are now populated but weren't before + if not dry_run: + conn.execute(""" + UPDATE lots SET + auction_id = COALESCE(NULLIF(?, ''), auction_id), + title = COALESCE(NULLIF(?, ''), title), + current_bid = COALESCE(NULLIF(?, ''), current_bid), + bid_count = CASE WHEN ? > 0 THEN ? ELSE bid_count END, + closing_time = COALESCE(NULLIF(?, ''), closing_time), + viewing_time = COALESCE(NULLIF(?, ''), viewing_time), + pickup_date = COALESCE(NULLIF(?, ''), pickup_date), + location = COALESCE(NULLIF(?, ''), location), + description = COALESCE(NULLIF(?, ''), description), + category = COALESCE(NULLIF(?, ''), category) + WHERE lot_id = ? + """, ( + parsed_data.get('auction_id', ''), + parsed_data.get('title', ''), + parsed_data.get('current_bid', ''), + parsed_data.get('bid_count', 0), + parsed_data.get('bid_count', 0), + parsed_data.get('closing_time', ''), + parsed_data.get('viewing_time', ''), + parsed_data.get('pickup_date', ''), + parsed_data.get('location', ''), + parsed_data.get('description', ''), + parsed_data.get('category', ''), + lot_id + )) + stats['updated'] += 1 + + print(f" ✓ Updated: {lot_id[:20]}") + + # Update images if they exist + images = parsed_data.get('images', []) + if images and not dry_run: + for img_url in images: + conn.execute(""" + INSERT OR IGNORE INTO images (lot_id, url) + VALUES (?, ?) 
+ """, (lot_id, img_url)) + + stats['processed'] += 1 + + if stats['processed'] % 100 == 0: + print(f" Progress: {stats['processed']}/{len(cached_pages)}") + if not dry_run: + conn.commit() + + except Exception as e: + print(f" ❌ Error processing {url}: {e}") + stats['errors'] += 1 + continue + + if not dry_run: + conn.commit() + + print("\n" + "="*60) + print("MIGRATION COMPLETE") + print("="*60) + print(f"Processed: {stats['processed']}") + print(f"Updated: {stats['updated']}") + print(f"Skipped: {stats['skipped']}") + print(f"Errors: {stats['errors']}") + + if dry_run: + print("\n⚠️ DRY RUN - No changes were made to the database") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Re-parse and update lot entries from cached HTML") + parser.add_argument('--db', default=CACHE_DB, help='Path to cache database') + parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes') + + args = parser.parse_args() + + print(f"Database: {args.db}") + print(f"Dry run: {args.dry_run}") + print() + + reparse_and_update_lots(args.db, args.dry_run) diff --git a/src/parse.py b/src/parse.py index 5b3e83c..d996823 100644 --- a/src/parse.py +++ b/src/parse.py @@ -38,11 +38,36 @@ class DataParser: def format_timestamp(timestamp) -> str: """Convert Unix timestamp to readable date""" try: + # Handle numeric timestamps if isinstance(timestamp, (int, float)) and timestamp > 0: + # Unix timestamps are typically 10 digits (seconds) or 13 digits (milliseconds) + if timestamp > 1e12: # Milliseconds + timestamp = timestamp / 1000 return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S') + + # Handle string timestamps that might be numeric + if isinstance(timestamp, str): + # Try to parse as number + try: + ts_num = float(timestamp) + if ts_num > 1e12: + ts_num = ts_num / 1000 + if ts_num > 0: + return datetime.fromtimestamp(ts_num).strftime('%Y-%m-%d %H:%M:%S') + except ValueError: + # Not a numeric string - check if it's an invalid value + invalid_values = ['gap', 'materieel wegens vereffening', 'tbd', 'n/a', 'unknown'] + if timestamp.lower().strip() in invalid_values: + return '' + # Return as-is if it looks like a formatted date + return timestamp if len(timestamp) > 0 else '' + return str(timestamp) if timestamp else '' - except: - return str(timestamp) if timestamp else '' + except Exception as e: + # Log parsing errors for debugging + if timestamp and str(timestamp).strip(): + print(f" ⚠️ Could not parse timestamp: {timestamp}") + return '' @staticmethod def format_currency(amount) -> str: @@ -226,15 +251,33 @@ class DataParser: r'(?:Current bid|Huidig bod)[:\s]*\s*(€[\d,.\s]+)', r'(?:Current bid|Huidig bod)[:\s]+(€[\d,.\s]+)', ] + + # Invalid bid texts that should be treated as "no bids" + invalid_bid_texts = [ + 'huidig bod', + 'current bid', + '€huidig bod', + '€huidig ​​bod', # With zero-width spaces + 'huidig ​​bod', + ] + for pattern in patterns: match = re.search(pattern, content, re.IGNORECASE) if match: bid = match.group(1).strip() - if bid and bid.lower() not in ['huidig bod', 'current bid']: - if not bid.startswith('€'): - bid = f"€{bid}" - return bid - return "€0" + # Remove zero-width spaces and other unicode whitespace + bid = re.sub(r'[\u200b\u200c\u200d\u00a0]+', ' ', bid).strip() + + # Check if it's a valid bid + if bid: + # Reject invalid bid texts + bid_lower = bid.lower().replace(' ', '').replace('€', '') + if bid_lower not in [t.lower().replace(' ', '').replace('€', '') for t in 
invalid_bid_texts]:
+                        if not bid.startswith('€'):
+                            bid = f"€{bid}"
+                        return bid
+
+        return "No bids"
 
     def _extract_bid_count(self, content: str) -> int:
         """Extract number of bids"""
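
As a quick sanity check of the new timestamp handling (an illustrative snippet, run from the repo root with `src/` made importable the same way the scripts above do it):

```python
import sys
from pathlib import Path

sys.path.insert(0, str(Path('src')))  # assumes the current directory is the repo root
from parse import DataParser

print(DataParser.format_timestamp(1733580000))     # Unix seconds -> formatted date
print(DataParser.format_timestamp(1733580000000))  # milliseconds are scaled down to seconds
print(DataParser.format_timestamp('gap'))          # known-invalid placeholder -> ''
print(DataParser.format_timestamp(''))             # empty stays empty -> ''
```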