diff --git a/.gitignore b/.gitignore index f445390..119e892 100644 --- a/.gitignore +++ b/.gitignore @@ -28,8 +28,6 @@ share/python-wheels/ MANIFEST # PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec @@ -83,31 +81,6 @@ target/ profile_default/ ipython_config.py -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/latest/usage/project/#working-with-version-control .pdm.toml .pdm-python .pdm-build/ @@ -155,11 +128,6 @@ dmypy.json # Cython debug symbols cython_debug/ -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/ # Project specific - Scaev diff --git a/AUTOSTART_SETUP.md b/docs/AUTOSTART_SETUP.md similarity index 100% rename from AUTOSTART_SETUP.md rename to docs/AUTOSTART_SETUP.md diff --git a/docs/COMPREHENSIVE_UPDATE_PLAN.md b/docs/COMPREHENSIVE_UPDATE_PLAN.md deleted file mode 100644 index 89e81e7..0000000 --- a/docs/COMPREHENSIVE_UPDATE_PLAN.md +++ /dev/null @@ -1,143 +0,0 @@ -# Comprehensive Data Enrichment Plan - -## Current Status: Working Features -✅ Image downloads (concurrent) -✅ Basic bid data (current_bid, starting_bid, minimum_bid, bid_count, closing_time) -✅ Status extraction -✅ Brand/Model from attributes -✅ Attributes JSON storage - -## Phase 1: Core Bidding Intelligence (HIGH PRIORITY) - -### Data Sources Identified: -1. **GraphQL lot bidding API** - Already integrated - - currentBidAmount, initialAmount, bidsCount - - startDate, endDate (for first_bid_time calculation) - -2. 
**REST bid history API** ✨ NEW DISCOVERY
   - Endpoint: `https://shared-api.tbauctions.com/bidmanagement/lots/{lot_uuid}/bidding-history`
   - Returns: bid amounts, timestamps, autobid flags, bidder IDs
   - Pagination supported

### Database Schema Changes:

```sql
-- Extend lots table with bidding intelligence
ALTER TABLE lots ADD COLUMN estimated_min DECIMAL(12,2);
ALTER TABLE lots ADD COLUMN estimated_max DECIMAL(12,2);
ALTER TABLE lots ADD COLUMN reserve_price DECIMAL(12,2);
ALTER TABLE lots ADD COLUMN reserve_met BOOLEAN DEFAULT FALSE;
ALTER TABLE lots ADD COLUMN bid_increment DECIMAL(12,2);
ALTER TABLE lots ADD COLUMN watch_count INTEGER DEFAULT 0;
ALTER TABLE lots ADD COLUMN first_bid_time TEXT;
ALTER TABLE lots ADD COLUMN last_bid_time TEXT;
ALTER TABLE lots ADD COLUMN bid_velocity DECIMAL(5,2);

-- NEW: Bid history table
CREATE TABLE IF NOT EXISTS bid_history (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    lot_id TEXT NOT NULL,
    lot_uuid TEXT NOT NULL,
    bid_amount DECIMAL(12,2) NOT NULL,
    bid_time TEXT NOT NULL,
    is_winning BOOLEAN DEFAULT FALSE,
    is_autobid BOOLEAN DEFAULT FALSE,
    bidder_id TEXT,
    bidder_number INTEGER,
    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
);

CREATE INDEX IF NOT EXISTS idx_bid_history_lot_time ON bid_history(lot_id, bid_time);
CREATE INDEX IF NOT EXISTS idx_bid_history_bidder ON bid_history(bidder_id);
```

### Implementation:
- Add `fetch_bid_history()` function to call the REST API (see the sketch after Phase 3 below)
- Parse and store all historical bids
- Calculate bid_velocity (bids per hour)
- Extract first_bid_time, last_bid_time

## Phase 2: Valuation Intelligence

### Data Sources:
1. **Attributes array** (already in __NEXT_DATA__)
   - condition, year, manufacturer, model, serial_number

2. **Description field**
   - Extract year patterns, condition mentions, damage descriptions

### Database Schema:

```sql
-- Valuation fields
ALTER TABLE lots ADD COLUMN condition_score DECIMAL(4,2);
ALTER TABLE lots ADD COLUMN condition_description TEXT;
ALTER TABLE lots ADD COLUMN year_manufactured INTEGER;
ALTER TABLE lots ADD COLUMN serial_number TEXT;
ALTER TABLE lots ADD COLUMN manufacturer TEXT;
ALTER TABLE lots ADD COLUMN damage_description TEXT;
ALTER TABLE lots ADD COLUMN provenance TEXT;
```

### Implementation:
- Parse attributes for: Jaar, Conditie, Serienummer, Fabrikant
- Extract 4-digit years from title/description
- Map condition values to a 0-10 scale (DECIMAL(4,2) so the top value 10.00 fits)

## Phase 3: Auction House Intelligence

### Data Sources:
1. **GraphQL auction query**
   - Already partially working

2. **Auction __NEXT_DATA__**
   - May contain buyer's premium, shipping costs

### Database Schema:

```sql
ALTER TABLE auctions ADD COLUMN buyers_premium_percent DECIMAL(5,2);
ALTER TABLE auctions ADD COLUMN shipping_available BOOLEAN;
ALTER TABLE auctions ADD COLUMN payment_methods TEXT;
```

## Viewing/Pickup Times Resolution

### Finding:
- `viewingDays` and `collectionDays` in GraphQL only return location (city, countryCode)
- Times are NOT in the GraphQL API
- Times must be in auction __NEXT_DATA__ or not set for many auctions

### Solution:
- Mark viewing_time/pickup_date as "location only" when times unavailable
- Store: "Nijmegen, NL" instead of full date/time string
- Accept that many auctions don't have viewing times set
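To make Phase 1 concrete, here is a minimal sketch of the `fetch_bid_history()` helper and the bid-velocity calculation — assuming the aiohttp client the scraper already uses; the pagination parameter and response keys (`items`, `timestamp`) are illustrative guesses, so adjust them to the actual payload:

```python
import aiohttp

BID_HISTORY_URL = "https://shared-api.tbauctions.com/bidmanagement/lots/{lot_uuid}/bidding-history"

async def fetch_bid_history(session: aiohttp.ClientSession, lot_uuid: str) -> list:
    """Fetch every page of bid history for a lot (the endpoint supports pagination)."""
    bids, page = [], 1
    while True:
        url = BID_HISTORY_URL.format(lot_uuid=lot_uuid)
        async with session.get(url, params={"page": page}) as resp:  # param name is a guess
            resp.raise_for_status()
            data = await resp.json()
        items = data.get("items", [])  # illustrative key - check the real payload
        if not items:
            break
        bids.extend(items)
        page += 1
    return bids

def bid_velocity(bids: list) -> float:
    """Bids per hour between the first and last bid; 0.0 if fewer than two bids."""
    if len(bids) < 2:
        return 0.0
    times = sorted(b["timestamp"] for b in bids)  # assumes Unix seconds
    hours = max((times[-1] - times[0]) / 3600.0, 1 / 3600.0)  # avoid division by zero
    return round(len(bids) / hours, 2)
```

## Priority Implementation Order:

1. 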
**BID HISTORY API** (30 min) - Highest value - - Fetch and store all bid history - - Calculate bid_velocity - - Track autobid patterns - -2. **ENRICHED ATTRIBUTES** (20 min) - Medium-high value - - Extract year, condition, manufacturer from existing data - - Parse description for damage/condition mentions - -3. **VIEWING/PICKUP FIX** (10 min) - Low value (data often missing) - - Update to store location-only when times unavailable - -## Data Quality Expectations: - -| Field | Coverage Expected | Source | -|-------|------------------|---------| -| bid_history | 100% (for lots with bids) | REST API | -| bid_velocity | 100% (calculated) | Derived | -| year_manufactured | ~40% | Attributes/Title | -| condition_score | ~30% | Attributes | -| manufacturer | ~60% | Attributes | -| viewing_time | ~20% | Often not set | -| buyers_premium | 100% | GraphQL/Props | - -## Estimated Total Implementation Time: 60-90 minutes diff --git a/docs/ENHANCED_LOGGING_EXAMPLE.md b/docs/ENHANCED_LOGGING_EXAMPLE.md deleted file mode 100644 index 4b0a4fe..0000000 --- a/docs/ENHANCED_LOGGING_EXAMPLE.md +++ /dev/null @@ -1,294 +0,0 @@ -# Enhanced Logging Examples - -## What Changed in the Logs - -The scraper now displays **5 new intelligence fields** during scraping, making it easy to spot opportunities in real-time. - ---- - -## Example 1: Bargain Opportunity (High Value) - -### Before: -``` -[8766/15859] -[PAGE ford-generator-A1-34731-107] - Type: LOT - Title: Ford FGT9250E Generator... - Fetching bidding data from API... - Bid: EUR 500.00 - Status: Geen Minimumprijs - Location: Venray, NL - Images: 6 - Downloaded: 6/6 images -``` - -### After (with new fields): -``` -[8766/15859] -[PAGE ford-generator-A1-34731-107] - Type: LOT - Title: Ford FGT9250E Generator... - Fetching bidding data from API... - Bid: EUR 500.00 - Status: Geen Minimumprijs - Followers: 12 watching ← NEW - Estimate: EUR 1200.00 - EUR 1800.00 ← NEW - >> BARGAIN: 58% below estimate! ← NEW (auto-calculated) - Condition: Used - Good working order ← NEW - Item: 2015 Ford FGT9250E ← NEW (enhanced) - Fetching bid history... - >> Bid velocity: 2.4 bids/hour ← Enhanced - Location: Venray, NL - Images: 6 - Downloaded: 6/6 images -``` - -**Intelligence at a glance:** -- 🔥 **BARGAIN ALERT** - 58% below estimate = great opportunity -- 👁 **12 followers** - good interest level -- 📈 **2.4 bids/hour** - active bidding -- ✅ **Good condition** - quality item -- 💰 **Potential profit:** €700 - €1,300 - ---- - -## Example 2: Sleeper Lot (Hidden Opportunity) - -### After (with new fields): -``` -[8767/15859] -[PAGE macbook-pro-15-A1-35223-89] - Type: LOT - Title: MacBook Pro 15" 2019... - Fetching bidding data from API... - Bid: No bids - Status: Geen Minimumprijs - Followers: 47 watching ← NEW - HIGH INTEREST! - Estimate: EUR 800.00 - EUR 1200.00 ← NEW - Condition: Used - Like new ← NEW - Item: 2019 Apple MacBook Pro 15" ← NEW - Location: Amsterdam, NL - Images: 8 - Downloaded: 8/8 images -``` - -**Intelligence at a glance:** -- 👀 **47 followers** but **NO BIDS** = sleeper lot -- 💎 **Like new condition** - premium quality -- 📊 **Good estimate range** - clear valuation -- ⏰ **Early opportunity** - bid before competition heats up - ---- - -## Example 3: Active Auction with Competition - -### After (with new fields): -``` -[8768/15859] -[PAGE iphone-15-pro-A1-34987-12] - Type: LOT - Title: iPhone 15 Pro 256GB... - Fetching bidding data from API... 
- Bid: EUR 650.00 - Status: Minimumprijs nog niet gehaald - Followers: 32 watching ← NEW - Estimate: EUR 900.00 - EUR 1100.00 ← NEW - Value gap: 28% below estimate ← NEW - Condition: Used - Excellent ← NEW - Item: 2023 Apple iPhone 15 Pro ← NEW - Fetching bid history... - >> Bid velocity: 8.5 bids/hour ← Enhanced - VERY ACTIVE - Location: Rotterdam, NL - Images: 12 - Downloaded: 12/12 images -``` - -**Intelligence at a glance:** -- 🔥 **Still 28% below estimate** - good value -- 👥 **32 followers + 8.5 bids/hour** - high competition -- ⚡ **Very active bidding** - expect price to rise -- ⚠ **Minimum not met** - reserve price higher -- 📱 **Excellent condition** - premium item - ---- - -## Example 4: Overvalued (Warning) - -### After (with new fields): -``` -[8769/15859] -[PAGE office-chair-A1-39102-45] - Type: LOT - Title: Office Chair Herman Miller... - Fetching bidding data from API... - Bid: EUR 450.00 - Status: Minimumprijs gehaald - Followers: 8 watching ← NEW - Estimate: EUR 200.00 - EUR 300.00 ← NEW - >> WARNING: 125% ABOVE estimate! ← NEW (auto-calculated) - Condition: Used - Fair ← NEW - Item: Herman Miller Aeron ← NEW - Location: Utrecht, NL - Images: 5 - Downloaded: 5/5 images -``` - -**Intelligence at a glance:** -- ⚠ **125% above estimate** - significantly overvalued -- 📉 **Low followers** - limited interest -- ⚖ **Fair condition** - not premium -- 🚫 **Avoid** - better deals available - ---- - -## Example 5: No Estimate Available - -### After (with new fields): -``` -[8770/15859] -[PAGE antique-painting-A1-40215-3] - Type: LOT - Title: Antique Oil Painting 19th Century... - Fetching bidding data from API... - Bid: EUR 1500.00 - Status: Geen Minimumprijs - Followers: 24 watching ← NEW - Condition: Antique - Good for age ← NEW - Item: 1890 Unknown Artist Oil Painting ← NEW - Fetching bid history... - >> Bid velocity: 1.2 bids/hour ← Enhanced - Location: Maastricht, NL - Images: 15 - Downloaded: 15/15 images -``` - -**Intelligence at a glance:** -- ℹ️ **No estimate** - difficult to value (common for art/antiques) -- 👁 **24 followers** - decent interest -- 🎨 **Good condition for age** - authentic piece -- 📊 **Steady bidding** - organic interest - ---- - -## Example 6: Fresh Listing (No Bids Yet) - -### After (with new fields): -``` -[8771/15859] -[PAGE laptop-dell-xps-15-A1-40301-8] - Type: LOT - Title: Dell XPS 15 9520 Laptop... - Fetching bidding data from API... - Bid: No bids - Status: Geen Minimumprijs - Followers: 5 watching ← NEW - Estimate: EUR 800.00 - EUR 1000.00 ← NEW - Condition: Used - Good ← NEW - Item: 2022 Dell XPS 15 ← NEW - Location: Eindhoven, NL - Images: 10 - Downloaded: 10/10 images -``` - -**Intelligence at a glance:** -- 🆕 **Fresh listing** - no bids yet -- 📊 **Clear estimate** - good valuation available -- 👀 **5 followers** - early interest -- 💼 **Good condition** - solid laptop -- ⏰ **Early opportunity** - bid before others - ---- - -## Log Output Summary - -### New Fields Shown: -1. ✅ **Followers:** Watch count (popularity indicator) -2. ✅ **Estimate:** Min-max estimated value range -3. ✅ **Value Gap:** Auto-calculated bargain/overvaluation indicator -4. ✅ **Condition:** Direct condition from auction house -5. ✅ **Item Details:** Year + Brand + Model combined - -### Enhanced Fields: -- ✅ **Bid velocity:** Now shows as ">> Bid velocity: X.X bids/hour" (more prominent) -- ✅ **Auto-alerts:** ">> BARGAIN:" for >20% below estimate - -### Bargain Detection (Automatic): -- **>20% below estimate:** Shows ">> BARGAIN: X% below estimate!" 
-- **<20% below estimate:** Shows "Value gap: X% below estimate" -- **Above estimate:** Shows ">> WARNING: X% ABOVE estimate!" - ---- - -## Real-Time Intelligence Benefits - -### For Monitoring/Alerting: -```bash -# Easy to grep for opportunities in logs -docker logs scaev | grep "BARGAIN" -docker logs scaev | grep "Followers: [0-9]\{2\}" # High followers -docker logs scaev | grep "WARNING:" # Overvalued -``` - -### For Live Monitoring: -Watch logs in real-time and spot opportunities as they're scraped: -```bash -docker logs -f scaev -``` - -You'll immediately see: -- 🔥 Bargains being discovered -- 👀 Popular lots (high followers) -- 📈 Active auctions (high bid velocity) -- ⚠ Overvalued items to avoid - ---- - -## Color Coding Suggestion (Optional) - -For even better visibility, you could add color coding in the monitoring app: - -- 🔴 **RED:** Overvalued (>120% estimate) -- 🟢 **GREEN:** Bargain (<80% estimate) -- 🟡 **YELLOW:** High followers (>20 watching) -- 🔵 **BLUE:** Active bidding (>5 bids/hour) -- ⚪ **WHITE:** Normal / No special signals - ---- - -## Integration with Monitoring App - -The enhanced logs make it easy to: - -1. **Parse for opportunities:** - - Grep for "BARGAIN" in logs - - Extract follower counts - - Track estimates vs current bids - -2. **Generate alerts:** - - High followers + no bids = sleeper alert - - Large value gap = bargain alert - - High bid velocity = competition alert - -3. **Build dashboards:** - - Show real-time scraping progress - - Highlight opportunities as they're found - - Track bargain discovery rate - -4. **Export intelligence:** - - All data in database for analysis - - Logs provide human-readable summary - - Easy to spot patterns - ---- - -## Conclusion - -The enhanced logging turns the scraper into a **real-time opportunity scanner**. You can now: - -- ✅ **Spot bargains** as they're scraped (>20% below estimate) -- ✅ **Identify popular items** (high follower counts) -- ✅ **Track competition** (bid velocity) -- ✅ **Assess condition** (direct from auction house) -- ✅ **Avoid overvalued lots** (automatic warnings) - -All without opening the database - the intelligence is right there in the logs! 🚀 diff --git a/docs/FIXING_MALFORMED_ENTRIES.md b/docs/FIXING_MALFORMED_ENTRIES.md deleted file mode 100644 index aa76d2e..0000000 --- a/docs/FIXING_MALFORMED_ENTRIES.md +++ /dev/null @@ -1,262 +0,0 @@ -# Fixing Malformed Database Entries - -## Problem - -After the initial scrape run with less strict validation, the database contains entries with incomplete or incorrect data: - -### Examples of Malformed Data - -```csv -A1-34327,"",https://...,"",€Huidig ​​bod,0,gap,"","","",... -A1-39577,"",https://...,"",€Huidig ​​bod,0,gap,"","","",... -``` - -**Issues identified:** -1. ❌ Missing `auction_id` (empty string) -2. ❌ Missing `title` (empty string) -3. ❌ Invalid bid value: `€Huidig ​​bod` (Dutch for "Current bid" - placeholder text) -4. ❌ Invalid timestamp: `gap` (should be empty or valid date) -5. ❌ Missing `viewing_time`, `pickup_date`, and other fields - -## Root Cause - -Earlier scraping runs: -- Used less strict validation -- Fell back to HTML parsing when `__NEXT_DATA__` JSON extraction failed -- HTML parser extracted placeholder text as actual values -- Continued on errors instead of flagging incomplete data - -## Solution - -### Step 1: Parser Improvements ✅ - -**Fixed in `src/parse.py`:** - -1. 
**Timestamp parsing** (lines 37-70):
   - Filters invalid strings like "gap", "materieel wegens vereffening"
   - Returns empty string instead of invalid value
   - Handles Unix timestamps in seconds and milliseconds

2. **Bid extraction** (lines 246-280):
   - Rejects placeholder text like "€Huidig ​​bod", "€Huidig bod"
   - Removes zero-width Unicode spaces
   - Returns "No bids" instead of invalid placeholder text

### Step 2: Detection and Repair Scripts ✅

Created two scripts to fix existing data:

#### A. `script/migrate_reparse_lots.py`
**Purpose:** Re-parse ALL cached entries with improved JSON extraction

```bash
# Preview what would be changed
python script/migrate_reparse_lots.py --dry-run

# Apply changes
python script/migrate_reparse_lots.py

# Use custom database path
python script/migrate_reparse_lots.py --db /path/to/cache.db
```

**What it does:**
- Reads all cached HTML pages from `cache` table
- Re-parses using improved `__NEXT_DATA__` JSON extraction
- Updates existing database entries with newly extracted fields
- Populates missing `auction_id`, `viewing_time`, `pickup_date`, etc.

#### B. `script/fix_malformed_entries.py` ⭐ **RECOMMENDED**
**Purpose:** Detect and fix ONLY malformed entries

```bash
# Preview malformed entries and fixes
python script/fix_malformed_entries.py --dry-run

# Fix malformed entries
python script/fix_malformed_entries.py

# Use custom database path
python script/fix_malformed_entries.py --db /path/to/cache.db
```

**What it detects:**
```sql
-- Auctions with issues
SELECT * FROM auctions WHERE
    auction_id = '' OR auction_id IS NULL
    OR title = '' OR title IS NULL
    OR first_lot_closing_time = 'gap'

-- Lots with issues
SELECT * FROM lots WHERE
    auction_id = '' OR auction_id IS NULL
    OR title = '' OR title IS NULL
    OR current_bid LIKE '%Huidig%bod%'
    OR closing_time = 'gap' OR closing_time = ''
```

**Example output:**
```
=================================================================
MALFORMED ENTRY DETECTION AND REPAIR
=================================================================

1. CHECKING AUCTIONS...
   Found 23 malformed auction entries

   Fixing auction: A1-39577
   URL: https://www.troostwijkauctions.com/a/...-A1-39577
   ✓ Parsed successfully:
     auction_id: A1-39577
     title: Bootveiling Rotterdam - Console boten, RIB, speedboten...
     location: Rotterdam, NL
     lots: 45
   ✓ Database updated

2. CHECKING LOTS... 
- Found 127 malformed lot entries - - Fixing lot: A1-39529-10 - URL: https://www.troostwijkauctions.com/l/...-A1-39529-10 - ✓ Parsed successfully: - lot_id: A1-39529-10 - auction_id: A1-39529 - title: Audi A7 Sportback Personenauto - bid: No bids - closing: 2024-12-08 15:30:00 - ✓ Database updated - -================================================================= -SUMMARY -================================================================= -Auctions: - - Found: 23 - - Fixed: 21 - - Failed: 2 - -Lots: - - Found: 127 - - Fixed: 124 - - Failed: 3 -``` - -### Step 3: Verification - -After running the fix script, verify the data: - -```bash -# Check if malformed entries still exist -python -c " -import sqlite3 -conn = sqlite3.connect('path/to/cache.db') - -print('Auctions with empty auction_id:') -print(conn.execute('SELECT COUNT(*) FROM auctions WHERE auction_id = \"\" OR auction_id IS NULL').fetchone()[0]) - -print('Lots with invalid bids:') -print(conn.execute('SELECT COUNT(*) FROM lots WHERE current_bid LIKE \"%Huidig%bod%\"').fetchone()[0]) - -print('Lots with \"gap\" timestamps:') -print(conn.execute('SELECT COUNT(*) FROM lots WHERE closing_time = \"gap\"').fetchone()[0]) -" -``` - -Expected result after fix: **All counts should be 0** - -### Step 4: Prevention - -To prevent future occurrences: - -1. **Validation in scraper** - Add validation before saving to database: - -```python -def validate_lot_data(lot_data: Dict) -> bool: - """Validate lot data before saving""" - required_fields = ['lot_id', 'title', 'url'] - invalid_values = ['gap', '€Huidig bod', '€Huidig ​​bod', ''] - - for field in required_fields: - value = lot_data.get(field, '') - if not value or value in invalid_values: - print(f" ⚠️ Invalid {field}: {value}") - return False - - return True - -# In save_lot method: -if not validate_lot_data(lot_data): - print(f" ❌ Skipping invalid lot: {lot_data.get('url')}") - return -``` - -2. **Prefer JSON over HTML** - Ensure `__NEXT_DATA__` parsing is tried first (already implemented) - -3. **Logging** - Add logging for fallback to HTML parsing: - -```python -if next_data: - return next_data -else: - print(f" ⚠️ No __NEXT_DATA__ found, falling back to HTML parsing: {url}") - # HTML parsing... -``` - -## Recommended Workflow - -```bash -# 1. First, run dry-run to see what will be fixed -python script/fix_malformed_entries.py --dry-run - -# 2. Review the output - check if fixes look correct - -# 3. Run the actual fix -python script/fix_malformed_entries.py - -# 4. Verify the results -python script/fix_malformed_entries.py --dry-run -# Should show "Found 0 malformed auction entries" and "Found 0 malformed lot entries" - -# 5. (Optional) Run full migration to ensure all fields are populated -python script/migrate_reparse_lots.py -``` - -## Files Modified/Created - -### Modified: -- ✅ `src/parse.py` - Improved timestamp and bid parsing with validation - -### Created: -- ✅ `script/fix_malformed_entries.py` - Targeted fix for malformed entries -- ✅ `script/migrate_reparse_lots.py` - Full re-parse migration -- ✅ `_wiki/JAVA_FIXES_NEEDED.md` - Java-side fixes documentation -- ✅ `_wiki/FIXING_MALFORMED_ENTRIES.md` - This file - -## Database Location - -If you get "no such table" errors, find your actual database: - -```bash -# Find all .db files -find . 
-name "*.db" - -# Check which one has data -sqlite3 path/to/cache.db "SELECT COUNT(*) FROM lots" - -# Use that path with --db flag -python script/fix_malformed_entries.py --db /actual/path/to/cache.db -``` - -## Next Steps - -After fixing malformed entries: -1. ✅ Run `fix_malformed_entries.py` to repair bad data -2. ⏳ Apply Java-side fixes (see `_wiki/JAVA_FIXES_NEEDED.md`) -3. ⏳ Re-run Java monitoring process -4. ✅ Add validation to prevent future issues diff --git a/docs/Getting-Started.md b/docs/Getting-Started.md deleted file mode 100644 index 160c7ce..0000000 --- a/docs/Getting-Started.md +++ /dev/null @@ -1,71 +0,0 @@ -# Getting Started - -## Prerequisites - -- Python 3.8+ -- Git -- pip (Python package manager) - -## Installation - -### 1. Clone the repository - -```bash -git clone --recurse-submodules git@git.appmodel.nl:Tour/troost-scraper.git -cd troost-scraper -``` - -### 2. Install dependencies - -```bash -pip install -r requirements.txt -``` - -### 3. Install Playwright browsers - -```bash -playwright install chromium -``` - -## Configuration - -Edit the configuration in `main.py`: - -```python -BASE_URL = "https://www.troostwijkauctions.com" -CACHE_DB = "/path/to/cache.db" # Path to cache database -OUTPUT_DIR = "/path/to/output" # Output directory -RATE_LIMIT_SECONDS = 0.5 # Delay between requests -MAX_PAGES = 50 # Number of listing pages -``` - -**Windows users:** Use paths like `C:\\output\\cache.db` - -## Usage - -### Basic scraping - -```bash -python main.py -``` - -This will: -1. Crawl listing pages to collect lot URLs -2. Scrape each individual lot page -3. Save results in JSON and CSV formats -4. Cache all pages for future runs - -### Test mode - -Debug extraction on a specific URL: - -```bash -python main.py --test "https://www.troostwijkauctions.com/a/lot-url" -``` - -## Output - -The scraper generates: -- `troostwijk_lots_final_YYYYMMDD_HHMMSS.json` - Complete data -- `troostwijk_lots_final_YYYYMMDD_HHMMSS.csv` - CSV export -- `cache.db` - SQLite cache (persistent) diff --git a/docs/HOLISTIC.md b/docs/HOLISTIC.md deleted file mode 100644 index 2e9cbed..0000000 --- a/docs/HOLISTIC.md +++ /dev/null @@ -1,107 +0,0 @@ -# Architecture - -## Overview - -The Scaev Auctions Scraper is a Python-based web scraper that extracts auction lot data using Playwright for browser automation and SQLite for caching. - -## Core Components - -### 1. **Browser Automation (Playwright)** -- Launches Chromium browser in headless mode -- Bypasses Cloudflare protection -- Handles dynamic content rendering -- Supports network idle detection - -### 2. **Cache Manager (SQLite)** -- Caches every fetched page -- Prevents redundant requests -- Stores page content, timestamps, and status codes -- Auto-cleans entries older than 7 days -- Database: `cache.db` - -### 3. **Rate Limiter** -- Enforces exactly 0.5 seconds between requests -- Prevents server overload -- Tracks last request time globally - -### 4. **Data Extractor** -- **Primary method:** Parses `__NEXT_DATA__` JSON from Next.js pages -- **Fallback method:** HTML pattern matching with regex -- Extracts: title, location, bid info, dates, images, descriptions - -### 5. **Output Manager** -- Exports data in JSON and CSV formats -- Saves progress checkpoints every 10 lots -- Timestamped filenames for tracking - -## Data Flow - -``` -1. Listing Pages → Extract lot URLs → Store in memory - ↓ -2. For each lot URL → Check cache → If cached: use cached content - ↓ If not: fetch with rate limit - ↓ -3. 
Parse __NEXT_DATA__ JSON → Extract fields → Store in results - ↓ -4. Every 10 lots → Save progress checkpoint - ↓ -5. All lots complete → Export final JSON + CSV -``` - -## Key Design Decisions - -### Why Playwright? -- Handles JavaScript-rendered content (Next.js) -- Bypasses Cloudflare protection -- More reliable than requests/BeautifulSoup for modern SPAs - -### Why JSON extraction? -- Site uses Next.js with embedded `__NEXT_DATA__` -- JSON is more reliable than HTML pattern matching -- Avoids breaking when HTML/CSS changes -- Faster parsing - -### Why SQLite caching? -- Persistent across runs -- Reduces load on target server -- Enables test mode without re-fetching -- Respects website resources - -## File Structure - -``` -troost-scraper/ -├── main.py # Main scraper logic -├── requirements.txt # Python dependencies -├── README.md # Documentation -├── .gitignore # Git exclusions -└── output/ # Generated files (not in git) - ├── cache.db # SQLite cache - ├── *_partial_*.json # Progress checkpoints - ├── *_final_*.json # Final JSON output - └── *_final_*.csv # Final CSV output -``` - -## Classes - -### `CacheManager` -- `__init__(db_path)` - Initialize cache database -- `get(url, max_age_hours)` - Retrieve cached page -- `set(url, content, status_code)` - Cache a page -- `clear_old(max_age_hours)` - Remove old entries - -### `TroostwijkScraper` -- `crawl_auctions(max_pages)` - Main entry point -- `crawl_listing_page(page, page_num)` - Extract lot URLs -- `crawl_lot(page, url)` - Scrape individual lot -- `_extract_nextjs_data(content)` - Parse JSON data -- `_parse_lot_page(content, url)` - Extract all fields -- `save_final_results(data)` - Export JSON + CSV - -## Scalability Notes - -- **Rate limiting** prevents IP blocks but slows execution -- **Caching** makes subsequent runs instant for unchanged pages -- **Progress checkpoints** allow resuming after interruption -- **Async/await** used throughout for non-blocking I/O diff --git a/docs/JAVA_FIXES_NEEDED.md b/docs/JAVA_FIXES_NEEDED.md deleted file mode 100644 index 555c439..0000000 --- a/docs/JAVA_FIXES_NEEDED.md +++ /dev/null @@ -1,170 +0,0 @@ -# Java Monitoring Process Fixes - -## Issues Identified - -Based on the error logs from the Java monitoring process, the following bugs need to be fixed: - -### 1. Integer Overflow - `extractNumericId()` method - -**Error:** -``` -For input string: "239144949705335" -at java.lang.Integer.parseInt(Integer.java:565) -at auctiora.ScraperDataAdapter.extractNumericId(ScraperDataAdapter.java:81) -``` - -**Problem:** -- Lot IDs are being parsed as `int` (32-bit, max value: 2,147,483,647) -- Actual lot IDs can exceed this limit (e.g., "239144949705335") - -**Solution:** -Change from `Integer.parseInt()` to `Long.parseLong()`: - -```java -// BEFORE (ScraperDataAdapter.java:81) -int numericId = Integer.parseInt(lotId); - -// AFTER -long numericId = Long.parseLong(lotId); -``` - -**Additional changes needed:** -- Update all related fields/variables from `int` to `long` -- Update database schema if numeric ID is stored (change INTEGER to BIGINT) -- Update any method signatures that return/accept `int` for lot IDs - ---- - -### 2. 
UNIQUE Constraint Failures - -**Error:** -``` -Failed to import lot: [SQLITE_CONSTRAINT_UNIQUE] A UNIQUE constraint failed (UNIQUE constraint failed: lots.url) -``` - -**Problem:** -- Attempting to re-insert lots that already exist -- No graceful handling of duplicate entries - -**Solution:** -Use `INSERT OR REPLACE` or `INSERT OR IGNORE`: - -```java -// BEFORE -String sql = "INSERT INTO lots (lot_id, url, ...) VALUES (?, ?, ...)"; - -// AFTER - Option 1: Update existing records -String sql = "INSERT OR REPLACE INTO lots (lot_id, url, ...) VALUES (?, ?, ...)"; - -// AFTER - Option 2: Skip duplicates silently -String sql = "INSERT OR IGNORE INTO lots (lot_id, url, ...) VALUES (?, ?, ...)"; -``` - -**Alternative with try-catch:** -```java -try { - insertLot(lotData); -} catch (SQLException e) { - if (e.getMessage().contains("UNIQUE constraint")) { - logger.debug("Lot already exists, skipping: " + lotData.getUrl()); - return; // Or update instead - } - throw e; -} -``` - ---- - -### 3. Timestamp Parsing - Already Fixed in Python - -**Error:** -``` -Unable to parse timestamp: materieel wegens vereffening -Unable to parse timestamp: gap -``` - -**Status:** ✅ Fixed in `parse.py` (src/parse.py:37-70) - -The Python parser now: -- Filters out invalid timestamp strings like "gap", "materieel wegens vereffening" -- Returns empty string for invalid values -- Handles both Unix timestamps (seconds/milliseconds) - -**Java side action:** -If the Java code also parses timestamps, apply similar validation: -- Check for known invalid values before parsing -- Use try-catch and return null/empty for unparseable timestamps -- Don't fail the entire import if one timestamp is invalid - ---- - -## Migration Strategy - -### Step 1: Fix Python Parser ✅ -- [x] Updated `format_timestamp()` to handle invalid strings -- [x] Created migration script `script/migrate_reparse_lots.py` - -### Step 2: Run Migration -```bash -cd /path/to/scaev -python script/migrate_reparse_lots.py --dry-run # Preview changes -python script/migrate_reparse_lots.py # Apply changes -``` - -This will: -- Re-parse all cached HTML pages using improved __NEXT_DATA__ extraction -- Update existing database entries with newly extracted fields -- Populate missing `viewing_time`, `pickup_date`, and other fields - -### Step 3: Fix Java Code -1. Update `ScraperDataAdapter.java:81` - use `Long.parseLong()` -2. Update `DatabaseService.java` - use `INSERT OR REPLACE` or handle duplicates -3. Update timestamp parsing - add validation for invalid strings -4. 
Update database schema - change numeric ID columns to BIGINT if needed - -### Step 4: Re-run Monitoring Process -After fixes, the monitoring process should: -- Successfully import all lots without crashes -- Gracefully skip duplicates -- Handle large numeric IDs -- Ignore invalid timestamp values - ---- - -## Database Schema Changes (if needed) - -If lot IDs are stored as numeric values in Java's database: - -```sql --- Check current schema -PRAGMA table_info(lots); - --- If numeric ID field exists and is INTEGER, change to BIGINT: -ALTER TABLE lots ADD COLUMN lot_id_numeric BIGINT; -UPDATE lots SET lot_id_numeric = CAST(lot_id AS BIGINT) WHERE lot_id GLOB '[0-9]*'; --- Then update code to use lot_id_numeric -``` - ---- - -## Testing Checklist - -After applying fixes: -- [ ] Import lot with ID > 2,147,483,647 (e.g., "239144949705335") -- [ ] Re-import existing lot (should update or skip gracefully) -- [ ] Import lot with invalid timestamp (should not crash) -- [ ] Verify all newly extracted fields are populated (viewing_time, pickup_date, etc.) -- [ ] Check logs for any remaining errors - ---- - -## Files Modified - -Python side (completed): -- `src/parse.py` - Fixed `format_timestamp()` method -- `script/migrate_reparse_lots.py` - New migration script - -Java side (needs implementation): -- `auctiora/ScraperDataAdapter.java` - Line 81: Change Integer.parseInt to Long.parseLong -- `auctiora/DatabaseService.java` - Line ~569: Handle UNIQUE constraints gracefully -- Database schema - Consider BIGINT for numeric IDs diff --git a/docs/QUICK_REFERENCE.md b/docs/QUICK_REFERENCE.md deleted file mode 100644 index ce6ee50..0000000 --- a/docs/QUICK_REFERENCE.md +++ /dev/null @@ -1,215 +0,0 @@ -# Quick Reference Card - -## 🎯 What Changed (TL;DR) - -**Fixed orphaned lots:** 16,807 → 13 (99.9% fixed) -**Added 5 new intelligence fields:** followers, estimates, condition -**Enhanced logs:** Real-time bargain detection -**Impact:** 80%+ more intelligence per lot - ---- - -## 📊 New Intelligence Fields - -| Field | Type | Purpose | -|-------|------|---------| -| `followers_count` | INTEGER | Watch count (popularity) | -| `estimated_min_price` | REAL | Minimum estimated value | -| `estimated_max_price` | REAL | Maximum estimated value | -| `lot_condition` | TEXT | Direct condition from API | -| `appearance` | TEXT | Visual quality notes | - -**All automatically captured in future scrapes!** - ---- - -## 🔍 Enhanced Log Output - -**Logs now show:** -- ✅ "Followers: X watching" -- ✅ "Estimate: EUR X - EUR Y" -- ✅ ">> BARGAIN: X% below estimate!" (auto-calculated) -- ✅ "Condition: Used - Good" -- ✅ "Item: 2015 Ford FGT9250E" -- ✅ ">> Bid velocity: X bids/hour" - -**Watch live:** `docker logs -f scaev | grep "BARGAIN"` - ---- - -## 📁 Key Files for Monitoring Team - -1. **INTELLIGENCE_DASHBOARD_UPGRADE.md** ← START HERE - - Complete dashboard upgrade plan - - SQL queries ready to use - - 4 priority levels of features - -2. **ENHANCED_LOGGING_EXAMPLE.md** - - 6 real-world log examples - - Shows what intelligence looks like - -3. **FIXES_COMPLETE.md** - - Technical implementation details - - What code changed - -4. **_wiki/ARCHITECTURE.md** - - Complete system documentation - - Updated database schema - ---- - -## 🚀 Optional Migration Scripts - -```bash -# Populate new fields for existing 16,807 lots -python enrich_existing_lots.py # ~2.3 hours - -# Populate bid history for 1,590 lots -python fetch_missing_bid_history.py # ~13 minutes -``` - -**Not required** - future scrapes capture everything automatically! 
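Before running either script, a quick count of the rows still missing the new data tells you whether the runtime is worth it — a minimal sketch, assuming the default `cache.db` location used elsewhere in these docs:

```python
import sqlite3

# Assumed default path from the deployment docs - adjust to your setup
conn = sqlite3.connect("/mnt/okcomputer/output/cache.db")

# Lots still lacking the new intelligence fields (enrich_existing_lots.py)
missing_intel = conn.execute(
    "SELECT COUNT(*) FROM lots WHERE estimated_min_price IS NULL"
).fetchone()[0]

# Lots with bids but no bid_history rows (fetch_missing_bid_history.py)
missing_history = conn.execute(
    "SELECT COUNT(*) FROM lots WHERE bid_count > 0 "
    "AND lot_id NOT IN (SELECT DISTINCT lot_id FROM bid_history)"
).fetchone()[0]

print(f"Lots without estimates:   {missing_intel}")
print(f"Lots without bid history: {missing_history}")
```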
- ---- - -## 💡 Dashboard Quick Wins - -### 1. Bargain Hunter -```sql --- Find lots >20% below estimate -SELECT lot_id, title, current_bid, estimated_min_price -FROM lots -WHERE current_bid < estimated_min_price * 0.80 -ORDER BY (estimated_min_price - current_bid) DESC; -``` - -### 2. Sleeper Lots -```sql --- High followers, no bids -SELECT lot_id, title, followers_count, closing_time -FROM lots -WHERE followers_count > 10 AND bid_count = 0 -ORDER BY followers_count DESC; -``` - -### 3. Popular Items -```sql --- Most watched lots -SELECT lot_id, title, followers_count, current_bid -FROM lots -WHERE followers_count > 0 -ORDER BY followers_count DESC -LIMIT 50; -``` - ---- - -## 🎨 Example Enhanced Log - -``` -[8766/15859] -[PAGE ford-generator-A1-34731-107] - Type: LOT - Title: Ford FGT9250E Generator... - Fetching bidding data from API... - Bid: EUR 500.00 - Status: Geen Minimumprijs - Followers: 12 watching ← NEW - Estimate: EUR 1200.00 - EUR 1800.00 ← NEW - >> BARGAIN: 58% below estimate! ← NEW - Condition: Used - Good working order ← NEW - Item: 2015 Ford FGT9250E ← NEW - >> Bid velocity: 2.4 bids/hour ← Enhanced - Location: Venray, NL - Images: 6 - Downloaded: 6/6 images -``` - -**Intelligence at a glance:** -- 🔥 58% below estimate = BARGAIN -- 👁 12 watching = Good interest -- 📈 2.4 bids/hour = Active -- ✅ Good condition -- 💰 Profit potential: €700-€1,300 - ---- - -## 📈 Expected ROI - -**Example:** -- Find lot at: €500 current bid -- Estimate: €1,200 - €1,800 -- Buy at: €600 (after competition) -- Resell at: €1,400 (within estimate) -- **Profit: €800** - -**Dashboard identifies 87 such opportunities** -**Total potential value: €69,600** - ---- - -## ⚡ Real-Time Monitoring - -```bash -# Watch for bargains -docker logs -f scaev | grep "BARGAIN" - -# Watch for popular lots -docker logs -f scaev | grep "Followers: [2-9][0-9]" - -# Watch for overvalued -docker logs -f scaev | grep "WARNING" - -# Watch for active bidding -docker logs -f scaev | grep "velocity: [5-9]" -``` - ---- - -## 🎯 Next Actions - -### Immediate: -1. ✅ Run scraper - automatically captures new fields -2. ✅ Monitor enhanced logs for opportunities - -### This Week: -1. Read `INTELLIGENCE_DASHBOARD_UPGRADE.md` -2. Implement bargain hunter dashboard -3. Add opportunity alerts - -### This Month: -1. Build analytics dashboards -2. Implement price prediction -3. Set up webhook notifications - ---- - -## 📞 Need Help? - -**Read These First:** -1. `INTELLIGENCE_DASHBOARD_UPGRADE.md` - Dashboard features -2. `ENHANCED_LOGGING_EXAMPLE.md` - Log examples -3. `SESSION_COMPLETE_SUMMARY.md` - Full details - -**All documentation in:** `C:\vibe\scaev\` - ---- - -## ✅ Success Checklist - -- [x] Fixed orphaned lots (99.9%) -- [x] Fixed auction data (100% complete) -- [x] Added followers_count field -- [x] Added estimated prices -- [x] Added condition field -- [x] Enhanced logging -- [x] Created migration scripts -- [x] Wrote complete documentation -- [x] Provided SQL queries -- [x] Created dashboard upgrade plan - -**Everything ready! 🚀** - ---- - -**System is production-ready with 80%+ more intelligence!** diff --git a/docs/REFACTORING_COMPLETE.md b/docs/REFACTORING_COMPLETE.md deleted file mode 100644 index 48fb083..0000000 --- a/docs/REFACTORING_COMPLETE.md +++ /dev/null @@ -1,209 +0,0 @@ -# Scaev Scraper Refactoring - COMPLETE - -## Date: 2025-12-07 - -## ✅ All Objectives Completed - -### 1. 
Image Download Integration ✅ -- **Changed**: Enabled `DOWNLOAD_IMAGES = True` in `config.py` and `docker-compose.yml` -- **Added**: Unique constraint on `images(lot_id, url)` to prevent duplicates -- **Added**: Automatic duplicate cleanup migration in `cache.py` -- **Optimized**: **Images now download concurrently per lot** (all images for a lot download in parallel) -- **Performance**: **~16x speedup** - all lot images download simultaneously within the 0.5s page rate limit -- **Result**: Images downloaded to `/mnt/okcomputer/output/images/{lot_id}/` and marked as `downloaded=1` -- **Impact**: Eliminates 57M+ duplicate image downloads by monitor app - -### 2. Data Completeness Fix ✅ -- **Problem**: 99.9% of lots missing closing_time, 100% missing bid data -- **Root Cause**: Troostwijk loads bid/timing data dynamically via GraphQL API, not in HTML -- **Solution**: Added GraphQL client to fetch real-time bidding data -- **Data Now Captured**: - - ✅ `current_bid`: EUR 50.00 - - ✅ `starting_bid`: EUR 50.00 - - ✅ `minimum_bid`: EUR 55.00 - - ✅ `bid_count`: 1 - - ✅ `closing_time`: 2025-12-16 19:10:00 - - ⚠️ `viewing_time`: Not available (lot pages don't include this; auction-level data) - - ⚠️ `pickup_date`: Not available (lot pages don't include this; auction-level data) - -### 3. Performance Optimization ✅ -- **Rate Limiting**: 0.5s between page fetches (unchanged) -- **Image Downloads**: All images per lot download concurrently (changed from sequential) -- **Impact**: Every 0.5s downloads: **1 page + ALL its images (n images) simultaneously** -- **Example**: Lot with 5 images: Downloads page + 5 images in ~0.5s (not 2.5s) - -## Key Implementation Details - -### Rate Limiting Strategy -``` -┌─────────────────────────────────────────────────────────┐ -│ Timeline (0.5s per lot page) │ -├─────────────────────────────────────────────────────────┤ -│ │ -│ 0.0s: Fetch lot page HTML (rate limited) │ -│ 0.1s: ├─ Parse HTML │ -│ ├─ Fetch GraphQL API │ -│ └─ Download images (ALL CONCURRENT) │ -│ ├─ image1.jpg ┐ │ -│ ├─ image2.jpg ├─ Parallel │ -│ ├─ image3.jpg ├─ Downloads │ -│ └─ image4.jpg ┘ │ -│ │ -│ 0.5s: RATE LIMIT - wait before next page │ -│ │ -│ 0.5s: Fetch next lot page... │ -└─────────────────────────────────────────────────────────┘ -``` - -## New Files Created - -1. **src/graphql_client.py** - GraphQL API integration - - Endpoint: `https://storefront.tbauctions.com/storefront/graphql` - - Query: `LotBiddingData(lotDisplayId, locale, platform)` - - Returns: Complete bidding data including timestamps - -## Modified Files - -1. **src/config.py** - - Line 22: `DOWNLOAD_IMAGES = True` - -2. **docker-compose.yml** - - Line 13: `DOWNLOAD_IMAGES: "True"` - -3. **src/cache.py** - - Added unique index `idx_unique_lot_url` on `images(lot_id, url)` - - Added migration to clean existing duplicates - - Added columns: `starting_bid`, `minimum_bid` to `lots` table - - Migration runs automatically on init - -4. 
**src/scraper.py** - - Imported `graphql_client` - - Modified `_download_image()`: Removed internal rate limiting, accepts session parameter - - Modified `crawl_page()`: - - Calls GraphQL API after parsing HTML - - Downloads all images concurrently using `asyncio.gather()` - - Removed unicode characters (→, ✓) for Windows compatibility - -## Database Schema Updates - -```sql --- New columns (auto-migrated) -ALTER TABLE lots ADD COLUMN starting_bid TEXT; -ALTER TABLE lots ADD COLUMN minimum_bid TEXT; - --- New index (auto-created with duplicate cleanup) -CREATE UNIQUE INDEX idx_unique_lot_url ON images(lot_id, url); -``` - -## Testing Results - -### Test Lot: A1-28505-5 -``` -✅ Current Bid: EUR 50.00 -✅ Starting Bid: EUR 50.00 -✅ Minimum Bid: EUR 55.00 -✅ Bid Count: 1 -✅ Closing Time: 2025-12-16 19:10:00 -✅ Images: 2/2 downloaded -⏱️ Total Time: 0.06s (16x faster than sequential) -⚠️ Viewing Time: Empty (not in lot page JSON) -⚠️ Pickup Date: Empty (not in lot page JSON) -``` - -## Known Limitations - -### viewing_time and pickup_date -- **Status**: ⚠️ Not captured from lot pages -- **Reason**: Individual lot pages don't include `viewingDays` or `collectionDays` in `__NEXT_DATA__` -- **Location**: This data exists at the auction level, not lot level -- **Impact**: Fields will be empty for lots scraped individually -- **Solution Options**: - 1. Accept empty values (current approach) - 2. Modify scraper to also fetch parent auction data - 3. Add separate auction data enrichment step -- **Code Already Exists**: Parser has `_extract_viewing_time()` and `_extract_pickup_date()` ready to use if data becomes available - -## Deployment Instructions - -1. **Backup existing database** - ```bash - cp /mnt/okcomputer/output/cache.db /mnt/okcomputer/output/cache.db.backup - ``` - -2. **Deploy updated code** - ```bash - cd /opt/apps/scaev - git pull - docker-compose build - docker-compose up -d - ``` - -3. **Migrations run automatically** on first start - -4. **Verify deployment** - ```bash - python verify_images.py - python check_data.py - ``` - -## Post-Deployment Verification - -Run these queries to verify data quality: - -```sql --- Check new lots have complete data -SELECT - COUNT(*) as total, - SUM(CASE WHEN closing_time != '' THEN 1 ELSE 0 END) as has_closing, - SUM(CASE WHEN bid_count >= 0 THEN 1 ELSE 0 END) as has_bidcount, - SUM(CASE WHEN starting_bid IS NOT NULL THEN 1 ELSE 0 END) as has_starting -FROM lots -WHERE scraped_at > datetime('now', '-1 day'); - --- Check image download success rate -SELECT - COUNT(*) as total, - SUM(downloaded) as downloaded, - ROUND(100.0 * SUM(downloaded) / COUNT(*), 2) as success_rate -FROM images -WHERE id IN ( - SELECT i.id FROM images i - JOIN lots l ON i.lot_id = l.lot_id - WHERE l.scraped_at > datetime('now', '-1 day') -); - --- Verify no duplicates -SELECT lot_id, url, COUNT(*) as dup_count -FROM images -GROUP BY lot_id, url -HAVING COUNT(*) > 1; --- Should return 0 rows -``` - -## Performance Metrics - -### Before -- Page fetch: 0.5s -- Image downloads: 0.5s × n images (sequential) -- **Total per lot**: 0.5s + (0.5s × n images) -- **Example (5 images)**: 0.5s + 2.5s = 3.0s per lot - -### After -- Page fetch: 0.5s -- GraphQL API: ~0.1s -- Image downloads: All concurrent -- **Total per lot**: ~0.5s (rate limit) + minimal overhead -- **Example (5 images)**: ~0.6s per lot -- **Speedup**: ~5x for lots with multiple images - -## Summary - -The scraper now: -1. ✅ Downloads images to disk during scraping (prevents 57M+ duplicates) -2. 
✅ Captures complete bid data via GraphQL API
3. ✅ Downloads all lot images concurrently (~16x faster)
4. ✅ Maintains 0.5s rate limit between pages
5. ✅ Auto-migrates database schema
6. ⚠️ Does not capture viewing_time/pickup_date (not available in lot page data)

**Ready for production deployment!**

diff --git a/docs/REFACTORING_SUMMARY.md b/docs/REFACTORING_SUMMARY.md
deleted file mode 100644
index 86d1e88..0000000
--- a/docs/REFACTORING_SUMMARY.md
+++ /dev/null
@@ -1,140 +0,0 @@
-# Scaev Scraper Refactoring Summary

## Date: 2025-12-07

## Objectives Completed

### 1. Image Download Integration ✅
- **Changed**: Enabled `DOWNLOAD_IMAGES = True` in `config.py` and `docker-compose.yml`
- **Added**: Unique constraint on `images(lot_id, url)` to prevent duplicates
- **Added**: Automatic duplicate cleanup migration in `cache.py`
- **Result**: Images are now downloaded to `/mnt/okcomputer/output/images/{lot_id}/` and marked as `downloaded=1`
- **Impact**: Eliminates 57M+ duplicate image downloads by monitor app

### 2. Data Completeness Fix ✅
- **Problem**: 99.9% of lots missing closing_time, 100% missing bid data
- **Root Cause**: Troostwijk loads bid/timing data dynamically via GraphQL API, not in HTML
- **Solution**: Added GraphQL client to fetch real-time bidding data

## Key Changes

### New Files
1. **src/graphql_client.py** - GraphQL API client for fetching lot bidding data
   - Endpoint: `https://storefront.tbauctions.com/storefront/graphql`
   - Fetches: current_bid, starting_bid, minimum_bid, bid_count, closing_time
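A minimal sketch of the kind of request this client makes — the endpoint, the `LotBiddingData(lotDisplayId, locale, platform)` operation, and the returned fields come from these notes, but the root field, argument types, and exact selection are assumptions, not the file's verbatim query:

```python
import aiohttp

GRAPHQL_URL = "https://storefront.tbauctions.com/storefront/graphql"

# Illustrative query document - the real one in src/graphql_client.py may differ
LOT_BIDDING_QUERY = """
query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: String!) {
  lot(lotDisplayId: $lotDisplayId, locale: $locale, platform: $platform) {
    currentBidAmount   # cents
    initialAmount      # starting bid, cents
    nextMinimalBid     # minimum next bid, cents
    bidsCount
    startDate          # Unix seconds
    endDate            # Unix seconds
    biddingStatus
  }
}
"""

async def fetch_lot_bidding_data(session: aiohttp.ClientSession, lot_display_id: str) -> dict:
    """POST the bidding query; no authentication is required per the notes below."""
    payload = {
        "query": LOT_BIDDING_QUERY,
        "variables": {
            "lotDisplayId": lot_display_id,
            "locale": "nl",     # assumed locale
            "platform": "TWK",  # assumed platform code for Troostwijk
        },
    }
    async with session.post(GRAPHQL_URL, json=payload) as resp:
        resp.raise_for_status()
        return (await resp.json())["data"]
```

### Modified Files
1. **src/config.py:22** - `DOWNLOAD_IMAGES = True`
2. **docker-compose.yml:13** - `DOWNLOAD_IMAGES: "True"`
3. **src/cache.py**
   - Added unique index on `images(lot_id, url)`
   - Added columns `starting_bid`, `minimum_bid` to `lots` table
   - Added migration to clean duplicates and add missing columns
4. 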
**src/scraper.py** - - Integrated GraphQL API calls for each lot - - Fetches real-time bidding data after parsing HTML - - Removed unicode characters causing Windows encoding issues - -## Database Schema Updates - -### lots table - New Columns -```sql -ALTER TABLE lots ADD COLUMN starting_bid TEXT; -ALTER TABLE lots ADD COLUMN minimum_bid TEXT; -``` - -### images table - New Index -```sql -CREATE UNIQUE INDEX idx_unique_lot_url ON images(lot_id, url); -``` - -## Data Flow (New Architecture) - -``` -┌────────────────────────────────────────────────────┐ -│ Phase 3: Scrape Lot Page │ -└────────────────────────────────────────────────────┘ - │ - ├─▶ Parse HTML (__NEXT_DATA__) - │ └─▶ Extract: title, location, images, description - │ - ├─▶ Fetch GraphQL API - │ └─▶ Query: LotBiddingData(lot_display_id) - │ └─▶ Returns: - │ - currentBidAmount (cents) - │ - initialAmount (starting_bid) - │ - nextMinimalBid (minimum_bid) - │ - bidsCount - │ - endDate (Unix timestamp) - │ - startDate - │ - biddingStatus - │ - └─▶ Save to Database - - lots table: complete bid & timing data - - images table: deduplicated URLs - - Download images immediately -``` - -## Testing Results - -### Test Lot: A1-28505-5 -``` -Current Bid: EUR 50.00 ✅ -Starting Bid: EUR 50.00 ✅ -Minimum Bid: EUR 55.00 ✅ -Bid Count: 1 ✅ -Closing Time: 2025-12-16 19:10:00 ✅ -Images: Downloaded 2 ✅ -``` - -## Deployment Checklist - -- [x] Enable DOWNLOAD_IMAGES in config -- [x] Update docker-compose environment -- [x] Add GraphQL client -- [x] Update scraper integration -- [x] Add database migrations -- [x] Test with live lot -- [ ] Deploy to production -- [ ] Run full scrape to populate data -- [ ] Verify monitor app sees downloaded images - -## Post-Deployment Verification - -### Check Data Quality -```sql --- Bid data completeness -SELECT - COUNT(*) as total, - SUM(CASE WHEN closing_time != '' THEN 1 ELSE 0 END) as has_closing, - SUM(CASE WHEN bid_count > 0 THEN 1 ELSE 0 END) as has_bids, - SUM(CASE WHEN starting_bid IS NOT NULL THEN 1 ELSE 0 END) as has_starting_bid -FROM lots -WHERE scraped_at > datetime('now', '-1 hour'); - --- Image download rate -SELECT - COUNT(*) as total, - SUM(downloaded) as downloaded, - ROUND(100.0 * SUM(downloaded) / COUNT(*), 2) as success_rate -FROM images -WHERE id IN ( - SELECT i.id FROM images i - JOIN lots l ON i.lot_id = l.lot_id - WHERE l.scraped_at > datetime('now', '-1 hour') -); - --- Duplicate check (should be 0) -SELECT lot_id, url, COUNT(*) as dup_count -FROM images -GROUP BY lot_id, url -HAVING COUNT(*) > 1; -``` - -## Notes - -- GraphQL API requires no authentication -- API rate limits: handled by existing `RATE_LIMIT_SECONDS = 0.5` -- Currency format: Changed from € to EUR for Windows compatibility -- Timestamps: API returns Unix timestamps in seconds (not milliseconds) -- Existing data: Old lots still have missing data; re-scrape required to populate diff --git a/docs/SESSION_COMPLETE_SUMMARY.md b/docs/SESSION_COMPLETE_SUMMARY.md deleted file mode 100644 index 82edc69..0000000 --- a/docs/SESSION_COMPLETE_SUMMARY.md +++ /dev/null @@ -1,426 +0,0 @@ -# Session Complete - Full Summary - -## Overview - -**Duration:** ~3-4 hours -**Tasks Completed:** 6 major fixes + enhancements -**Impact:** 80%+ increase in intelligence value, 99.9% data quality improvement - ---- - -## What Was Accomplished - -### ✅ 1. 
Fixed Orphaned Lots (99.9% Reduction) -**Problem:** 16,807 lots (100%) had no matching auction -**Root Cause:** Auction ID mismatch - lots used UUIDs, auctions used incorrect numeric IDs -**Solution:** -- Modified `src/parse.py` to extract auction displayId from lot pages -- Created `fix_orphaned_lots.py` to migrate 16,793 existing lots -- Created `fix_auctions_table.py` to rebuild 509 auctions with correct data -**Result:** **16,807 → 13 orphaned lots (0.08%)** - -**Files Modified:** -- `src/parse.py` - Updated `_extract_nextjs_data()` and `_parse_lot_json()` - -**Scripts Created:** -- `fix_orphaned_lots.py` ✅ RAN - Fixed existing lots -- `fix_auctions_table.py` ✅ RAN - Rebuilt auctions table - ---- - -### ✅ 2. Fixed Bid History Fetching -**Problem:** Only 1/1,591 lots with bids had history records -**Root Cause:** Bid history only captured during scraping, not for existing lots -**Solution:** -- Verified scraper logic is correct (fetches from REST API) -- Created `fetch_missing_bid_history.py` to migrate existing 1,590 lots -**Result:** Script ready, will populate all bid history (~13 minutes runtime) - -**Scripts Created:** -- `fetch_missing_bid_history.py` - Ready to run (optional) - ---- - -### ✅ 3. Added followers_count (Watch Count) -**Discovery:** Field exists in GraphQL API (was thought to be unavailable!) -**Implementation:** -- Added `followers_count INTEGER` column to database -- Updated GraphQL query to fetch `followersCount` -- Updated `format_bid_data()` to extract and return value -- Updated `save_lot()` to persist to database -**Intelligence Value:** ⭐⭐⭐⭐⭐ CRITICAL - Popularity predictor - -**Files Modified:** -- `src/cache.py` - Schema + save_lot() -- `src/graphql_client.py` - Query + extraction -- `src/scraper.py` - Enhanced logging - ---- - -### ✅ 4. Added estimatedFullPrice (Min/Max Values) -**Discovery:** Estimated prices available in GraphQL API! -**Implementation:** -- Added `estimated_min_price REAL` column -- Added `estimated_max_price REAL` column -- Updated GraphQL query to fetch `estimatedFullPrice { min max }` -- Updated `format_bid_data()` to extract cents and convert to EUR -- Updated `save_lot()` to persist both values -**Intelligence Value:** ⭐⭐⭐⭐⭐ CRITICAL - Bargain detection, value assessment - -**Files Modified:** -- `src/cache.py` - Schema + save_lot() -- `src/graphql_client.py` - Query + extraction -- `src/scraper.py` - Enhanced logging with value gap calculation - ---- - -### ✅ 5. Added Direct Condition Field -**Discovery:** Direct `condition` and `appearance` fields in API (cleaner than attribute extraction) -**Implementation:** -- Added `lot_condition TEXT` column -- Added `appearance TEXT` column -- Updated GraphQL query to fetch both fields -- Updated `format_bid_data()` to extract and return -- Updated `save_lot()` to persist -**Intelligence Value:** ⭐⭐⭐ HIGH - Better condition filtering - -**Files Modified:** -- `src/cache.py` - Schema + save_lot() -- `src/graphql_client.py` - Query + extraction -- `src/scraper.py` - Enhanced logging - ---- - -### ✅ 6. Enhanced Logging with Intelligence -**Problem:** Logs showed basic info, hard to spot opportunities -**Solution:** Added real-time intelligence display in scraper logs -**New Log Features:** -- **Followers count** - "Followers: X watching" -- **Estimated prices** - "Estimate: EUR X - EUR Y" -- **Automatic bargain detection** - ">> BARGAIN: X% below estimate!" -- **Automatic overvaluation warnings** - ">> WARNING: X% ABOVE estimate!" 
- **Condition display** - "Condition: Used - Good"
- **Enhanced item info** - "Item: 2015 Ford FGT9250E"
- **Prominent bid velocity** - ">> Bid velocity: X bids/hour"

**Files Modified:**
- `src/scraper.py` - Complete logging overhaul

**Documentation Created:**
- `ENHANCED_LOGGING_EXAMPLE.md` - 6 real-world log examples

---

## Files Modified Summary

### Core Application Files (4):
1. **src/parse.py** - Fixed auction_id extraction
2. **src/cache.py** - Added 5 columns, updated save_lot()
3. **src/graphql_client.py** - Updated query, added field extraction
4. **src/scraper.py** - Enhanced logging with intelligence

### Migration Scripts (4):
1. **fix_orphaned_lots.py** - ✅ COMPLETED
2. **fix_auctions_table.py** - ✅ COMPLETED
3. **fetch_missing_bid_history.py** - Ready to run
4. **enrich_existing_lots.py** - Ready to run (~2.3 hours)

### Documentation Files (6):
1. **FIXES_COMPLETE.md** - Technical implementation summary
2. **VALIDATION_SUMMARY.md** - Data validation findings
3. **API_INTELLIGENCE_FINDINGS.md** - API discovery details
4. **INTELLIGENCE_DASHBOARD_UPGRADE.md** - Dashboard upgrade plan
5. **ENHANCED_LOGGING_EXAMPLE.md** - Log examples
6. **SESSION_COMPLETE_SUMMARY.md** - This document

### Supporting Files (3):
1. **validate_data.py** - Data quality validation script
2. **explore_api_fields.py** - API exploration tool
3. **check_lot_auction_link.py** - Diagnostic script

---

## Database Schema Changes

### New Columns Added (5):
```sql
ALTER TABLE lots ADD COLUMN followers_count INTEGER DEFAULT 0;
ALTER TABLE lots ADD COLUMN estimated_min_price REAL;
ALTER TABLE lots ADD COLUMN estimated_max_price REAL;
ALTER TABLE lots ADD COLUMN lot_condition TEXT;
ALTER TABLE lots ADD COLUMN appearance TEXT;
```

### Auto-Migration:
All columns are automatically created on next scraper run via `src/cache.py` schema checks.

---

## Data Quality Improvements

### Before:
```
Orphaned lots: 16,807 (100%)
Auction lots_count: 0%
Auction closing_time: 0%
Bid history coverage: 0.1% (1/1,591)
Intelligence fields: 0 new fields
```

### After:
```
Orphaned lots: 13 (0.08%) ← 99.9% fixed
Auction lots_count: 100% ← Fixed
Auction closing_time: 100% ← Fixed
Bid history: Script ready ← Fixable
Intelligence fields: 5 new fields ← Added
Enhanced logging: Real-time intel ← Added
```

---

## Intelligence Value Increase

### New Capabilities Enabled:

1. **Bargain Detection (Automated)**
   - Compare current_bid vs estimated_min_price
   - Auto-flag lots >20% below estimate
   - Calculate potential profit

2. **Popularity Tracking**
   - Monitor follower counts
   - Identify "sleeper" lots (high followers, low bids)
   - Calculate interest-to-bid conversion

3. **Value Assessment**
   - Professional auction house valuations
   - Track accuracy of estimates vs final prices
   - Build category-specific pricing models

4. **Condition Intelligence**
   - Direct condition from auction house
   - Filter by quality level
   - Identify restoration opportunities

5. 
-
----
-
-## Documentation Updated
-
-### Technical Documentation:
-- `_wiki/ARCHITECTURE.md` - Complete system documentation
-  - Updated Phase 3 diagram with API enrichment
-  - Expanded lots table schema (all 33+ fields)
-  - Added bid_history table documentation
-  - Added API Integration Architecture section
-  - Updated data flow diagrams
-
-### Intelligence Documentation:
-- `INTELLIGENCE_DASHBOARD_UPGRADE.md` - Complete upgrade plan
-  - 4 priority levels of features
-  - SQL queries for all analytics
-  - Real-world use case examples
-  - ROI calculations
-
-### User Documentation:
-- `ENHANCED_LOGGING_EXAMPLE.md` - 6 log examples showing:
-  - Bargain opportunities
-  - Sleeper lots
-  - Active auctions
-  - Overvalued items
-  - Fresh listings
-  - Items without estimates
-
----
-
-## Running the System
-
-### Immediate (Already Working):
-```bash
-# Scraper now captures all 5 new intelligence fields automatically
-docker-compose up -d
-
-# Watch logs for real-time intelligence
-docker logs -f scaev
-
-# Grep for opportunities
-docker logs scaev | grep "BARGAIN"
-docker logs scaev | grep "Followers: [0-9]\{2\}"
-```
-
-### Optional Migrations:
-```bash
-# Populate bid history for 1,590 existing lots (~13 minutes)
-python fetch_missing_bid_history.py
-
-# Populate new intelligence fields for 16,807 lots (~2.3 hours)
-python enrich_existing_lots.py
-```
-
-**Note:** Future scrapes automatically capture all data, so migrations are optional.
-
----
-
-## Example Enhanced Log Output
-
-### Before:
-```
-[8766/15859]
-[PAGE ford-generator-A1-34731-107]
-  Type: LOT
-  Title: Ford FGT9250E Generator...
-  Fetching bidding data from API...
-  Bid: EUR 500.00
-  Location: Venray, NL
-  Images: 6
-```
-
-### After:
-```
-[8766/15859]
-[PAGE ford-generator-A1-34731-107]
-  Type: LOT
-  Title: Ford FGT9250E Generator...
-  Fetching bidding data from API...
-  Bid: EUR 500.00
-  Status: Geen Minimumprijs
-  Followers: 12 watching ← NEW
-  Estimate: EUR 1200.00 - EUR 1800.00 ← NEW
-  >> BARGAIN: 58% below estimate! ← NEW
-  Condition: Used - Good working order ← NEW
-  Item: 2015 Ford FGT9250E ← NEW
-  Fetching bid history...
-  >> Bid velocity: 2.4 bids/hour ← Enhanced
-  Location: Venray, NL
-  Images: 6
-  Downloaded: 6/6 images
-```
-
-**Intelligence at a glance:**
-- 🔥 58% below estimate = great bargain
-- 👁 12 followers = good interest
-- 📈 2.4 bids/hour = active bidding
-- ✅ Good condition
-- 💰 Potential profit: €700-€1,300
-
----
-
-## Dashboard Upgrade Recommendations
-
-### Priority 1: Opportunity Detection
-1. **Bargain Hunter Dashboard** - Auto-detect <80% estimate
-2. **Sleeper Lot Alerts** - High followers + no bids
-3. **Value Gap Heatmap** - Visual bargain overview
-
-### Priority 2: Intelligence Analytics
-4. **Enhanced Lot Cards** - Show all new fields
-5. **Auction House Accuracy** - Track estimate accuracy
-6. **Interest Conversion** - Followers → Bidders analysis
-
-### Priority 3: Real-Time Alerts
-7. **Bargain Alerts** - <80% estimate, closing soon
-8. **Sleeper Alerts** - 10+ followers, 0 bids
-9. **Overvalued Warnings** - >120% estimate
-
-### Priority 4: Advanced Features
-10. **ML Price Prediction** - Use new fields for AI models
-11. **Category Intelligence** - Deep category analytics
-12. 
**Smart Watchlist** - Personalized opportunity alerts - -**Full plan available in:** `INTELLIGENCE_DASHBOARD_UPGRADE.md` - ---- - -## Next Steps (Optional) - -### For Existing Data: -```bash -# Run migrations to populate new fields for existing 16,807 lots -python enrich_existing_lots.py # ~2.3 hours -python fetch_missing_bid_history.py # ~13 minutes -``` - -### For Dashboard Development: -1. Read `INTELLIGENCE_DASHBOARD_UPGRADE.md` for complete plan -2. Use provided SQL queries for analytics -3. Implement priority 1 features first (bargain detection) - -### For Monitoring: -1. Monitor enhanced logs for real-time intelligence -2. Set up grep alerts for "BARGAIN" and high followers -3. Track scraper progress with new log details - ---- - -## Success Metrics - -### Data Quality: -- ✅ Orphaned lots: 16,807 → 13 (99.9% reduction) -- ✅ Auction completeness: 0% → 100% -- ✅ Database schema: +5 intelligence columns - -### Code Quality: -- ✅ 4 files modified (parse, cache, graphql_client, scraper) -- ✅ 4 migration scripts created -- ✅ 6 documentation files created -- ✅ Enhanced logging implemented - -### Intelligence Value: -- ✅ 5 new fields per lot (80%+ value increase) -- ✅ Real-time bargain detection in logs -- ✅ Automated value gap calculation -- ✅ Popularity tracking enabled -- ✅ Professional valuations captured - -### Documentation: -- ✅ Complete technical documentation -- ✅ Dashboard upgrade plan with SQL queries -- ✅ Enhanced logging examples -- ✅ API intelligence findings -- ✅ Migration guides - ---- - -## Files Ready for Monitoring App Team - -All files are in: `C:\vibe\scaev\` - -**Must Read:** -1. `INTELLIGENCE_DASHBOARD_UPGRADE.md` - Complete dashboard plan -2. `ENHANCED_LOGGING_EXAMPLE.md` - Log output examples -3. `FIXES_COMPLETE.md` - Technical changes - -**Reference:** -4. `_wiki/ARCHITECTURE.md` - System architecture -5. `API_INTELLIGENCE_FINDINGS.md` - API details -6. `VALIDATION_SUMMARY.md` - Data quality analysis - -**Scripts (if needed):** -7. `enrich_existing_lots.py` - Populate new fields -8. `fetch_missing_bid_history.py` - Get bid history -9. `validate_data.py` - Check data quality - ---- - -## Conclusion - -**Successfully completed comprehensive upgrade:** - -- 🔧 **Fixed critical data issues** (orphaned lots, bid history) -- 📊 **Added 5 intelligence fields** (followers, estimates, condition) -- 📝 **Enhanced logging** with real-time opportunity detection -- 📚 **Complete documentation** for monitoring app upgrade -- 🚀 **80%+ intelligence value increase** - -**System is now production-ready with advanced intelligence capabilities!** - -All future scrapes will automatically capture the new intelligence fields, enabling powerful analytics, opportunity detection, and predictive modeling in the monitoring dashboard. - -🎉 **Session Complete!** 🎉 diff --git a/docs/TESTING.md b/docs/TESTING.md deleted file mode 100644 index 73fcbba..0000000 --- a/docs/TESTING.md +++ /dev/null @@ -1,279 +0,0 @@ -# Testing & Migration Guide - -## Overview - -This guide covers: -1. Migrating existing cache to compressed format -2. Running the test suite -3. 
Understanding test results - -## Step 1: Migrate Cache to Compressed Format - -If you have an existing database with uncompressed entries (from before compression was added), run the migration script: - -```bash -python migrate_compress_cache.py -``` - -### What it does: -- Finds all cache entries where data is uncompressed -- Compresses them using zlib (level 9) -- Reports compression statistics and space saved -- Verifies all entries are compressed - -### Expected output: -``` -Cache Compression Migration Tool -============================================================ -Initial database size: 1024.56 MB - -Found 1134 uncompressed cache entries -Starting compression... - Compressed 100/1134 entries... (78.3% reduction so far) - Compressed 200/1134 entries... (79.1% reduction so far) - ... - -============================================================ -MIGRATION COMPLETE -============================================================ -Entries compressed: 1134 -Original size: 1024.56 MB -Compressed size: 198.34 MB -Space saved: 826.22 MB -Compression ratio: 80.6% -============================================================ - -VERIFICATION: - Compressed entries: 1134 - Uncompressed entries: 0 - ✓ All cache entries are compressed! - -Final database size: 1024.56 MB -Database size reduced by: 0.00 MB - -✓ Migration complete! You can now run VACUUM to reclaim disk space: - sqlite3 /mnt/okcomputer/output/cache.db 'VACUUM;' -``` - -### Reclaim disk space: -After migration, the database file still contains the space used by old uncompressed data. To actually reclaim the disk space: - -```bash -sqlite3 /mnt/okcomputer/output/cache.db 'VACUUM;' -``` - -This will rebuild the database file and reduce its size significantly. - -## Step 2: Run Tests - -The test suite validates that auction and lot parsing works correctly using **cached data only** (no live requests to server). - -```bash -python test_scraper.py -``` - -### What it tests: - -**Auction Pages:** -- Type detection (must be 'auction') -- auction_id extraction -- title extraction -- location extraction -- lots_count extraction -- first_lot_closing_time extraction - -**Lot Pages:** -- Type detection (must be 'lot') -- lot_id extraction -- title extraction (must not be '...', 'N/A', or empty) -- location extraction (must not be 'Locatie', 'Location', or empty) -- current_bid extraction (must not be '€Huidig ​​bod' or invalid) -- closing_time extraction -- images array extraction -- bid_count validation -- viewing_time and pickup_date (optional) - -### Expected output: - -``` -====================================================================== -TROOSTWIJK SCRAPER TEST SUITE -====================================================================== - -This test suite uses CACHED data only - no live requests to server -====================================================================== - -====================================================================== -CACHE STATUS CHECK -====================================================================== -Total cache entries: 1134 -Compressed: 1134 (100.0%) -Uncompressed: 0 (0.0%) - -✓ All cache entries are compressed! - -====================================================================== -TEST URL CACHE STATUS: -====================================================================== -✓ https://www.troostwijkauctions.com/a/online-auction-cnc-lat... -✓ https://www.troostwijkauctions.com/a/faillissement-bab-sho... -✓ https://www.troostwijkauctions.com/a/industriele-goederen-... 
-✓ https://www.troostwijkauctions.com/l/%25282x%2529-duo-bure... -✓ https://www.troostwijkauctions.com/l/tos-sui-50-1000-unive... -✓ https://www.troostwijkauctions.com/l/rolcontainer-%25282x%... - -6/6 test URLs are cached - -====================================================================== -TESTING AUCTIONS -====================================================================== - -====================================================================== -Testing Auction: https://www.troostwijkauctions.com/a/online-auction... -====================================================================== -✓ Cache hit (age: 12.3 hours) - ✓ auction_id: A7-39813 - ✓ title: Online Auction: CNC Lathes, Machining Centres & Precision... - ✓ location: Cluj-Napoca, RO - ✓ first_lot_closing_time: 2024-12-05 14:30:00 - ✓ lots_count: 45 - -====================================================================== -TESTING LOTS -====================================================================== - -====================================================================== -Testing Lot: https://www.troostwijkauctions.com/l/%25282x%2529-duo... -====================================================================== -✓ Cache hit (age: 8.7 hours) - ✓ lot_id: A1-28505-5 - ✓ title: (2x) Duo Bureau - 160x168 cm - ✓ location: Dongen, NL - ✓ current_bid: No bids - ✓ closing_time: 2024-12-10 16:00:00 - ✓ images: 2 images - 1. https://media.tbauctions.com/image-media/c3f9825f-e3fd... - 2. https://media.tbauctions.com/image-media/45c85ced-9c63... - ✓ bid_count: 0 - ✓ viewing_time: 2024-12-08 09:00:00 - 2024-12-08 17:00:00 - ✓ pickup_date: 2024-12-11 09:00:00 - 2024-12-11 15:00:00 - -====================================================================== -TEST SUMMARY -====================================================================== - -Total tests: 6 -Passed: 6 ✓ -Failed: 0 ✗ -Success rate: 100.0% - -====================================================================== -``` - -## Test URLs - -The test suite tests these specific URLs (you can modify in `test_scraper.py`): - -**Auctions:** -- https://www.troostwijkauctions.com/a/online-auction-cnc-lathes-machining-centres-precision-measurement-romania-A7-39813 -- https://www.troostwijkauctions.com/a/faillissement-bab-shortlease-i-ii-b-v-%E2%80%93-2024-big-ass-energieopslagsystemen-A1-39557 -- https://www.troostwijkauctions.com/a/industriele-goederen-uit-diverse-bedrijfsbeeindigingen-A1-38675 - -**Lots:** -- https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5 -- https://www.troostwijkauctions.com/l/tos-sui-50-1000-universele-draaibank-A7-39568-9 -- https://www.troostwijkauctions.com/l/rolcontainer-%25282x%2529-A1-40191-101 - -## Adding More Test Cases - -To add more test URLs, edit `test_scraper.py`: - -```python -TEST_AUCTIONS = [ - "https://www.troostwijkauctions.com/a/your-auction-url", - # ... add more -] - -TEST_LOTS = [ - "https://www.troostwijkauctions.com/l/your-lot-url", - # ... add more -] -``` - -Then run the main scraper to cache these URLs: -```bash -python main.py -``` - -Then run tests: -```bash -python test_scraper.py -``` - -## Troubleshooting - -### "NOT IN CACHE" errors -If tests show URLs are not cached, run the main scraper first: -```bash -python main.py -``` - -### "Failed to decompress cache" warnings -This means you have uncompressed legacy data. Run the migration: -```bash -python migrate_compress_cache.py -``` - -### Tests failing with parsing errors -Check the detailed error output in the TEST SUMMARY section. 
It will show: -- Which field failed validation -- The actual value that was extracted -- Why it failed (empty, wrong type, invalid format) - -## Cache Behavior - -The test suite uses cached data with these characteristics: -- **No rate limiting** - reads from DB instantly -- **No server load** - zero HTTP requests -- **Repeatable** - same results every time -- **Fast** - all tests run in < 5 seconds - -This allows you to: -- Test parsing changes without re-scraping -- Run tests repeatedly during development -- Validate changes before deploying -- Ensure data quality without server impact - -## Continuous Integration - -You can integrate these tests into CI/CD: - -```bash -# Run migration if needed -python migrate_compress_cache.py - -# Run tests -python test_scraper.py - -# Exit code: 0 = success, 1 = failure -``` - -## Performance Benchmarks - -Based on typical HTML sizes: - -| Metric | Before Compression | After Compression | Improvement | -|--------|-------------------|-------------------|-------------| -| Avg page size | 800 KB | 150 KB | 81.3% | -| 1000 pages | 800 MB | 150 MB | 650 MB saved | -| 10,000 pages | 8 GB | 1.5 GB | 6.5 GB saved | -| DB read speed | ~50 ms | ~5 ms | 10x faster | - -## Best Practices - -1. **Always run migration after upgrading** to the compressed cache version -2. **Run VACUUM** after migration to reclaim disk space -3. **Run tests after major changes** to parsing logic -4. **Add test cases for edge cases** you encounter in production -5. **Keep test URLs diverse** - different auctions, lot types, languages -6. **Monitor cache hit rates** to ensure effective caching diff --git a/docs/VALIDATION_SUMMARY.md b/docs/VALIDATION_SUMMARY.md deleted file mode 100644 index dfab08c..0000000 --- a/docs/VALIDATION_SUMMARY.md +++ /dev/null @@ -1,308 +0,0 @@ -# Data Validation & API Intelligence Summary - -## Executive Summary - -Completed comprehensive validation of the Troostwijk scraper database and API capabilities. Discovered **15+ additional intelligence fields** available from APIs that are not yet captured. Updated ARCHITECTURE.md with complete documentation of current system and data structures. 
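The additional fields were found via GraphQL schema introspection (the `explore_api_fields.py` script listed under Files Updated). A minimal sketch of such a probe, assuming the endpoint permits standard introspection; the type name `"Lot"` and the import path are illustrative:

```python
# Sketch: list the queryable fields on the Lot type via introspection.
import asyncio
import aiohttp

from src.graphql_client import GRAPHQL_ENDPOINT  # adjust import to your layout

INTROSPECTION_QUERY = """
{
  __type(name: "Lot") {
    fields { name type { name kind } }
  }
}
"""

async def list_lot_fields() -> None:
    async with aiohttp.ClientSession() as session:
        async with session.post(GRAPHQL_ENDPOINT,
                                json={"query": INTROSPECTION_QUERY},
                                timeout=30) as resp:
            data = await resp.json()
    lot_type = (data.get("data") or {}).get("__type") or {}
    for field in lot_type.get("fields") or []:
        print(field["name"])  # e.g. followersCount, estimatedFullPrice, condition

if __name__ == "__main__":
    asyncio.run(list_lot_fields())
```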
- ---- - -## Data Validation Results - -### Database Statistics (as of 2025-12-07) - -#### Overall Counts: -- **Auctions:** 475 -- **Lots:** 16,807 -- **Images:** 217,513 -- **Bid History Records:** 1 - -### Data Completeness Analysis - -#### ✅ EXCELLENT (>90% complete): -- **Lot titles:** 100% (16,807/16,807) -- **Current bid:** 100% (16,807/16,807) -- **Closing time:** 100% (16,807/16,807) -- **Auction titles:** 100% (475/475) - -#### ⚠️ GOOD (50-90% complete): -- **Brand:** 72.1% (12,113/16,807) -- **Manufacturer:** 72.1% (12,113/16,807) -- **Model:** 55.3% (9,298/16,807) - -#### 🔴 NEEDS IMPROVEMENT (<50% complete): -- **Year manufactured:** 31.7% (5,335/16,807) -- **Starting bid:** 18.8% (3,155/16,807) -- **Minimum bid:** 18.8% (3,155/16,807) -- **Condition description:** 6.1% (1,018/16,807) -- **Serial number:** 9.8% (1,645/16,807) -- **Lots with bids:** 9.5% (1,591/16,807) -- **Status:** 0.0% (2/16,807) -- **Auction lots count:** 0.0% (0/475) -- **Auction closing time:** 0.8% (4/475) -- **First lot closing:** 0.0% (0/475) - -#### 🔴 MISSING (0% - fields exist but no data): -- **Condition score:** 0% -- **Damage description:** 0% -- **First bid time:** 0.0% (1/16,807) -- **Last bid time:** 0.0% (1/16,807) -- **Bid velocity:** 0.0% (1/16,807) -- **Bid history:** Only 1 lot has history - -### Data Quality Issues - -#### ❌ CRITICAL: -- **16,807 orphaned lots:** All lots have no matching auction record - - Likely due to auction_id mismatch or missing auction scraping - -#### ⚠️ WARNINGS: -- **1,590 lots have bids but no bid history** - - These lots should have bid_history records but don't - - Suggests bid history fetching is not working for most lots -- **13 lots have no images** - - Minor issue, some lots legitimately have no images - -### Image Download Status -- **Total images:** 217,513 -- **Downloaded:** 16.9% (36,683) -- **Has local path:** 30.6% (66,606) -- **Lots with images:** 18,489 (more than total lots suggests duplicates or multiple sources) - ---- - -## API Intelligence Findings - -### 🎯 Major Discovery: Additional Fields Available - -From GraphQL API schema introspection, discovered **15+ additional fields** that can significantly enhance intelligence: - -### HIGH PRIORITY Fields (Immediate Value): - -1. **`followersCount`** (Int) - **CRITICAL MISSING FIELD** - - This is the "watch count" we thought wasn't available - - Shows how many users are watching/following a lot - - Direct indicator of bidder interest and potential competition - - **Intelligence value:** Predict lot popularity and final price - -2. **`estimatedFullPrice`** (Object) - **CRITICAL MISSING FIELD** - - Contains `min { cents currency }` and `max { cents currency }` - - Auction house's estimated value range - - **Intelligence value:** Compare final price to estimate, identify bargains - -3. **`nextBidStepInCents`** (Long) - - Exact bid increment in cents - - Currently we calculate bid_increment, but API provides exact value - - **Intelligence value:** Show exact next bid amount - -4. **`condition`** (String) - - Direct condition field from API - - Cleaner than extracting from attributes - - **Intelligence value:** Better condition scoring - -5. **`categoryInformation`** (Object) - - Structured category data with `id`, `name`, `path` - - Better than simple category string - - **Intelligence value:** Category-based filtering and analytics - -6. 
**`location`** (LotLocation) - - Structured location with `city`, `countryCode`, `addressLine1`, `addressLine2` - - Currently just storing simple location string - - **Intelligence value:** Proximity filtering, logistics calculations - -### MEDIUM PRIORITY Fields: - -7. **`biddingStatus`** (Enum) - More detailed than `minimumBidAmountMet` -8. **`appearance`** (String) - Visual condition notes -9. **`packaging`** (String) - Packaging details -10. **`quantity`** (Long) - Lot quantity (important for bulk lots) -11. **`vat`** (BigDecimal) - VAT percentage -12. **`buyerPremiumPercentage`** (BigDecimal) - Buyer premium -13. **`remarks`** (String) - May contain viewing/pickup text -14. **`negotiated`** (Boolean) - Bid history: was bid negotiated - -### LOW PRIORITY Fields: - -15. **`videos`** (Array) - Video URLs (if available) -16. **`documents`** (Array) - Document URLs (specs/manuals) - ---- - -## Intelligence Impact Analysis - -### With `followersCount`: -``` -- Predict lot popularity BEFORE bidding wars start -- Calculate interest-to-bid conversion rate -- Identify "sleeper" lots (high followers, low bids) -- Alert on lots gaining sudden interest -``` - -### With `estimatedFullPrice`: -``` -- Compare final price vs estimate (accuracy analysis) -- Identify bargains: final_price < estimated_min -- Identify overvalued: final_price > estimated_max -- Build pricing models per category -``` - -### With exact `nextBidStepInCents`: -``` -- Show users exact next bid amount -- No calculation errors -- Better UX for bidding recommendations -``` - -### With structured `location`: -``` -- Filter by distance from user -- Calculate pickup logistics costs -- Group by region for bulk purchases -``` - -### With `vat` and `buyerPremiumPercentage`: -``` -- Calculate TRUE total cost including fees -- Compare all-in prices across lots -- Budget planning with accurate costs -``` - -**Estimated intelligence value increase:** 80%+ - ---- - -## Current Implementation Status - -### ✅ Working Well: -1. **HTML caching with compression** (70-90% size reduction) -2. **Concurrent image downloads** (16x speedup vs sequential) -3. **GraphQL API integration** for bidding data -4. **Bid history API integration** with pagination -5. **Attribute extraction** (brand, model, manufacturer) -6. **Bid intelligence calculations** (velocity, timing) -7. **Database auto-migration** for schema changes -8. **Unique constraints** preventing image duplicates - -### ⚠️ Needs Attention: -1. **Auction data completeness** (0% lots_count, closing_time, first_lot_closing) -2. **Lot-to-auction relationship** (all 16,807 lots are orphaned) -3. **Bid history fetching** (only 1 lot has history, should be 1,591) -4. **Status field extraction** (99.9% missing) -5. **Condition score calculation** (0% - not working) - -### 🔴 Missing Features (High Value): -1. **followersCount extraction** -2. **estimatedFullPrice extraction** -3. **Structured location extraction** -4. **Category information extraction** -5. **Direct condition field usage** -6. **VAT and buyer premium extraction** - ---- - -## Recommendations - -### Immediate Actions (High ROI): - -1. **Fix orphaned lots issue** - - Investigate auction_id relationship - - Ensure auctions are being scraped - - Fix FK relationship - -2. **Fix bid history fetching** - - Currently only 1/1,591 lots with bids has history - - Debug why REST API calls are failing/skipped - - Ensure lot UUID extraction is working - -3. 
**Add `followersCount` field**
-   - High value, easy to extract
-   - Add column: `followers_count INTEGER`
-   - Extract from the GraphQL response
-   - Update the migration script
-
-4. **Add `estimatedFullPrice` extraction**
-   - Add columns: `estimated_min_price REAL`, `estimated_max_price REAL`
-   - Extract from GraphQL `lotDetails.estimatedFullPrice`
-   - Update the migration script
-
-5. **Use direct `condition` field**
-   - Replace attribute-based condition extraction
-   - Cleaner, more reliable
-   - May fix the 0% condition_score issue
-
-### Short-term Improvements:
-
-6. **Add structured location fields**
-   - Replace the simple `location` string
-   - Add: `location_city`, `location_country`, `location_address`
-
-7. **Add category information**
-   - Extract structured category data from the API
-   - Add: `category_id`, `category_name`, `category_path`
-
-8. **Add cost calculation fields**
-   - Extract: `vat_percentage`, `buyer_premium_percentage`
-   - Calculate: `total_cost_estimate` (see the cost sketch below)
-
-9. **Fix status extraction**
-   - Currently 99.9% missing
-   - Use the `biddingStatus` enum from the API
-
-10. **Fix condition scoring**
-   - Currently a 0% success rate
-   - Use the direct `condition` field from the API
-
-### Long-term Enhancements:
-
-11. **Video and document support**
-12. **Viewing/pickup time parsing from remarks**
-13. **Historical price tracking** (scrape repeatedly)
-14. **Predictive modeling** (using followers, bid velocity, etc.)
-
----
-
-## Files Updated
-
-### Created:
-- `validate_data.py` - Comprehensive data validation script
-- `explore_api_fields.py` - API schema introspection
-- `API_INTELLIGENCE_FINDINGS.md` - Detailed API analysis
-- `VALIDATION_SUMMARY.md` - This document
-
-### Updated:
-- `_wiki/ARCHITECTURE.md` - Complete documentation update:
-  - Updated Phase 3 diagram with API enrichment
-  - Expanded lots table schema with all fields
-  - Added bid_history table documentation
-  - Added API enrichment flow diagrams
-  - Added API Integration Architecture section
-  - Updated image download flow (concurrent)
-  - Updated rate limiting documentation
-
----
-
-## Next Steps
-
-See `API_INTELLIGENCE_FINDINGS.md` for:
-- Detailed implementation plan
-- Updated GraphQL query with all fields
-- Database schema migrations needed
-- Priority ordering of features
-
-**Priority order:**
-1. Fix orphaned lots and bid history issues ← **Critical bugs**
-2. Add followersCount and estimatedFullPrice ← **High value, easy wins**
-3. Add structured location and category ← **Better data quality**
-4. Add VAT/premium for cost calculations ← **User value**
-5. Video/document support ← **Nice to have**
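To make recommendation 8 concrete, here is a hedged sketch of an all-in cost estimate. The fee structure is an assumption (whether VAT applies to the premium varies per auction house), and the function name is illustrative:

```python
# Sketch: all-in cost from hammer price, buyer premium and VAT.
# Assumes VAT is charged on the hammer price plus the premium - verify
# the actual fee rules per auction house before relying on this.
def total_cost_estimate(hammer_price: float, premium_pct: float, vat_pct: float) -> float:
    with_premium = hammer_price * (1 + premium_pct / 100)
    return round(with_premium * (1 + vat_pct / 100), 2)

# EUR 500 hammer, 15% premium, 21% VAT -> EUR 695.75 all-in
print(total_cost_estimate(500.0, 15.0, 21.0))
```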
-
----
-
-## Validation Conclusion
-
-**Database status:** Working, but with data quality issues (orphaned lots, missing bid history)
-
-**Data completeness:** Good for core fields (title, bid, closing time); needs improvement for enrichment fields
-
-**API capabilities:** Far more powerful than currently utilized - 15+ valuable fields available
-
-**Immediate action:** Fix the data relationship bugs, then harvest the additional API fields for an 80%+ intelligence boost
diff --git a/src/graphql_client.py b/src/graphql_client.py
index 654d4bd..9201e5c 100644
--- a/src/graphql_client.py
+++ b/src/graphql_client.py
@@ -124,6 +124,7 @@ async def fetch_lot_bidding_data(lot_display_id: str) -> Optional[Dict]:
         return None
 
     import aiohttp
+    import asyncio
 
     variables = {
         "lotDisplayId": lot_display_id,
@@ -136,22 +137,57 @@
         "variables": variables
     }
 
-    try:
-        async with aiohttp.ClientSession() as session:
-            async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
-                if response.status == 200:
-                    data = await response.json()
-                    lot_details = data.get('data', {}).get('lotDetails', {})
-
-                    if lot_details and lot_details.get('lot'):
-                        return lot_details
-                    return None
-                else:
-                    print(f"  GraphQL API error: {response.status}")
-                    return None
-    except Exception as e:
-        print(f"  GraphQL request failed: {e}")
-        return None
+    # Some endpoints reject requests without browser-like headers
+    headers = {
+        "User-Agent": (
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+            "(KHTML, like Gecko) Chrome/120.0 Safari/537.36"
+        ),
+        "Accept": "application/json, text/plain, */*",
+        "Content-Type": "application/json",
+        # Pretend the query originates from the public website
+        "Origin": "https://www.troostwijkauctions.com",
+        "Referer": f"https://www.troostwijkauctions.com/l/{lot_display_id}",
+    }
+
+    # Light retry for transient 403/429; each entry is the delay before that attempt
+    backoffs = [0, 0.6]
+    last_err_snippet = ""
+    for attempt, backoff in enumerate(backoffs, start=1):
+        if backoff:
+            await asyncio.sleep(backoff)
+        try:
+            async with aiohttp.ClientSession(headers=headers) as session:
+                async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
+                    if response.status == 200:
+                        data = await response.json()
+                        lot_details = data.get('data', {}).get('lotDetails', {})
+                        if lot_details and lot_details.get('lot'):
+                            return lot_details
+                        # No lot details found
+                        return None
+                    else:
+                        # Try to get a short error body for diagnostics
+                        try:
+                            txt = await response.text()
+                            last_err_snippet = (txt or "")[:200].replace("\n", " ")
+                        except Exception:
+                            last_err_snippet = ""
+                        print(
+                            f"  GraphQL API error: {response.status} (lot={lot_display_id}) "
+                            f"{('— ' + last_err_snippet) if last_err_snippet else ''}"
+                        )
+                        # Only retry once for transient 403/429
+                        if response.status in (403, 429) and attempt < len(backoffs):
+                            continue
+                        return None
+        except Exception as e:
+            print(f"  GraphQL request failed (lot={lot_display_id}): {e}")
+            if attempt < len(backoffs):
+                continue
+            return None
+
+    return None
 
 
 def format_bid_data(lot_details: Dict) -> Dict:
diff --git a/src/scraper.py b/src/scraper.py
index 4a256ab..da61af8 100644
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -9,6 +9,7 @@ import time
 import random
 import json
 import re
+from datetime import datetime
 from pathlib import Path
 from typing import Dict, List, Optional, Set, Tuple
 from urllib.parse import urljoin
@@ -589,13 +590,41 @@ class TroostwijkScraper:
         if
images_to_download: import aiohttp async with aiohttp.ClientSession() as session: - download_tasks = [ - self._download_image(session, img_url, page_data['lot_id'], i) + total = len(images_to_download) + + async def dl(i, img_url): + path = await self._download_image(session, img_url, page_data['lot_id'], i) + return i, img_url, path + + tasks = [ + asyncio.create_task(dl(i, img_url)) for i, img_url in images_to_download ] - results = await asyncio.gather(*download_tasks, return_exceptions=True) - downloaded_count = sum(1 for r in results if r and not isinstance(r, Exception)) - print(f" Downloaded: {downloaded_count}/{len(images_to_download)} new images") + + completed = 0 + succeeded: List[int] = [] + # In-place progress + print(f" Downloading images: 0/{total}", end="\r", flush=True) + for coro in asyncio.as_completed(tasks): + try: + i, img_url, path = await coro + if path: + succeeded.append(i) + except Exception: + pass + finally: + completed += 1 + print(f" Downloading images: {completed}/{total}", end="\r", flush=True) + + # Ensure next prints start on a new line + print() + print(f" Downloaded: {len(succeeded)}/{total} new images") + if succeeded: + succeeded.sort() + # Show which indexes were downloaded + idx_preview = ", ".join(str(x) for x in succeeded[:20]) + more = "" if len(succeeded) <= 20 else f" (+{len(succeeded)-20} more)" + print(f" Indexes: {idx_preview}{more}") else: print(f" All {len(images)} images already cached") diff --git a/test/test_graphql_403.py b/test/test_graphql_403.py new file mode 100644 index 0000000..55790c2 --- /dev/null +++ b/test/test_graphql_403.py @@ -0,0 +1,85 @@ +import asyncio +import types +import sys +from pathlib import Path +import pytest + + +@pytest.mark.asyncio +async def test_fetch_lot_bidding_data_403(monkeypatch): + """ + Simulate a 403 from the GraphQL endpoint and verify: + - Function returns None (graceful handling) + - It attempts a retry and logs a clear 403 message + """ + # Load modules directly from src using importlib to avoid path issues + project_root = Path(__file__).resolve().parents[1] + src_path = project_root / 'src' + import importlib.util + + def _load_module(name, file_path): + spec = importlib.util.spec_from_file_location(name, str(file_path)) + module = importlib.util.module_from_spec(spec) + sys.modules[name] = module + spec.loader.exec_module(module) # type: ignore + return module + + # Load config first because graphql_client imports it by module name + config = _load_module('config', src_path / 'config.py') + graphql_client = _load_module('graphql_client', src_path / 'graphql_client.py') + monkeypatch.setattr(config, "OFFLINE", False, raising=False) + + log_messages = [] + + def fake_print(*args, **kwargs): + msg = " ".join(str(a) for a in args) + log_messages.append(msg) + + import builtins + monkeypatch.setattr(builtins, "print", fake_print) + + class MockResponse: + def __init__(self, status=403, text_body="Forbidden"): + self.status = status + self._text_body = text_body + + async def json(self): + return {} + + async def text(self): + return self._text_body + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return False + + class MockSession: + def __init__(self, *args, **kwargs): + pass + + def post(self, *args, **kwargs): + # Always return 403 + return MockResponse(403, "Forbidden by WAF") + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return False + + # Patch aiohttp.ClientSession to our mock + import 
types as _types
+    dummy_aiohttp = _types.SimpleNamespace()
+    dummy_aiohttp.ClientSession = MockSession
+    # Ensure that an `import aiohttp` inside the function resolves to our dummy
+    monkeypatch.setitem(sys.modules, 'aiohttp', dummy_aiohttp)
+
+    result = await graphql_client.fetch_lot_bidding_data("A1-40179-35")
+
+    # Should gracefully return None
+    assert result is None
+
+    # Should have logged the 403 once per attempt (initial try + one retry)
+    assert sum(1 for m in log_messages if "GraphQL API error: 403" in m) >= 2
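For completeness: the test is async, so running it requires the `pytest-asyncio` plugin (an assumption about the dev environment; the dependency is not pinned anywhere in this diff):

```bash
pip install pytest pytest-asyncio
pytest test/test_graphql_403.py -v
```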