SCRAPER_REFACTOR_GUIDE.md — new file (399 lines)
@@ -0,0 +1,399 @@
# Scraper Refactor Guide - Image Download Integration

## 🎯 Objective

Refactor the Troostwijk scraper to **download and store images locally**, eliminating the 57M+ duplicate image problem in the monitoring process.

## 📋 Current vs. New Architecture

### **Before** (Current Architecture)
```
┌──────────────┐         ┌──────────────┐         ┌──────────────┐
│   Scraper    │────────▶│   Database   │◀────────│   Monitor    │
│              │         │              │         │              │
│ Stores URLs  │         │ images table │         │ Downloads +  │
│ downloaded=0 │         │              │         │  Detection   │
└──────────────┘         └──────────────┘         └──────────────┘
                                                         │
                                                         ▼
                                                 57M+ duplicates!
```

### **After** (New Architecture)
```
┌──────────────┐         ┌──────────────┐         ┌──────────────┐
│   Scraper    │────────▶│   Database   │◀────────│   Monitor    │
│              │         │              │         │              │
│ Downloads +  │         │ images table │         │  Detection   │
│ Stores path  │         │ local_path ✓ │         │    Only      │
│ downloaded=1 │         │              │         │              │
└──────────────┘         └──────────────┘         └──────────────┘
                                                         │
                                                         ▼
                                                  No duplicates!
```

## 🗄️ Database Schema Changes

### Current Schema (ARCHITECTURE-TROOSTWIJK-SCRAPER.md:113-122)
```sql
CREATE TABLE images (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    lot_id TEXT,
    url TEXT,
    local_path TEXT,       -- Currently NULL
    downloaded INTEGER     -- Currently 0
    -- Missing: processed_at, labels (added by monitor)
);
```

### Required Schema (Already Compatible!)
```sql
CREATE TABLE images (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    lot_id TEXT,
    url TEXT,
    local_path TEXT,        -- ✅ SET by scraper after download
    downloaded INTEGER,     -- ✅ SET to 1 by scraper after download
    labels TEXT,            -- ⚠️ SET by monitor (object detection)
    processed_at INTEGER,   -- ⚠️ SET by monitor (timestamp)
    FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
);
```

**Good News**: The scraper's schema already has `local_path` and `downloaded` columns! You just need to populate them.
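
You can confirm this on your own database before touching any code; SQLite's `PRAGMA table_info` lists the columns, and `local_path` and `downloaded` should both appear:

```sql
-- Inspect the images table; expect local_path and downloaded among the columns
PRAGMA table_info(images);
```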

## 🔧 Implementation Steps

### **Step 1: Enable Image Downloading in Configuration**

**File**: Your scraper's config file (e.g., `config.py` or environment variables)

```python
# Current setting
DOWNLOAD_IMAGES = False  # ❌ Change this!

# New setting
DOWNLOAD_IMAGES = True  # ✅ Enable downloads

# Image storage path
IMAGES_DIR = "/mnt/okcomputer/output/images"  # Or your preferred path
```

### **Step 2: Update Image Download Logic**

Based on ARCHITECTURE-TROOSTWIJK-SCRAPER.md:211-228, you already have the structure. Here's what needs to change:

**Current Code** (Conceptual):
```python
# Phase 3: Scrape lot details
def scrape_lot(lot_url):
    lot_data = parse_lot_page(lot_url)

    # Save lot to database
    db.insert_lot(lot_data)

    # Save image URLs to database (NOT DOWNLOADED)
    for img_url in lot_data['images']:
        db.execute("""
            INSERT INTO images (lot_id, url, downloaded)
            VALUES (?, ?, 0)
        """, (lot_data['lot_id'], img_url))
```

**New Code** (Required):
```python
import os
import time
from pathlib import Path
from urllib.parse import urlparse

import requests


def scrape_lot(lot_url):
    lot_data = parse_lot_page(lot_url)

    # Save lot to database
    db.insert_lot(lot_data)

    # Download and save images
    for idx, img_url in enumerate(lot_data['images'], start=1):
        try:
            # Download image
            local_path = download_image(img_url, lot_data['lot_id'], idx)

            # Insert with local_path and downloaded=1
            # (ON CONFLICT requires the unique index from Step 3)
            db.execute("""
                INSERT INTO images (lot_id, url, local_path, downloaded)
                VALUES (?, ?, ?, 1)
                ON CONFLICT(lot_id, url) DO UPDATE SET
                    local_path = excluded.local_path,
                    downloaded = 1
            """, (lot_data['lot_id'], img_url, local_path))

            # Rate limiting (0.5s between downloads)
            time.sleep(0.5)

        except Exception as e:
            print(f"Failed to download {img_url}: {e}")
            # Still insert a record, but mark it as not downloaded;
            # DO NOTHING keeps an existing row intact on re-scrape
            db.execute("""
                INSERT INTO images (lot_id, url, downloaded)
                VALUES (?, ?, 0)
                ON CONFLICT(lot_id, url) DO NOTHING
            """, (lot_data['lot_id'], img_url))


def download_image(image_url, lot_id, index):
    """
    Downloads an image and saves it to an organized directory structure.

    Args:
        image_url: Remote URL of the image
        lot_id: Lot identifier (e.g., "A1-28505-5")
        index: Image sequence number (1, 2, 3, ...)

    Returns:
        Absolute path to the saved file
    """
    # Create directory structure: /images/{lot_id}/
    images_dir = Path(os.getenv('IMAGES_DIR', '/mnt/okcomputer/output/images'))
    lot_dir = images_dir / lot_id
    lot_dir.mkdir(parents=True, exist_ok=True)

    # Determine file extension from the URL path (ignoring query parameters)
    ext = Path(urlparse(image_url).path).suffix or '.jpg'
    filename = f"{index:03d}{ext}"  # 001.jpg, 002.jpg, etc.
    local_path = lot_dir / filename

    # Download with timeout
    response = requests.get(image_url, timeout=10)
    response.raise_for_status()

    # Save to disk
    with open(local_path, 'wb') as f:
        f.write(response.content)

    return str(local_path.absolute())
```

### **Step 3: Add Unique Constraint to Prevent Duplicates**

**Migration SQL**:
```sql
-- Add unique constraint to prevent duplicate image records
CREATE UNIQUE INDEX IF NOT EXISTS idx_images_unique
ON images(lot_id, url);
```

Add this to your scraper's schema initialization:

```python
import sqlite3


def init_database():
    conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
    cursor = conn.cursor()

    # Existing table creation...
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS images (...)
    """)

    # Add unique constraint (NEW)
    cursor.execute("""
        CREATE UNIQUE INDEX IF NOT EXISTS idx_images_unique
        ON images(lot_id, url)
    """)

    conn.commit()
    conn.close()
```
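
One caveat: if the table already contains duplicate `(lot_id, url)` rows — the very problem this guide fixes — `CREATE UNIQUE INDEX` will fail with a constraint error. Run a one-time cleanup first (the same dedupe statement used in Troubleshooting below):

```sql
-- One-time cleanup: keep only the newest record per (lot_id, url),
-- so the unique index can be created afterwards
DELETE FROM images
WHERE id NOT IN (
    SELECT MAX(id)
    FROM images
    GROUP BY lot_id, url
);
```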

### **Step 4: Handle Image Download Failures Gracefully**

```python
def download_with_retry(image_url, lot_id, index, max_retries=3):
    """Downloads an image with retry logic."""
    for attempt in range(max_retries):
        try:
            return download_image(image_url, lot_id, index)
        except requests.exceptions.RequestException:
            if attempt == max_retries - 1:
                print(f"Failed after {max_retries} attempts: {image_url}")
                return None  # Return None on failure
            print(f"Retry {attempt + 1}/{max_retries} for {image_url}")
            time.sleep(2 ** attempt)  # Exponential backoff
```
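
To wire this in, the `download_image` call in `scrape_lot` (Step 2) can be swapped for the retry wrapper; a sketch of the changed loop body, using the same upsert as Step 2:

```python
# Inside the image loop of scrape_lot (Step 2)
local_path = download_with_retry(img_url, lot_data['lot_id'], idx)

if local_path is not None:
    db.execute("""
        INSERT INTO images (lot_id, url, local_path, downloaded)
        VALUES (?, ?, ?, 1)
        ON CONFLICT(lot_id, url) DO UPDATE SET
            local_path = excluded.local_path,
            downloaded = 1
    """, (lot_data['lot_id'], img_url, local_path))
else:
    # Record the failure so the row can be retried on a later scrape
    db.execute("""
        INSERT INTO images (lot_id, url, downloaded)
        VALUES (?, ?, 0)
        ON CONFLICT(lot_id, url) DO NOTHING
    """, (lot_data['lot_id'], img_url))
```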

### **Step 5: Update Database Queries**

Make sure your INSERT uses `INSERT ... ON CONFLICT` to handle re-scraping:

```python
# Good: Handles re-scraping without duplicates
db.execute("""
    INSERT INTO images (lot_id, url, local_path, downloaded)
    VALUES (?, ?, ?, 1)
    ON CONFLICT(lot_id, url) DO UPDATE SET
        local_path = excluded.local_path,
        downloaded = 1
""", (lot_id, img_url, local_path))

# Bad: Raises a constraint error on re-scrape
# (or, without the Step 3 index, silently creates duplicates)
db.execute("""
    INSERT INTO images (lot_id, url, local_path, downloaded)
    VALUES (?, ?, ?, 1)
""", (lot_id, img_url, local_path))
```

## 📊 Expected Outcomes

### Before Refactor
```sql
SELECT COUNT(*) FROM images WHERE downloaded = 0;
-- Result: 57,376,293 (57M+ undownloaded!)

SELECT COUNT(*) FROM images WHERE local_path IS NOT NULL;
-- Result: 0 (no files downloaded)
```

### After Refactor
```sql
SELECT COUNT(*) FROM images WHERE downloaded = 1;
-- Result: ~16,807 (one per actual lot image)

SELECT COUNT(*) FROM images WHERE local_path IS NOT NULL;
-- Result: ~16,807 (all downloaded images have paths)

SELECT COUNT(*) FROM (SELECT DISTINCT lot_id, url FROM images);
-- Result: ~16,807 (no duplicates!)
```

## 🚀 Deployment Checklist

### Pre-Deployment
- [ ] Back up current database: `cp cache.db cache.db.backup`
- [ ] Verify disk space: at least 10GB free for images
- [ ] Test download function on 5 sample lots (see the sketch after this list)
- [ ] Verify `IMAGES_DIR` path exists and is writable
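
A minimal smoke-test sketch for that third item, assuming the `download_image` function from Step 2; `sample_lots` stands in for however your scraper fetches a few parsed lots:

```python
def smoke_test_downloads(sample_lots):
    """Download images for up to 5 lots and report per-URL results."""
    ok, failed = 0, 0
    for lot in sample_lots[:5]:
        for idx, img_url in enumerate(lot['images'], start=1):
            try:
                path = download_image(img_url, lot['lot_id'], idx)
                print(f"OK  {img_url} -> {path}")
                ok += 1
            except Exception as e:
                print(f"ERR {img_url}: {e}")
                failed += 1
    print(f"{ok} downloaded, {failed} failed")
```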

### Deployment
- [ ] Update configuration: `DOWNLOAD_IMAGES = True`
- [ ] Run schema migration to add unique index
- [ ] Deploy updated scraper code
- [ ] Monitor first 100 lots for errors

### Post-Deployment Verification
```sql
-- Check download success rate
SELECT
    COUNT(*) as total_images,
    SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded,
    SUM(CASE WHEN downloaded = 0 THEN 1 ELSE 0 END) as failed,
    ROUND(100.0 * SUM(downloaded) / COUNT(*), 2) as success_rate
FROM images;

-- Check for duplicates (should return 0 rows)
SELECT lot_id, url, COUNT(*) as dup_count
FROM images
GROUP BY lot_id, url
HAVING COUNT(*) > 1;

-- Verify downloaded rows have paths recorded
SELECT COUNT(*) FROM images
WHERE downloaded = 1
  AND local_path IS NOT NULL
  AND local_path != '';
```
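
The SQL above only confirms that paths were *recorded*; to verify the files actually exist on disk, a small cross-check sketch (standard library only, assuming the default `cache.db` location):

```python
import sqlite3
from pathlib import Path

# Cross-check: every downloaded=1 row should point at an existing file
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
rows = conn.execute(
    "SELECT id, local_path FROM images WHERE downloaded = 1"
).fetchall()
missing = [(img_id, p) for img_id, p in rows if not p or not Path(p).is_file()]
print(f"{len(rows)} downloaded records, {len(missing)} missing files")
conn.close()
```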

## 🔍 Monitoring Process Impact

The monitoring process (auctiora) will automatically:
- ✅ Stop downloading images (network I/O eliminated)
- ✅ Only run object detection on `local_path` files
- ✅ Query: `WHERE local_path IS NOT NULL AND (labels IS NULL OR labels = '')`
- ✅ Update only the `labels` and `processed_at` columns

**No changes needed in the monitoring process!** It has already been updated to work with scraper-downloaded images.

## 🐛 Troubleshooting

### Problem: "No space left on device"
```bash
# Check disk usage
df -h /mnt/okcomputer/output/images

# Estimate needed space: ~100KB per image
# 16,807 images × 100KB ≈ 1.6GB
```

### Problem: "Permission denied" when writing images
```bash
# Fix permissions
chmod 755 /mnt/okcomputer/output/images
chown -R scraper_user:scraper_group /mnt/okcomputer/output/images
```

### Problem: Images downloading but not recorded in DB
```python
# Add logging
import logging
logging.basicConfig(level=logging.INFO)

def download_image(...):
    logging.info(f"Downloading {image_url} to {local_path}")
    # ... download code ...
    logging.info(f"Saved to {local_path}, size: {os.path.getsize(local_path)} bytes")
    return local_path
```

### Problem: Duplicate images after refactor
```sql
-- Find duplicates
SELECT lot_id, url, COUNT(*)
FROM images
GROUP BY lot_id, url
HAVING COUNT(*) > 1;

-- Clean up duplicates (keep newest)
DELETE FROM images
WHERE id NOT IN (
    SELECT MAX(id)
    FROM images
    GROUP BY lot_id, url
);
```

## 📈 Performance Comparison

| Metric | Before (Monitor Downloads) | After (Scraper Downloads) |
|--------|----------------------------|---------------------------|
| **Image records** | 57,376,293 | ~16,807 |
| **Duplicates** | 57,359,486 (99.97%!) | 0 |
| **Network I/O** | Monitor process | Scraper process |
| **Disk usage** | 0 (URLs only) | ~1.6GB (actual files) |
| **Processing speed** | 500ms/image (download + detect) | 100ms/image (detect only) |
| **Error handling** | Complex (download failures) | Simple (files exist) |

## 🎓 Code Examples by Language

### Python (Most Likely)
See **Step 2** above for the complete implementation.

## 📚 References

- **Current Scraper Architecture**: `wiki/ARCHITECTURE-TROOSTWIJK-SCRAPER.md`
- **Database Schema**: `wiki/DATABASE_ARCHITECTURE.md`
- **Monitor Changes**: See commit history for `ImageProcessingService.java`, `DatabaseService.java`

## ✅ Success Criteria

You'll know the refactor is successful when:

1. ✅ Database query `SELECT COUNT(*) FROM images` returns ~16,807 (not 57M+)
2. ✅ All images have `downloaded = 1` and `local_path IS NOT NULL`
3. ✅ No duplicate records: `SELECT lot_id, url, COUNT(*) ... HAVING COUNT(*) > 1` returns 0 rows
4. ✅ Monitor logs show "Found N images needing detection" with reasonable numbers
5. ✅ Files exist at the paths in the `local_path` column
6. ✅ Monitor process speed increases (100ms vs 500ms per image)

---

**Questions?** Check the troubleshooting section or inspect the monitor's updated code in:
- `src/main/java/auctiora/ImageProcessingService.java`
- `src/main/java/auctiora/DatabaseService.java:695-719`

@@ -28,7 +28,7 @@ services:
       - AUCTION_WORKFLOW_CLOSING_ALERTS_CRON=0 */5 * * * ?
 
     volumes:
-      # Mount database and images directory
+      # Mount database and images directory1
       - shared-auction-data:/mnt/okcomputer/output
 
     labels:

@@ -68,10 +68,9 @@ public class AuctionMonitorProducer {
     @Singleton
     public ImageProcessingService produceImageProcessingService(
             DatabaseService db,
-            ObjectDetectionService detector,
-            RateLimitedHttpClient httpClient) {
+            ObjectDetectionService detector) {
 
         LOG.infof("Initializing ImageProcessingService");
-        return new ImageProcessingService(db, detector, httpClient);
+        return new ImageProcessingService(db, detector);
     }
 }

@@ -73,7 +73,8 @@ public class DatabaseService {
                 FOREIGN KEY (sale_id) REFERENCES auctions(auction_id)
             )""");
 
-        // Images table (populated by this process)
+        // Images table (populated by external scraper with URLs and local_path)
+        // This process only adds labels via object detection
         stmt.execute("""
             CREATE TABLE IF NOT EXISTS images (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -82,6 +83,7 @@ public class DatabaseService {
                 local_path TEXT,
                 labels TEXT,
                 processed_at INTEGER,
+                downloaded INTEGER DEFAULT 0,
                 FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
             )""");

@@ -258,6 +260,7 @@ public class DatabaseService {
             var hasLabels = false;
             var hasLocalPath = false;
             var hasProcessedAt = false;
+            var hasDownloaded = false;
 
             while (rs.next()) {
                 var colName = rs.getString("name");
@@ -265,6 +268,7 @@ public class DatabaseService {
                     case "labels" -> hasLabels = true;
                     case "local_path" -> hasLocalPath = true;
                     case "processed_at" -> hasProcessedAt = true;
+                    case "downloaded" -> hasDownloaded = true;
                 }
             }

@@ -280,6 +284,10 @@ public class DatabaseService {
                 log.info("Migrating schema: Adding 'processed_at' column to images table");
                 stmt.execute("ALTER TABLE images ADD COLUMN processed_at INTEGER");
             }
+            if (!hasDownloaded) {
+                log.info("Migrating schema: Adding 'downloaded' column to images table");
+                stmt.execute("ALTER TABLE images ADD COLUMN downloaded INTEGER DEFAULT 0");
+            }
         } catch (SQLException e) {
             // Table might not exist yet, which is fine
             log.debug("Could not check images table schema: " + e.getMessage());

@@ -462,19 +470,35 @@ public class DatabaseService {
     }
 
     /**
-     * Inserts a new image record with object detection labels
+     * Updates the labels field for an image after object detection
      */
-    synchronized void insertImage(long lotId, String url, String filePath, List<String> labels) throws SQLException {
-        var sql = "INSERT INTO images (lot_id, url, local_path, labels, processed_at) VALUES (?, ?, ?, ?, ?)";
+    synchronized void updateImageLabels(int imageId, List<String> labels) throws SQLException {
+        var sql = "UPDATE images SET labels = ?, processed_at = ? WHERE id = ?";
         try (var conn = DriverManager.getConnection(this.url); var ps = conn.prepareStatement(sql)) {
-            ps.setLong(1, lotId);
-            ps.setString(2, url);
-            ps.setString(3, filePath);
-            ps.setString(4, String.join(",", labels));
-            ps.setLong(5, Instant.now().getEpochSecond());
+            ps.setString(1, String.join(",", labels));
+            ps.setLong(2, Instant.now().getEpochSecond());
+            ps.setInt(3, imageId);
             ps.executeUpdate();
         }
     }
 
+    /**
+     * Gets the labels for a specific image
+     */
+    synchronized List<String> getImageLabels(int imageId) throws SQLException {
+        var sql = "SELECT labels FROM images WHERE id = ?";
+        try (var conn = DriverManager.getConnection(this.url); var ps = conn.prepareStatement(sql)) {
+            ps.setInt(1, imageId);
+            var rs = ps.executeQuery();
+            if (rs.next()) {
+                var labelsStr = rs.getString("labels");
+                if (labelsStr != null && !labelsStr.isEmpty()) {
+                    return List.of(labelsStr.split(","));
+                }
+            }
+        }
+        return List.of();
+    }
+
     /**
      * Retrieves images for a specific lot

@@ -671,46 +695,34 @@ public class DatabaseService {
     }
 
     /**
-     * Imports image URLs from scraper's schema.
-     * The scraper populates the images table with URLs but doesn't download them.
-     * This method retrieves undownloaded images for processing.
+     * Gets images that have been downloaded by the scraper but need object detection.
+     * Only returns images that have local_path set but no labels yet.
      *
-     * @return List of image URLs that need to be downloaded
+     * @return List of images needing object detection
      */
-    synchronized List<ImageImportRecord> getUnprocessedImagesFromScraper() throws SQLException {
-        List<ImageImportRecord> images = new ArrayList<>();
+    synchronized List<ImageDetectionRecord> getImagesNeedingDetection() throws SQLException {
+        List<ImageDetectionRecord> images = new ArrayList<>();
         var sql = """
-                SELECT i.lot_id, i.url, l.auction_id
+                SELECT i.id, i.lot_id, i.local_path
                 FROM images i
-                LEFT JOIN lots l ON i.lot_id = l.lot_id
-                WHERE i.downloaded = 0 OR i.local_path IS NULL
+                WHERE i.local_path IS NOT NULL
+                  AND i.local_path != ''
+                  AND (i.labels IS NULL OR i.labels = '')
                 """;
 
         try (var conn = DriverManager.getConnection(url); var stmt = conn.createStatement()) {
             var rs = stmt.executeQuery(sql);
             while (rs.next()) {
-                var lotIdStr = rs.getString("lot_id");
-                var auctionIdStr = rs.getString("auction_id");
-
-                var lotId = ScraperDataAdapter.extractNumericId(lotIdStr);
-                var saleId = ScraperDataAdapter.extractNumericId(auctionIdStr);
-
-                // Skip images with invalid IDs (0 indicates parsing failed)
-                if (lotId == 0L || saleId == 0L) {
-                    log.debug("Skipping image with invalid ID: lot_id={}, sale_id={}", lotId, saleId);
-                    continue;
-                }
-
-                images.add(new ImageImportRecord(
-                    lotId,
-                    saleId,
-                    rs.getString("url")
+                images.add(new ImageDetectionRecord(
+                    rs.getInt("id"),
+                    rs.getLong("lot_id"),
+                    rs.getString("local_path")
                 ));
             }
         } catch (SQLException e) {
-            log.info("ℹ️ No unprocessed images found in scraper format");
+            log.info("ℹ️ No images needing detection found");
         }
 
         return images;
     }

@@ -718,9 +730,9 @@ public class DatabaseService {
      * Simple record for image data from database
      */
     record ImageRecord(int id, long lotId, String url, String filePath, String labels) { }
 
     /**
-     * Record for importing images from scraper format
+     * Record for images that need object detection processing
      */
-    record ImageImportRecord(long lotId, long saleId, String url) { }
+    record ImageDetectionRecord(int id, long lotId, String filePath) { }
 }

@@ -12,120 +12,78 @@ import java.util.List;
 
 /**
  * Service responsible for processing images from the IMAGES table.
- * Downloads images, performs object detection, and updates the database.
+ * Performs object detection on already-downloaded images and updates the database.
  *
- * This separates image processing concerns from scraping, allowing this project
- * to focus on enriching data scraped by the external process.
+ * NOTE: Image downloading is handled by the external scraper process.
+ * This service only performs object detection on images that already have local_path set.
  */
 @Slf4j
 class ImageProcessingService {
 
-    private final RateLimitedHttpClient httpClient;
     private final DatabaseService db;
     private final ObjectDetectionService detector;
 
-    ImageProcessingService(DatabaseService db, ObjectDetectionService detector, RateLimitedHttpClient httpClient) {
-        this.httpClient = httpClient;
+    ImageProcessingService(DatabaseService db, ObjectDetectionService detector) {
         this.db = db;
         this.detector = detector;
     }
 
     /**
-     * Downloads an image from the given URL to local storage.
-     * Images are organized by saleId/lotId for easy management.
-     *
-     * @param imageUrl remote image URL
-     * @param saleId   sale identifier
-     * @param lotId    lot identifier
-     * @return absolute path to saved file or null on failure
-     */
-    String downloadImage(String imageUrl, long saleId, long lotId) {
-        try {
-            var response = httpClient.sendGetBytes(imageUrl);
-
-            if (response != null && response.statusCode() == 200) {
-                // Use environment variable for cross-platform compatibility
-                var imagesPath = System.getenv().getOrDefault("AUCTION_IMAGES_PATH", "/mnt/okcomputer/output/images");
-                var baseDir = Paths.get(imagesPath);
-                var dir = baseDir.resolve(String.valueOf(saleId)).resolve(String.valueOf(lotId));
-                Files.createDirectories(dir);
-
-                // Extract filename from URL
-                var fileName = imageUrl.substring(imageUrl.lastIndexOf('/') + 1);
-                // Remove query parameters if present
-                int queryIndex = fileName.indexOf('?');
-                if (queryIndex > 0) {
-                    fileName = fileName.substring(0, queryIndex);
-                }
-                var dest = dir.resolve(fileName);
-
-                Files.write(dest, response.body());
-                return dest.toAbsolutePath().toString();
-            }
-        } catch (IOException | InterruptedException e) {
-            System.err.println("Failed to download image " + imageUrl + ": " + e.getMessage());
-            if (e instanceof InterruptedException) {
-                Thread.currentThread().interrupt();
-            }
-        } catch (Exception e) {
-            throw new RuntimeException(e);
-        }
-        return null;
-    }
-
-    /**
-     * Processes images for a specific lot: downloads and runs object detection.
+     * Processes a single image: runs object detection and updates labels in database.
      *
-     * @param lotId     lot identifier
-     * @param saleId    sale identifier
-     * @param imageUrls list of image URLs to process
+     * @param imageId   database ID of the image record
+     * @param localPath local file path to the downloaded image
+     * @param lotId     lot identifier (for logging)
+     * @return true if processing succeeded
      */
-    void processImagesForLot(long lotId, long saleId, List<String> imageUrls) {
-        log.info("  Processing {} images for lot {}", imageUrls.size(), lotId);
-
-        for (var imgUrl : imageUrls) {
-            var fileName = downloadImage(imgUrl, saleId, lotId);
-
-            if (fileName != null) {
-                // Run object detection
-                var labels = detector.detectObjects(fileName);
-
-                // Save to database
-                try {
-                    db.insertImage(lotId, imgUrl, fileName, labels);
-
-                    if (!labels.isEmpty()) {
-                        log.info("    Detected: {}", String.join(", ", labels));
-                    }
-                } catch (SQLException e) {
-                    System.err.println("    Failed to save image to database: " + e.getMessage());
-                }
+    boolean processImage(int imageId, String localPath, long lotId) {
+        try {
+            // Run object detection on the local file
+            var labels = detector.detectObjects(localPath);
+
+            // Update the database with detected labels
+            db.updateImageLabels(imageId, labels);
+
+            if (!labels.isEmpty()) {
+                log.info("    Lot {}: Detected {}", lotId, String.join(", ", labels));
             }
+
+            return true;
+        } catch (Exception e) {
+            log.error("    Failed to process image {}: {}", imageId, e.getMessage());
+            return false;
         }
     }
 
     /**
      * Batch processes all pending images in the database.
-     * Useful for processing images after the external scraper has populated lot data.
+     * Only processes images that have been downloaded by the scraper but haven't had object detection run yet.
      */
     void processPendingImages() {
         log.info("Processing pending images...");
 
         try {
-            var lots = db.getAllLots();
-            log.info("Found {} lots to check for images", lots.size());
-
-            for (var lot : lots) {
-                // Check if images already processed for this lot
-                var existingImages = db.getImagesForLot(lot.lotId());
-
-                if (existingImages.isEmpty()) {
-                    log.info("  Lot {} has no images yet - needs external scraper data", lot.lotId());
+            var pendingImages = db.getImagesNeedingDetection();
+            log.info("Found {} images needing object detection", pendingImages.size());
+
+            var processed = 0;
+            var detected = 0;
+
+            for (var image : pendingImages) {
+                if (processImage(image.id(), image.filePath(), image.lotId())) {
+                    processed++;
+                    // Re-fetch to check if labels were found
+                    var labels = db.getImageLabels(image.id());
+                    if (labels != null && !labels.isEmpty()) {
+                        detected++;
+                    }
                 }
             }
 
+            log.info("Processed {} images, detected objects in {}", processed, detected);
+
         } catch (SQLException e) {
-            System.err.println("Error processing pending images: " + e.getMessage());
+            log.error("Error processing pending images: {}", e.getMessage());
         }
     }
 }

@@ -54,9 +54,9 @@ public class QuarkusWorkflowScheduler {
             var lots = db.importLotsFromScraper();
             LOG.infof("   → Imported %d lots", lots.size());
 
-            // Import image URLs
-            var images = db.getUnprocessedImagesFromScraper();
-            LOG.infof("   → Found %d unprocessed images", images.size());
+            // Check for images needing detection
+            var images = db.getImagesNeedingDetection();
+            LOG.infof("   → Found %d images needing detection", images.size());
 
             var duration = System.currentTimeMillis() - start;
             LOG.infof("   ✓ Scraper import completed in %dms", duration);

@@ -78,63 +78,53 @@ public class QuarkusWorkflowScheduler {
     /**
      * Workflow 2: Process Pending Images
      * Cron: Every 1 hour (0 0 * * * ?)
-     * Purpose: Download images and run object detection
+     * Purpose: Run object detection on images already downloaded by scraper
      */
     @Scheduled(cron = "{auction.workflow.image-processing.cron}", identity = "image-processing")
     void processImages() {
         try {
             LOG.info("🖼️ [WORKFLOW 2] Processing pending images...");
             var start = System.currentTimeMillis();
 
-            // Get unprocessed images
-            var unprocessedImages = db.getUnprocessedImagesFromScraper();
-
-            if (unprocessedImages.isEmpty()) {
+            // Get images that have been downloaded but need object detection
+            var pendingImages = db.getImagesNeedingDetection();
+
+            if (pendingImages.isEmpty()) {
                 LOG.info("   → No pending images to process");
                 return;
             }
 
-            LOG.infof("   → Processing %d images", unprocessedImages.size());
+            LOG.infof("   → Processing %d images", pendingImages.size());
 
             var processed = 0;
             var detected = 0;
 
-            for (var imageRecord : unprocessedImages) {
+            for (var image : pendingImages) {
                 try {
-                    // Download image
-                    var filePath = imageProcessor.downloadImage(
-                        imageRecord.url(),
-                        imageRecord.saleId(),
-                        imageRecord.lotId()
-                    );
-
-                    if (filePath != null) {
-                        // Run object detection
-                        var labels = detector.detectObjects(filePath);
-
-                        // Save to database
-                        db.insertImage(imageRecord.lotId(), imageRecord.url(),
-                                       filePath, labels);
-
+                    // Run object detection on already-downloaded image
+                    if (imageProcessor.processImage(image.id(), image.filePath(), image.lotId())) {
                         processed++;
-                        if (!labels.isEmpty()) {
+
+                        // Check if objects were detected
+                        var labels = db.getImageLabels(image.id());
+                        if (labels != null && !labels.isEmpty()) {
                             detected++;
 
                             // Send notification for interesting detections
                             if (labels.size() >= 3) {
                                 notifier.sendNotification(
                                     String.format("Lot %d: Detected %s",
-                                        imageRecord.lotId(),
+                                        image.lotId(),
                                         String.join(", ", labels)),
                                     "Objects Detected",
                                     0
                                 );
                             }
                         }
                     }
 
-                    // Rate limiting
-                    Thread.sleep(500);
+                    // Rate limiting (lighter since no network I/O)
+                    Thread.sleep(100);
 
                 } catch (Exception e) {
                     LOG.warnf("   ⚠️ Failed to process image: %s", e.getMessage());

@@ -43,7 +43,7 @@ public class TroostwijkMonitor {
         db = new DatabaseService(databasePath);
         notifier = new NotificationService(notificationConfig);
         detector = new ObjectDetectionService(yoloCfgPath, yoloWeightsPath, classNamesPath);
-        imageProcessor = new ImageProcessingService(db, detector, httpClient);
+        imageProcessor = new ImageProcessingService(db, detector);
 
         db.ensureSchema();
     }

@@ -42,8 +42,7 @@ public class WorkflowOrchestrator {
 
         this.notifier = new NotificationService(notificationConfig);
         this.detector = new ObjectDetectionService(yoloCfg, yoloWeights, yoloClasses);
-        var httpClient = new RateLimitedHttpClient();
-        this.imageProcessor = new ImageProcessingService(db, detector, httpClient);
+        this.imageProcessor = new ImageProcessingService(db, detector);
 
         this.monitor = new TroostwijkMonitor(databasePath, notificationConfig,
                                              yoloCfg, yoloWeights, yoloClasses);

@@ -100,9 +99,9 @@ public class WorkflowOrchestrator {
             var lots = db.importLotsFromScraper();
             log.info("   → Imported {} lots", lots.size());
 
-            // Import image URLs
-            var images = db.getUnprocessedImagesFromScraper();
-            log.info("   → Found {} unprocessed images", images.size());
+            // Check for images needing detection
+            var images = db.getImagesNeedingDetection();
+            log.info("   → Found {} images needing detection", images.size());
 
             var duration = System.currentTimeMillis() - start;
             log.info("   ✓ Scraper import completed in {}ms\n", duration);

@@ -127,7 +126,7 @@ public class WorkflowOrchestrator {
     /**
      * Workflow 2: Process Pending Images
      * Frequency: Every 1 hour
-     * Purpose: Download images and run object detection
+     * Purpose: Run object detection on images already downloaded by scraper
      */
     private void scheduleImageProcessing() {
         scheduler.scheduleAtFixedRate(() -> {

@@ -135,55 +134,45 @@ public class WorkflowOrchestrator {
                 log.info("🖼️ [WORKFLOW 2] Processing pending images...");
                 var start = System.currentTimeMillis();
 
-                // Get unprocessed images
-                var unprocessedImages = db.getUnprocessedImagesFromScraper();
-
-                if (unprocessedImages.isEmpty()) {
+                // Get images that have been downloaded but need object detection
+                var pendingImages = db.getImagesNeedingDetection();
+
+                if (pendingImages.isEmpty()) {
                     log.info("   → No pending images to process\n");
                     return;
                 }
 
-                log.info("   → Processing {} images", unprocessedImages.size());
+                log.info("   → Processing {} images", pendingImages.size());
 
                 var processed = 0;
                 var detected = 0;
 
-                for (var imageRecord : unprocessedImages) {
+                for (var image : pendingImages) {
                     try {
-                        // Download image
-                        var filePath = imageProcessor.downloadImage(
-                            imageRecord.url(),
-                            imageRecord.saleId(),
-                            imageRecord.lotId()
-                        );
-
-                        if (filePath != null) {
-                            // Run object detection
-                            var labels = detector.detectObjects(filePath);
-
-                            // Save to database
-                            db.insertImage(imageRecord.lotId(), imageRecord.url(),
-                                           filePath, labels);
-
+                        // Run object detection on already-downloaded image
+                        if (imageProcessor.processImage(image.id(), image.filePath(), image.lotId())) {
                            processed++;
-                            if (!labels.isEmpty()) {
+
+                            // Check if objects were detected
+                            var labels = db.getImageLabels(image.id());
+                            if (labels != null && !labels.isEmpty()) {
                                 detected++;
 
                                 // Send notification for interesting detections
                                 if (labels.size() >= 3) {
                                     notifier.sendNotification(
                                         String.format("Lot %d: Detected %s",
-                                            imageRecord.lotId(),
+                                            image.lotId(),
                                             String.join(", ", labels)),
                                         "Objects Detected",
                                         0
                                     );
                                 }
                             }
                         }
 
-                        // Rate limiting
-                        Thread.sleep(500);
+                        // Rate limiting (lighter since no network I/O)
+                        Thread.sleep(100);
 
                     } catch (Exception e) {
                         log.info("   ⚠️ Failed to process image: {}", e.getMessage());