SCRAPER_REFACTOR_GUIDE.md (new file)
@@ -0,0 +1,399 @@
# Scraper Refactor Guide - Image Download Integration

## 🎯 Objective

Refactor the Troostwijk scraper to **download and store images locally**, eliminating the 57M+ duplicate image problem in the monitoring process.

## 📋 Current vs. New Architecture

### **Before** (Current Architecture)
```
┌──────────────┐         ┌──────────────┐         ┌──────────────┐
│   Scraper    │────────▶│   Database   │◀────────│   Monitor    │
│              │         │              │         │              │
│ Stores URLs  │         │ images table │         │ Downloads +  │
│ downloaded=0 │         │              │         │ Detection    │
└──────────────┘         └──────────────┘         └──────────────┘
                                                         │
                                                         ▼
                                                 57M+ duplicates!
```
### **After** (New Architecture)
```
┌──────────────┐         ┌──────────────┐         ┌──────────────┐
│   Scraper    │────────▶│   Database   │◀────────│   Monitor    │
│              │         │              │         │              │
│ Downloads +  │         │ images table │         │  Detection   │
│ Stores path  │         │ local_path ✓ │         │     Only     │
│ downloaded=1 │         │              │         │              │
└──────────────┘         └──────────────┘         └──────────────┘
                                                         │
                                                         ▼
                                                  No duplicates!
```
## 🗄️ Database Schema Changes

### Current Schema (ARCHITECTURE-TROOSTWIJK-SCRAPER.md:113-122)
```sql
CREATE TABLE images (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    lot_id TEXT,
    url TEXT,
    local_path TEXT,       -- Currently NULL
    downloaded INTEGER     -- Currently 0
    -- Missing: processed_at, labels (added by monitor)
);
```
### Required Schema (Already Compatible!)
```sql
CREATE TABLE images (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    lot_id TEXT,
    url TEXT,
    local_path TEXT,        -- ✅ SET by scraper after download
    downloaded INTEGER,     -- ✅ SET to 1 by scraper after download
    labels TEXT,            -- ⚠️ SET by monitor (object detection)
    processed_at INTEGER,   -- ⚠️ SET by monitor (timestamp)
    FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
);
```

**Good News**: The scraper's schema already has `local_path` and `downloaded` columns! You just need to populate them.
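Before flipping the config, you can sanity-check the columns with `PRAGMA table_info` (a minimal sketch; assumes the SQLite database lives at `/mnt/okcomputer/output/cache.db`, as elsewhere in this guide):

```python
import sqlite3

def verify_images_schema(db_path='/mnt/okcomputer/output/cache.db'):
    """Fail fast if the columns the scraper must populate are missing."""
    conn = sqlite3.connect(db_path)
    try:
        # Row layout of PRAGMA table_info: (cid, name, type, notnull, dflt_value, pk)
        columns = {row[1] for row in conn.execute("PRAGMA table_info(images)")}
    finally:
        conn.close()
    missing = {'local_path', 'downloaded'} - columns
    if missing:
        raise RuntimeError(f"images table is missing columns: {sorted(missing)}")
```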
## 🔧 Implementation Steps

### **Step 1: Enable Image Downloading in Configuration**

**File**: Your scraper's config file (e.g., `config.py` or environment variables)

```python
# Current setting
DOWNLOAD_IMAGES = False  # ❌ Change this!

# New setting
DOWNLOAD_IMAGES = True   # ✅ Enable downloads

# Image storage path
IMAGES_DIR = "/mnt/okcomputer/output/images"  # Or your preferred path
```
### **Step 2: Update Image Download Logic**

Based on ARCHITECTURE-TROOSTWIJK-SCRAPER.md:211-228, you already have the structure. Here's what needs to change:

**Current Code** (Conceptual):
```python
# Phase 3: Scrape lot details
def scrape_lot(lot_url):
    lot_data = parse_lot_page(lot_url)

    # Save lot to database
    db.insert_lot(lot_data)

    # Save image URLs to database (NOT DOWNLOADED)
    for img_url in lot_data['images']:
        db.execute("""
            INSERT INTO images (lot_id, url, downloaded)
            VALUES (?, ?, 0)
        """, (lot_data['lot_id'], img_url))
```
**New Code** (Required):
```python
import os
import time
from pathlib import Path
from urllib.parse import urlsplit

import requests

def scrape_lot(lot_url):
    lot_data = parse_lot_page(lot_url)

    # Save lot to database
    db.insert_lot(lot_data)

    # Download and save images
    for idx, img_url in enumerate(lot_data['images'], start=1):
        try:
            # Download image
            local_path = download_image(img_url, lot_data['lot_id'], idx)

            # Insert with local_path and downloaded=1
            db.execute("""
                INSERT INTO images (lot_id, url, local_path, downloaded)
                VALUES (?, ?, ?, 1)
                ON CONFLICT(lot_id, url) DO UPDATE SET
                    local_path = excluded.local_path,
                    downloaded = 1
            """, (lot_data['lot_id'], img_url, local_path))

            # Rate limiting (0.5s between downloads)
            time.sleep(0.5)

        except Exception as e:
            print(f"Failed to download {img_url}: {e}")
            # Still insert a record but mark as not downloaded;
            # DO NOTHING keeps the unique index (Step 3) happy on re-scrapes
            db.execute("""
                INSERT INTO images (lot_id, url, downloaded)
                VALUES (?, ?, 0)
                ON CONFLICT(lot_id, url) DO NOTHING
            """, (lot_data['lot_id'], img_url))

def download_image(image_url, lot_id, index):
    """
    Downloads an image and saves it to an organized directory structure.

    Args:
        image_url: Remote URL of the image
        lot_id: Lot identifier (e.g., "A1-28505-5")
        index: Image sequence number (1, 2, 3, ...)

    Returns:
        Absolute path to the saved file
    """
    # Create directory structure: /images/{lot_id}/
    images_dir = Path(os.getenv('IMAGES_DIR', '/mnt/okcomputer/output/images'))
    lot_dir = images_dir / lot_id
    lot_dir.mkdir(parents=True, exist_ok=True)

    # Determine file extension from the URL path (query string stripped)
    ext = Path(urlsplit(image_url).path).suffix or '.jpg'
    filename = f"{index:03d}{ext}"  # 001.jpg, 002.jpg, etc.
    local_path = lot_dir / filename

    # Download with timeout
    response = requests.get(image_url, timeout=10)
    response.raise_for_status()

    # Save to disk
    with open(local_path, 'wb') as f:
        f.write(response.content)

    return str(local_path.absolute())
```
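One caveat in `download_image` above: deriving the extension from the URL fails when the path carries no suffix. A sketch of a Content-Type fallback (a hypothetical helper, not part of the current scraper; it assumes you fetch the response first and name the file afterwards):

```python
import mimetypes
from pathlib import Path
from urllib.parse import urlsplit

def guess_extension(image_url, response):
    """Prefer the Content-Type header; fall back to the URL path, then .jpg."""
    content_type = response.headers.get('Content-Type', '').split(';')[0].strip()
    ext = mimetypes.guess_extension(content_type) if content_type else None
    if not ext or ext == '.jpe':  # normalize the odd .jpe mapping some platforms use for image/jpeg
        ext = Path(urlsplit(image_url).path).suffix or '.jpg'
    return ext
```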
### **Step 3: Add Unique Constraint to Prevent Duplicates**

**Migration SQL**:
```sql
-- Add unique constraint to prevent duplicate image records
CREATE UNIQUE INDEX IF NOT EXISTS idx_images_unique
ON images(lot_id, url);
```

Add this to your scraper's schema initialization:
```python
import sqlite3

def init_database():
    conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
    cursor = conn.cursor()

    # Existing table creation...
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS images (...)
    """)

    # Add unique constraint (NEW)
    cursor.execute("""
        CREATE UNIQUE INDEX IF NOT EXISTS idx_images_unique
        ON images(lot_id, url)
    """)

    conn.commit()
    conn.close()
```
### **Step 4: Handle Image Download Failures Gracefully**

```python
def download_with_retry(image_url, lot_id, index, max_retries=3):
    """Downloads an image with retry logic, reusing download_image (and the
    requests/time imports) from Step 2."""
    for attempt in range(max_retries):
        try:
            return download_image(image_url, lot_id, index)
        except requests.exceptions.RequestException as e:
            if attempt == max_retries - 1:
                print(f"Failed after {max_retries} attempts: {image_url}")
                return None  # Return None on failure
            print(f"Retry {attempt + 1}/{max_retries} for {image_url}")
            time.sleep(2 ** attempt)  # Exponential backoff
```
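Wiring this into the Step 2 loop: because `download_with_retry` returns `None` instead of raising, the caller branches on the result rather than catching exceptions (a sketch under the same schema assumptions as Step 2):

```python
for idx, img_url in enumerate(lot_data['images'], start=1):
    local_path = download_with_retry(img_url, lot_data['lot_id'], idx)
    if local_path:
        db.execute("""
            INSERT INTO images (lot_id, url, local_path, downloaded)
            VALUES (?, ?, ?, 1)
            ON CONFLICT(lot_id, url) DO UPDATE SET
                local_path = excluded.local_path,
                downloaded = 1
        """, (lot_data['lot_id'], img_url, local_path))
    else:
        # Leave downloaded=0 so a later run can retry this URL
        db.execute("""
            INSERT INTO images (lot_id, url, downloaded)
            VALUES (?, ?, 0)
            ON CONFLICT(lot_id, url) DO NOTHING
        """, (lot_data['lot_id'], img_url))
    time.sleep(0.5)  # rate limiting between downloads
```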
### **Step 5: Update Database Queries**

Make sure your INSERT uses `INSERT ... ON CONFLICT` to handle re-scraping:

```python
# Good: Handles re-scraping without duplicates
db.execute("""
    INSERT INTO images (lot_id, url, local_path, downloaded)
    VALUES (?, ?, ?, 1)
    ON CONFLICT(lot_id, url) DO UPDATE SET
        local_path = excluded.local_path,
        downloaded = 1
""", (lot_id, img_url, local_path))

# Bad: Creates duplicates on re-scrape
db.execute("""
    INSERT INTO images (lot_id, url, local_path, downloaded)
    VALUES (?, ?, ?, 1)
""", (lot_id, img_url, local_path))
```
## 📊 Expected Outcomes

### Before Refactor
```sql
SELECT COUNT(*) FROM images WHERE downloaded = 0;
-- Result: 57,376,293 (57M+ undownloaded!)

SELECT COUNT(*) FROM images WHERE local_path IS NOT NULL;
-- Result: 0 (no files downloaded)
```

### After Refactor
```sql
SELECT COUNT(*) FROM images WHERE downloaded = 1;
-- Result: ~16,807 (one per actual lot image)

SELECT COUNT(*) FROM images WHERE local_path IS NOT NULL;
-- Result: ~16,807 (all downloaded images have paths)

-- SQLite's COUNT(DISTINCT ...) takes a single expression,
-- so count distinct pairs via a subquery
SELECT COUNT(*) FROM (SELECT DISTINCT lot_id, url FROM images);
-- Result: ~16,807 (no duplicates!)
```
## 🚀 Deployment Checklist

### Pre-Deployment
- [ ] Back up current database: `cp cache.db cache.db.backup`
- [ ] Verify disk space: at least 10GB free for images
- [ ] Test the download function on 5 sample lots (see the smoke-test sketch below)
- [ ] Verify the `IMAGES_DIR` path exists and is writable
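For the sample-lot item above, a minimal smoke test (a sketch; assumes `download_image` from Step 2 is importable, and the lot IDs/URLs below are placeholders you replace with real rows from your database):

```python
from pathlib import Path

SAMPLE_IMAGES = [
    # (lot_id, image_url) - placeholders, replace with real values
    ('A1-28505-5', 'https://example.com/img/1.jpg'),
]

def smoke_test():
    for lot_id, url in SAMPLE_IMAGES:
        path = Path(download_image(url, lot_id, index=1))
        assert path.is_file() and path.stat().st_size > 0, f"empty file for {url}"
        print(f"OK {lot_id}: {path}")

if __name__ == '__main__':
    smoke_test()
```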
### Deployment
- [ ] Update configuration: `DOWNLOAD_IMAGES = True`
- [ ] Run the schema migration to add the unique index
- [ ] Deploy the updated scraper code
- [ ] Monitor the first 100 lots for errors
### Post-Deployment Verification
```sql
-- Check download success rate
SELECT
    COUNT(*) as total_images,
    SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded,
    SUM(CASE WHEN downloaded = 0 THEN 1 ELSE 0 END) as failed,
    ROUND(100.0 * SUM(downloaded) / COUNT(*), 2) as success_rate
FROM images;

-- Check for duplicates (should return 0 rows)
SELECT lot_id, url, COUNT(*) as dup_count
FROM images
GROUP BY lot_id, url
HAVING COUNT(*) > 1;

-- Verify file system
SELECT COUNT(*) FROM images
WHERE downloaded = 1
  AND local_path IS NOT NULL
  AND local_path != '';
```
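The last query only checks the `local_path` column; to confirm the files actually exist on disk, a small follow-up script (a sketch, same database-path assumption as earlier):

```python
import sqlite3
from pathlib import Path

def find_missing_files(db_path='/mnt/okcomputer/output/cache.db'):
    """Report rows marked downloaded whose file is absent from disk."""
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            "SELECT id, local_path FROM images "
            "WHERE downloaded = 1 AND local_path IS NOT NULL"
        ).fetchall()
    finally:
        conn.close()
    missing = [(img_id, p) for img_id, p in rows if not Path(p).is_file()]
    print(f"{len(missing)} of {len(rows)} downloaded images are missing on disk")
    return missing
```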
## 🔍 Monitoring Process Impact

The monitoring process (auctiora) will automatically:
- ✅ Stop downloading images (network I/O eliminated)
- ✅ Only run object detection on `local_path` files
- ✅ Query: `WHERE local_path IS NOT NULL AND (labels IS NULL OR labels = '')`
- ✅ Update only the `labels` and `processed_at` columns
**No changes needed in the monitoring process!** It's already updated to work with scraper-downloaded images.
## 🐛 Troubleshooting

### Problem: "No space left on device"
```bash
# Check disk usage
df -h /mnt/okcomputer/output/images

# Estimate needed space: ~100KB per image
# 16,807 images × 100KB = ~1.6GB
```
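A preflight check the scraper could run before downloading anything (a sketch using only the standard library; the 2GB threshold is an assumption derived from the ~1.6GB estimate above):

```python
import shutil

def check_disk_space(images_dir='/mnt/okcomputer/output/images',
                     required_bytes=2 * 1024**3):
    """Abort early if the images volume has less than ~2GB free."""
    free = shutil.disk_usage(images_dir).free
    if free < required_bytes:
        raise RuntimeError(
            f"Only {free / 1024**3:.1f} GB free on {images_dir}; need ~2 GB"
        )
```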
### Problem: "Permission denied" when writing images
|
||||||
|
```bash
|
||||||
|
# Fix permissions
|
||||||
|
chmod 755 /mnt/okcomputer/output/images
|
||||||
|
chown -R scraper_user:scraper_group /mnt/okcomputer/output/images
|
||||||
|
```
|
||||||
|
|
||||||
|
### Problem: Images downloading but not recorded in DB
```python
# Add logging
import logging
import os

logging.basicConfig(level=logging.INFO)

def download_image(image_url, lot_id, index):
    # ... directory and filename setup from Step 2 ...
    logging.info(f"Downloading {image_url} to {local_path}")
    # ... download code ...
    logging.info(f"Saved to {local_path}, size: {os.path.getsize(local_path)} bytes")
    return local_path
```
### Problem: Duplicate images after refactor
```sql
-- Find duplicates
SELECT lot_id, url, COUNT(*)
FROM images
GROUP BY lot_id, url
HAVING COUNT(*) > 1;

-- Clean up duplicates (keep newest)
DELETE FROM images
WHERE id NOT IN (
    SELECT MAX(id)
    FROM images
    GROUP BY lot_id, url
);
```
## 📈 Performance Comparison

| Metric | Before (Monitor Downloads) | After (Scraper Downloads) |
|--------|----------------------------|---------------------------|
| **Image records** | 57,376,293 | ~16,807 |
| **Duplicates** | 57,359,486 (99.97%!) | 0 |
| **Network I/O** | Monitor process | Scraper process |
| **Disk usage** | 0 (URLs only) | ~1.6GB (actual files) |
| **Processing speed** | 500ms/image (download + detect) | 100ms/image (detect only) |
| **Error handling** | Complex (download failures) | Simple (files exist) |
## 🎓 Code Examples by Language

### Python (Most Likely)
See **Step 2** above for the complete implementation.
## 📚 References

- **Current Scraper Architecture**: `wiki/ARCHITECTURE-TROOSTWIJK-SCRAPER.md`
- **Database Schema**: `wiki/DATABASE_ARCHITECTURE.md`
- **Monitor Changes**: See commit history for `ImageProcessingService.java`, `DatabaseService.java`
## ✅ Success Criteria

You'll know the refactor is successful when (a combined check for the database-side criteria follows this list):

1. ✅ The database query `SELECT COUNT(*) FROM images` returns ~16,807 (not 57M+)
2. ✅ All images have `downloaded = 1` and `local_path IS NOT NULL`
3. ✅ No duplicate records: `SELECT lot_id, url, COUNT(*) ... HAVING COUNT(*) > 1` returns 0 rows
4. ✅ Monitor logs show "Found N images needing detection" with reasonable numbers
5. ✅ Files exist at the paths in the `local_path` column
6. ✅ Monitor processing speed increases (100ms vs. 500ms per image)
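A minimal sketch of that combined check (same database-path assumption as earlier):

```python
import sqlite3

def check_success(db_path='/mnt/okcomputer/output/cache.db'):
    conn = sqlite3.connect(db_path)
    try:
        total = conn.execute("SELECT COUNT(*) FROM images").fetchone()[0]
        not_downloaded = conn.execute(
            "SELECT COUNT(*) FROM images WHERE downloaded = 0 OR local_path IS NULL"
        ).fetchone()[0]
        dupes = conn.execute(
            "SELECT COUNT(*) FROM (SELECT 1 FROM images "
            "GROUP BY lot_id, url HAVING COUNT(*) > 1)"
        ).fetchone()[0]
    finally:
        conn.close()
    print(f"total={total}, not downloaded={not_downloaded}, duplicate pairs={dupes}")
```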
---

**Questions?** Check the troubleshooting section or inspect the monitor's updated code in:
- `src/main/java/auctiora/ImageProcessingService.java`
- `src/main/java/auctiora/DatabaseService.java:695-719`
@@ -28,7 +28,7 @@ services:
       - AUCTION_WORKFLOW_CLOSING_ALERTS_CRON=0 */5 * * * ?
 
     volumes:
-      # Mount database and images directory
+      # Mount database and images directory1
       - shared-auction-data:/mnt/okcomputer/output
 
     labels:
@@ -68,10 +68,9 @@ public class AuctionMonitorProducer {
     @Singleton
     public ImageProcessingService produceImageProcessingService(
             DatabaseService db,
-            ObjectDetectionService detector,
-            RateLimitedHttpClient httpClient) {
+            ObjectDetectionService detector) {
 
         LOG.infof("Initializing ImageProcessingService");
-        return new ImageProcessingService(db, detector, httpClient);
+        return new ImageProcessingService(db, detector);
     }
 }
@@ -73,7 +73,8 @@ public class DatabaseService {
                 FOREIGN KEY (sale_id) REFERENCES auctions(auction_id)
                 )""");
 
-        // Images table (populated by this process)
+        // Images table (populated by external scraper with URLs and local_path)
+        // This process only adds labels via object detection
         stmt.execute("""
                 CREATE TABLE IF NOT EXISTS images (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -82,6 +83,7 @@ public class DatabaseService {
                 local_path TEXT,
                 labels TEXT,
                 processed_at INTEGER,
+                downloaded INTEGER DEFAULT 0,
                 FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
                 )""");
 
@@ -258,6 +260,7 @@ public class DatabaseService {
             var hasLabels = false;
             var hasLocalPath = false;
             var hasProcessedAt = false;
+            var hasDownloaded = false;
 
             while (rs.next()) {
                 var colName = rs.getString("name");
@@ -265,6 +268,7 @@ public class DatabaseService {
                     case "labels" -> hasLabels = true;
                     case "local_path" -> hasLocalPath = true;
                     case "processed_at" -> hasProcessedAt = true;
+                    case "downloaded" -> hasDownloaded = true;
                 }
             }
 
@@ -280,6 +284,10 @@ public class DatabaseService {
                 log.info("Migrating schema: Adding 'processed_at' column to images table");
                 stmt.execute("ALTER TABLE images ADD COLUMN processed_at INTEGER");
             }
+            if (!hasDownloaded) {
+                log.info("Migrating schema: Adding 'downloaded' column to images table");
+                stmt.execute("ALTER TABLE images ADD COLUMN downloaded INTEGER DEFAULT 0");
+            }
         } catch (SQLException e) {
             // Table might not exist yet, which is fine
             log.debug("Could not check images table schema: " + e.getMessage());
@@ -462,20 +470,36 @@ public class DatabaseService {
     }
 
     /**
-     * Inserts a new image record with object detection labels
+     * Updates the labels field for an image after object detection
      */
-    synchronized void insertImage(long lotId, String url, String filePath, List<String> labels) throws SQLException {
-        var sql = "INSERT INTO images (lot_id, url, local_path, labels, processed_at) VALUES (?, ?, ?, ?, ?)";
+    synchronized void updateImageLabels(int imageId, List<String> labels) throws SQLException {
+        var sql = "UPDATE images SET labels = ?, processed_at = ? WHERE id = ?";
         try (var conn = DriverManager.getConnection(this.url); var ps = conn.prepareStatement(sql)) {
-            ps.setLong(1, lotId);
-            ps.setString(2, url);
-            ps.setString(3, filePath);
-            ps.setString(4, String.join(",", labels));
-            ps.setLong(5, Instant.now().getEpochSecond());
+            ps.setString(1, String.join(",", labels));
+            ps.setLong(2, Instant.now().getEpochSecond());
+            ps.setInt(3, imageId);
             ps.executeUpdate();
         }
     }
 
+    /**
+     * Gets the labels for a specific image
+     */
+    synchronized List<String> getImageLabels(int imageId) throws SQLException {
+        var sql = "SELECT labels FROM images WHERE id = ?";
+        try (var conn = DriverManager.getConnection(this.url); var ps = conn.prepareStatement(sql)) {
+            ps.setInt(1, imageId);
+            var rs = ps.executeQuery();
+            if (rs.next()) {
+                var labelsStr = rs.getString("labels");
+                if (labelsStr != null && !labelsStr.isEmpty()) {
+                    return List.of(labelsStr.split(","));
+                }
+            }
+        }
+        return List.of();
+    }
+
     /**
      * Retrieves images for a specific lot
      */
@@ -671,44 +695,32 @@ public class DatabaseService {
     }
 
     /**
-     * Imports image URLs from scraper's schema.
-     * The scraper populates the images table with URLs but doesn't download them.
-     * This method retrieves undownloaded images for processing.
+     * Gets images that have been downloaded by the scraper but need object detection.
+     * Only returns images that have local_path set but no labels yet.
      *
-     * @return List of image URLs that need to be downloaded
+     * @return List of images needing object detection
      */
-    synchronized List<ImageImportRecord> getUnprocessedImagesFromScraper() throws SQLException {
-        List<ImageImportRecord> images = new ArrayList<>();
+    synchronized List<ImageDetectionRecord> getImagesNeedingDetection() throws SQLException {
+        List<ImageDetectionRecord> images = new ArrayList<>();
         var sql = """
-                SELECT i.lot_id, i.url, l.auction_id
+                SELECT i.id, i.lot_id, i.local_path
                 FROM images i
-                LEFT JOIN lots l ON i.lot_id = l.lot_id
-                WHERE i.downloaded = 0 OR i.local_path IS NULL
+                WHERE i.local_path IS NOT NULL
+                AND i.local_path != ''
+                AND (i.labels IS NULL OR i.labels = '')
                 """;
 
         try (var conn = DriverManager.getConnection(url); var stmt = conn.createStatement()) {
            var rs = stmt.executeQuery(sql);
            while (rs.next()) {
-                var lotIdStr = rs.getString("lot_id");
-                var auctionIdStr = rs.getString("auction_id");
-
-                var lotId = ScraperDataAdapter.extractNumericId(lotIdStr);
-                var saleId = ScraperDataAdapter.extractNumericId(auctionIdStr);
-
-                // Skip images with invalid IDs (0 indicates parsing failed)
-                if (lotId == 0L || saleId == 0L) {
-                    log.debug("Skipping image with invalid ID: lot_id={}, sale_id={}", lotId, saleId);
-                    continue;
-                }
-
-                images.add(new ImageImportRecord(
-                    lotId,
-                    saleId,
-                    rs.getString("url")
-                ));
+                images.add(new ImageDetectionRecord(
+                    rs.getInt("id"),
+                    rs.getLong("lot_id"),
+                    rs.getString("local_path")
+                ));
             }
         } catch (SQLException e) {
-            log.info("ℹ️ No unprocessed images found in scraper format");
+            log.info("ℹ️ No images needing detection found");
         }
 
         return images;
@@ -720,7 +732,7 @@ public class DatabaseService {
     record ImageRecord(int id, long lotId, String url, String filePath, String labels) { }
 
     /**
-     * Record for importing images from scraper format
+     * Record for images that need object detection processing
      */
-    record ImageImportRecord(long lotId, long saleId, String url) { }
+    record ImageDetectionRecord(int id, long lotId, String filePath) { }
 }
@@ -12,120 +12,78 @@ import java.util.List;
 
 /**
  * Service responsible for processing images from the IMAGES table.
- * Downloads images, performs object detection, and updates the database.
+ * Performs object detection on already-downloaded images and updates the database.
  *
- * This separates image processing concerns from scraping, allowing this project
- * to focus on enriching data scraped by the external process.
+ * NOTE: Image downloading is handled by the external scraper process.
+ * This service only performs object detection on images that already have local_path set.
  */
 @Slf4j
 class ImageProcessingService {
 
-    private final RateLimitedHttpClient httpClient;
     private final DatabaseService db;
     private final ObjectDetectionService detector;
 
-    ImageProcessingService(DatabaseService db, ObjectDetectionService detector, RateLimitedHttpClient httpClient) {
-        this.httpClient = httpClient;
+    ImageProcessingService(DatabaseService db, ObjectDetectionService detector) {
         this.db = db;
         this.detector = detector;
     }
 
     /**
-     * Downloads an image from the given URL to local storage.
-     * Images are organized by saleId/lotId for easy management.
+     * Processes a single image: runs object detection and updates labels in database.
      *
-     * @param imageUrl remote image URL
-     * @param saleId sale identifier
-     * @param lotId lot identifier
-     * @return absolute path to saved file or null on failure
+     * @param imageId database ID of the image record
+     * @param localPath local file path to the downloaded image
+     * @param lotId lot identifier (for logging)
+     * @return true if processing succeeded
      */
-    String downloadImage(String imageUrl, long saleId, long lotId) {
+    boolean processImage(int imageId, String localPath, long lotId) {
         try {
-            var response = httpClient.sendGetBytes(imageUrl);
+            // Run object detection on the local file
+            var labels = detector.detectObjects(localPath);
 
-            if (response != null && response.statusCode() == 200) {
-                // Use environment variable for cross-platform compatibility
-                var imagesPath = System.getenv().getOrDefault("AUCTION_IMAGES_PATH", "/mnt/okcomputer/output/images");
-                var baseDir = Paths.get(imagesPath);
-                var dir = baseDir.resolve(String.valueOf(saleId)).resolve(String.valueOf(lotId));
-                Files.createDirectories(dir);
+            // Update the database with detected labels
+            db.updateImageLabels(imageId, labels);
 
-                // Extract filename from URL
-                var fileName = imageUrl.substring(imageUrl.lastIndexOf('/') + 1);
-                // Remove query parameters if present
-                int queryIndex = fileName.indexOf('?');
-                if (queryIndex > 0) {
-                    fileName = fileName.substring(0, queryIndex);
-                }
-                var dest = dir.resolve(fileName);
-
-                Files.write(dest, response.body());
-                return dest.toAbsolutePath().toString();
-            }
-        } catch (IOException | InterruptedException e) {
-            System.err.println("Failed to download image " + imageUrl + ": " + e.getMessage());
-            if (e instanceof InterruptedException) {
-                Thread.currentThread().interrupt();
+            if (!labels.isEmpty()) {
+                log.info("   Lot {}: Detected {}", lotId, String.join(", ", labels));
             }
+
+            return true;
         } catch (Exception e) {
-            throw new RuntimeException(e);
+            log.error("   Failed to process image {}: {}", imageId, e.getMessage());
+            return false;
         }
-        return null;
     }
 
-    /**
-     * Processes images for a specific lot: downloads and runs object detection.
-     *
-     * @param lotId lot identifier
-     * @param saleId sale identifier
-     * @param imageUrls list of image URLs to process
-     */
-    void processImagesForLot(long lotId, long saleId, List<String> imageUrls) {
-        log.info("   Processing {} images for lot {}", imageUrls.size(), lotId);
-
-        for (var imgUrl : imageUrls) {
-            var fileName = downloadImage(imgUrl, saleId, lotId);
-
-            if (fileName != null) {
-                // Run object detection
-                var labels = detector.detectObjects(fileName);
-
-                // Save to database
-                try {
-                    db.insertImage(lotId, imgUrl, fileName, labels);
-
-                    if (!labels.isEmpty()) {
-                        log.info("   Detected: {}", String.join(", ", labels));
-                    }
-                } catch (SQLException e) {
-                    System.err.println("   Failed to save image to database: " + e.getMessage());
-                }
-            }
-        }
-    }
-
     /**
      * Batch processes all pending images in the database.
-     * Useful for processing images after the external scraper has populated lot data.
+     * Only processes images that have been downloaded by the scraper but haven't had object detection run yet.
      */
     void processPendingImages() {
         log.info("Processing pending images...");
 
         try {
-            var lots = db.getAllLots();
-            log.info("Found {} lots to check for images", lots.size());
+            var pendingImages = db.getImagesNeedingDetection();
+            log.info("Found {} images needing object detection", pendingImages.size());
 
-            for (var lot : lots) {
-                // Check if images already processed for this lot
-                var existingImages = db.getImagesForLot(lot.lotId());
+            var processed = 0;
+            var detected = 0;
 
-                if (existingImages.isEmpty()) {
-                    log.info("   Lot {} has no images yet - needs external scraper data", lot.lotId());
+            for (var image : pendingImages) {
+                if (processImage(image.id(), image.filePath(), image.lotId())) {
+                    processed++;
+                    // Re-fetch to check if labels were found
+                    var labels = db.getImageLabels(image.id());
+                    if (labels != null && !labels.isEmpty()) {
+                        detected++;
+                    }
                 }
             }
+
+            log.info("Processed {} images, detected objects in {}", processed, detected);
+
         } catch (SQLException e) {
-            System.err.println("Error processing pending images: " + e.getMessage());
+            log.error("Error processing pending images: {}", e.getMessage());
         }
     }
 }
@@ -54,9 +54,9 @@ public class QuarkusWorkflowScheduler {
         var lots = db.importLotsFromScraper();
         LOG.infof("   → Imported %d lots", lots.size());
 
-        // Import image URLs
-        var images = db.getUnprocessedImagesFromScraper();
-        LOG.infof("   → Found %d unprocessed images", images.size());
+        // Check for images needing detection
+        var images = db.getImagesNeedingDetection();
+        LOG.infof("   → Found %d images needing detection", images.size());
 
         var duration = System.currentTimeMillis() - start;
         LOG.infof("   ✓ Scraper import completed in %dms", duration);
@@ -78,7 +78,7 @@ public class QuarkusWorkflowScheduler {
     /**
      * Workflow 2: Process Pending Images
      * Cron: Every 1 hour (0 0 * * * ?)
-     * Purpose: Download images and run object detection
+     * Purpose: Run object detection on images already downloaded by scraper
     */
     @Scheduled(cron = "{auction.workflow.image-processing.cron}", identity = "image-processing")
     void processImages() {
@@ -86,55 +86,45 @@ public class QuarkusWorkflowScheduler {
         LOG.info("🖼️ [WORKFLOW 2] Processing pending images...");
         var start = System.currentTimeMillis();
 
-        // Get unprocessed images
-        var unprocessedImages = db.getUnprocessedImagesFromScraper();
+        // Get images that have been downloaded but need object detection
+        var pendingImages = db.getImagesNeedingDetection();
 
-        if (unprocessedImages.isEmpty()) {
+        if (pendingImages.isEmpty()) {
             LOG.info("   → No pending images to process");
             return;
         }
 
-        LOG.infof("   → Processing %d images", unprocessedImages.size());
+        LOG.infof("   → Processing %d images", pendingImages.size());
 
         var processed = 0;
         var detected = 0;
 
-        for (var imageRecord : unprocessedImages) {
+        for (var image : pendingImages) {
             try {
-                // Download image
-                var filePath = imageProcessor.downloadImage(
-                    imageRecord.url(),
-                    imageRecord.saleId(),
-                    imageRecord.lotId()
-                );
-
-                if (filePath != null) {
-                    // Run object detection
-                    var labels = detector.detectObjects(filePath);
-
-                    // Save to database
-                    db.insertImage(imageRecord.lotId(), imageRecord.url(),
-                        filePath, labels);
-
+                // Run object detection on already-downloaded image
+                if (imageProcessor.processImage(image.id(), image.filePath(), image.lotId())) {
                     processed++;
-                    if (!labels.isEmpty()) {
+                    // Check if objects were detected
+                    var labels = db.getImageLabels(image.id());
+                    if (labels != null && !labels.isEmpty()) {
                         detected++;
 
                         // Send notification for interesting detections
                         if (labels.size() >= 3) {
                             notifier.sendNotification(
                                 String.format("Lot %d: Detected %s",
-                                    imageRecord.lotId(),
+                                    image.lotId(),
                                     String.join(", ", labels)),
                                 "Objects Detected",
                                 0
                             );
                         }
                     }
                 }
 
-                // Rate limiting
-                Thread.sleep(500);
+                // Rate limiting (lighter since no network I/O)
+                Thread.sleep(100);
 
             } catch (Exception e) {
                 LOG.warnf("   ⚠️ Failed to process image: %s", e.getMessage());
@@ -43,7 +43,7 @@ public class TroostwijkMonitor {
         db = new DatabaseService(databasePath);
         notifier = new NotificationService(notificationConfig);
         detector = new ObjectDetectionService(yoloCfgPath, yoloWeightsPath, classNamesPath);
-        imageProcessor = new ImageProcessingService(db, detector, httpClient);
+        imageProcessor = new ImageProcessingService(db, detector);
 
         db.ensureSchema();
     }
@@ -42,8 +42,7 @@ public class WorkflowOrchestrator {
 
         this.notifier = new NotificationService(notificationConfig);
         this.detector = new ObjectDetectionService(yoloCfg, yoloWeights, yoloClasses);
-        var httpClient = new RateLimitedHttpClient();
-        this.imageProcessor = new ImageProcessingService(db, detector, httpClient);
+        this.imageProcessor = new ImageProcessingService(db, detector);
 
         this.monitor = new TroostwijkMonitor(databasePath, notificationConfig,
             yoloCfg, yoloWeights, yoloClasses);
@@ -100,9 +99,9 @@ public class WorkflowOrchestrator {
         var lots = db.importLotsFromScraper();
         log.info("   → Imported {} lots", lots.size());
 
-        // Import image URLs
-        var images = db.getUnprocessedImagesFromScraper();
-        log.info("   → Found {} unprocessed images", images.size());
+        // Check for images needing detection
+        var images = db.getImagesNeedingDetection();
+        log.info("   → Found {} images needing detection", images.size());
 
         var duration = System.currentTimeMillis() - start;
         log.info("   ✓ Scraper import completed in {}ms\n", duration);
@@ -127,7 +126,7 @@ public class WorkflowOrchestrator {
     /**
      * Workflow 2: Process Pending Images
      * Frequency: Every 1 hour
-     * Purpose: Download images and run object detection
+     * Purpose: Run object detection on images already downloaded by scraper
     */
     private void scheduleImageProcessing() {
         scheduler.scheduleAtFixedRate(() -> {
@@ -135,55 +134,45 @@ public class WorkflowOrchestrator {
             log.info("🖼️ [WORKFLOW 2] Processing pending images...");
             var start = System.currentTimeMillis();
 
-            // Get unprocessed images
-            var unprocessedImages = db.getUnprocessedImagesFromScraper();
+            // Get images that have been downloaded but need object detection
+            var pendingImages = db.getImagesNeedingDetection();
 
-            if (unprocessedImages.isEmpty()) {
+            if (pendingImages.isEmpty()) {
                 log.info("   → No pending images to process\n");
                 return;
             }
 
-            log.info("   → Processing {} images", unprocessedImages.size());
+            log.info("   → Processing {} images", pendingImages.size());
 
             var processed = 0;
             var detected = 0;
 
-            for (var imageRecord : unprocessedImages) {
+            for (var image : pendingImages) {
                 try {
-                    // Download image
-                    var filePath = imageProcessor.downloadImage(
-                        imageRecord.url(),
-                        imageRecord.saleId(),
-                        imageRecord.lotId()
-                    );
-
-                    if (filePath != null) {
-                        // Run object detection
-                        var labels = detector.detectObjects(filePath);
-
-                        // Save to database
-                        db.insertImage(imageRecord.lotId(), imageRecord.url(),
-                            filePath, labels);
-
+                    // Run object detection on already-downloaded image
+                    if (imageProcessor.processImage(image.id(), image.filePath(), image.lotId())) {
                        processed++;
-                        if (!labels.isEmpty()) {
+                        // Check if objects were detected
+                        var labels = db.getImageLabels(image.id());
+                        if (labels != null && !labels.isEmpty()) {
                            detected++;
 
                            // Send notification for interesting detections
                            if (labels.size() >= 3) {
                                notifier.sendNotification(
                                    String.format("Lot %d: Detected %s",
-                                        imageRecord.lotId(),
+                                        image.lotId(),
                                        String.join(", ", labels)),
                                    "Objects Detected",
                                    0
                                );
                            }
                        }
                    }
 
-                    // Rate limiting
-                    Thread.sleep(500);
+                    // Rate limiting (lighter since no network I/O)
+                    Thread.sleep(100);
 
                } catch (Exception e) {
                    log.info("   ⚠\uFE0F Failed to process image: {}", e.getMessage());