diff --git a/SCRAPER_REFACTOR_GUIDE.md b/SCRAPER_REFACTOR_GUIDE.md new file mode 100644 index 0000000..0a12851 --- /dev/null +++ b/SCRAPER_REFACTOR_GUIDE.md @@ -0,0 +1,399 @@ +# Scraper Refactor Guide - Image Download Integration + +## 🎯 Objective + +Refactor the Troostwijk scraper to **download and store images locally**, eliminating the 57M+ duplicate image problem in the monitoring process. + +## πŸ“‹ Current vs. New Architecture + +### **Before** (Current Architecture) +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Scraper │────────▢│ Database │◀────────│ Monitor β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ Stores URLs β”‚ β”‚ images table β”‚ β”‚ Downloads + β”‚ +β”‚ downloaded=0 β”‚ β”‚ β”‚ β”‚ Detection β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό + 57M+ duplicates! +``` + +### **After** (New Architecture) +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Scraper │────────▢│ Database │◀────────│ Monitor β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ Downloads + β”‚ β”‚ images table β”‚ β”‚ Detection β”‚ +β”‚ Stores path β”‚ β”‚ local_path βœ“ β”‚ β”‚ Only β”‚ +β”‚ downloaded=1 β”‚ β”‚ β”‚ β”‚ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό + No duplicates! +``` + +## πŸ—„οΈ Database Schema Changes + +### Current Schema (ARCHITECTURE-TROOSTWIJK-SCRAPER.md:113-122) +```sql +CREATE TABLE images ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + lot_id TEXT, + url TEXT, + local_path TEXT, -- Currently NULL + downloaded INTEGER -- Currently 0 + -- Missing: processed_at, labels (added by monitor) +); +``` + +### Required Schema (Already Compatible!) +```sql +CREATE TABLE images ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + lot_id TEXT, + url TEXT, + local_path TEXT, -- βœ… SET by scraper after download + downloaded INTEGER, -- βœ… SET to 1 by scraper after download + labels TEXT, -- ⚠️ SET by monitor (object detection) + processed_at INTEGER, -- ⚠️ SET by monitor (timestamp) + FOREIGN KEY (lot_id) REFERENCES lots(lot_id) +); +``` + +**Good News**: The scraper's schema already has `local_path` and `downloaded` columns! You just need to populate them. + +## πŸ”§ Implementation Steps + +### **Step 1: Enable Image Downloading in Configuration** + +**File**: Your scraper's config file (e.g., `config.py` or environment variables) + +```python +# Current setting +DOWNLOAD_IMAGES = False # ❌ Change this! + +# New setting +DOWNLOAD_IMAGES = True # βœ… Enable downloads + +# Image storage path +IMAGES_DIR = "/mnt/okcomputer/output/images" # Or your preferred path +``` + +### **Step 2: Update Image Download Logic** + +Based on ARCHITECTURE-TROOSTWIJK-SCRAPER.md:211-228, you already have the structure. Here's what needs to change: + +**Current Code** (Conceptual): +```python +# Phase 3: Scrape lot details +def scrape_lot(lot_url): + lot_data = parse_lot_page(lot_url) + + # Save lot to database + db.insert_lot(lot_data) + + # Save image URLs to database (NOT DOWNLOADED) + for img_url in lot_data['images']: + db.execute(""" + INSERT INTO images (lot_id, url, downloaded) + VALUES (?, ?, 0) + """, (lot_data['lot_id'], img_url)) +``` + +**New Code** (Required): +```python +import os +import requests +from pathlib import Path +import time + +def scrape_lot(lot_url): + lot_data = parse_lot_page(lot_url) + + # Save lot to database + db.insert_lot(lot_data) + + # Download and save images + for idx, img_url in enumerate(lot_data['images'], start=1): + try: + # Download image + local_path = download_image(img_url, lot_data['lot_id'], idx) + + # Insert with local_path and downloaded=1 + db.execute(""" + INSERT INTO images (lot_id, url, local_path, downloaded) + VALUES (?, ?, ?, 1) + ON CONFLICT(lot_id, url) DO UPDATE SET + local_path = excluded.local_path, + downloaded = 1 + """, (lot_data['lot_id'], img_url, local_path)) + + # Rate limiting (0.5s between downloads) + time.sleep(0.5) + + except Exception as e: + print(f"Failed to download {img_url}: {e}") + # Still insert record but mark as not downloaded + db.execute(""" + INSERT INTO images (lot_id, url, downloaded) + VALUES (?, ?, 0) + """, (lot_data['lot_id'], img_url)) + +def download_image(image_url, lot_id, index): + """ + Downloads an image and saves it to organized directory structure. + + Args: + image_url: Remote URL of the image + lot_id: Lot identifier (e.g., "A1-28505-5") + index: Image sequence number (1, 2, 3, ...) + + Returns: + Absolute path to saved file + """ + # Create directory structure: /images/{lot_id}/ + images_dir = Path(os.getenv('IMAGES_DIR', '/mnt/okcomputer/output/images')) + lot_dir = images_dir / lot_id + lot_dir.mkdir(parents=True, exist_ok=True) + + # Determine file extension from URL or content-type + ext = Path(image_url).suffix or '.jpg' + filename = f"{index:03d}{ext}" # 001.jpg, 002.jpg, etc. + local_path = lot_dir / filename + + # Download with timeout + response = requests.get(image_url, timeout=10) + response.raise_for_status() + + # Save to disk + with open(local_path, 'wb') as f: + f.write(response.content) + + return str(local_path.absolute()) +``` + +### **Step 3: Add Unique Constraint to Prevent Duplicates** + +**Migration SQL**: +```sql +-- Add unique constraint to prevent duplicate image records +CREATE UNIQUE INDEX IF NOT EXISTS idx_images_unique +ON images(lot_id, url); +``` + +Add this to your scraper's schema initialization: + +```python +def init_database(): + conn = sqlite3.connect('/mnt/okcomputer/output/cache.db') + cursor = conn.cursor() + + # Existing table creation... + cursor.execute(""" + CREATE TABLE IF NOT EXISTS images (...) + """) + + # Add unique constraint (NEW) + cursor.execute(""" + CREATE UNIQUE INDEX IF NOT EXISTS idx_images_unique + ON images(lot_id, url) + """) + + conn.commit() + conn.close() +``` + +### **Step 4: Handle Image Download Failures Gracefully** + +```python +def download_with_retry(image_url, lot_id, index, max_retries=3): + """Downloads image with retry logic.""" + for attempt in range(max_retries): + try: + return download_image(image_url, lot_id, index) + except requests.exceptions.RequestException as e: + if attempt == max_retries - 1: + print(f"Failed after {max_retries} attempts: {image_url}") + return None # Return None on failure + print(f"Retry {attempt + 1}/{max_retries} for {image_url}") + time.sleep(2 ** attempt) # Exponential backoff +``` + +### **Step 5: Update Database Queries** + +Make sure your INSERT uses `INSERT ... ON CONFLICT` to handle re-scraping: + +```python +# Good: Handles re-scraping without duplicates +db.execute(""" + INSERT INTO images (lot_id, url, local_path, downloaded) + VALUES (?, ?, ?, 1) + ON CONFLICT(lot_id, url) DO UPDATE SET + local_path = excluded.local_path, + downloaded = 1 +""", (lot_id, img_url, local_path)) + +# Bad: Creates duplicates on re-scrape +db.execute(""" + INSERT INTO images (lot_id, url, local_path, downloaded) + VALUES (?, ?, ?, 1) +""", (lot_id, img_url, local_path)) +``` + +## πŸ“Š Expected Outcomes + +### Before Refactor +```sql +SELECT COUNT(*) FROM images WHERE downloaded = 0; +-- Result: 57,376,293 (57M+ undownloaded!) + +SELECT COUNT(*) FROM images WHERE local_path IS NOT NULL; +-- Result: 0 (no files downloaded) +``` + +### After Refactor +```sql +SELECT COUNT(*) FROM images WHERE downloaded = 1; +-- Result: ~16,807 (one per actual lot image) + +SELECT COUNT(*) FROM images WHERE local_path IS NOT NULL; +-- Result: ~16,807 (all downloaded images have paths) + +SELECT COUNT(DISTINCT lot_id, url) FROM images; +-- Result: ~16,807 (no duplicates!) +``` + +## πŸš€ Deployment Checklist + +### Pre-Deployment +- [ ] Back up current database: `cp cache.db cache.db.backup` +- [ ] Verify disk space: At least 10GB free for images +- [ ] Test download function on 5 sample lots +- [ ] Verify `IMAGES_DIR` path exists and is writable + +### Deployment +- [ ] Update configuration: `DOWNLOAD_IMAGES = True` +- [ ] Run schema migration to add unique index +- [ ] Deploy updated scraper code +- [ ] Monitor first 100 lots for errors + +### Post-Deployment Verification +```sql +-- Check download success rate +SELECT + COUNT(*) as total_images, + SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded, + SUM(CASE WHEN downloaded = 0 THEN 1 ELSE 0 END) as failed, + ROUND(100.0 * SUM(downloaded) / COUNT(*), 2) as success_rate +FROM images; + +-- Check for duplicates (should be 0) +SELECT lot_id, url, COUNT(*) as dup_count +FROM images +GROUP BY lot_id, url +HAVING COUNT(*) > 1; + +-- Verify file system +SELECT COUNT(*) FROM images +WHERE downloaded = 1 + AND local_path IS NOT NULL + AND local_path != ''; +``` + +## πŸ” Monitoring Process Impact + +The monitoring process (auctiora) will automatically: +- βœ… Stop downloading images (network I/O eliminated) +- βœ… Only run object detection on `local_path` files +- βœ… Query: `WHERE local_path IS NOT NULL AND (labels IS NULL OR labels = '')` +- βœ… Update only the `labels` and `processed_at` columns + +**No changes needed in monitoring process!** It's already updated to work with scraper-downloaded images. + +## πŸ› Troubleshooting + +### Problem: "No space left on device" +```bash +# Check disk usage +df -h /mnt/okcomputer/output/images + +# Estimate needed space: ~100KB per image +# 16,807 images Γ— 100KB = ~1.6GB +``` + +### Problem: "Permission denied" when writing images +```bash +# Fix permissions +chmod 755 /mnt/okcomputer/output/images +chown -R scraper_user:scraper_group /mnt/okcomputer/output/images +``` + +### Problem: Images downloading but not recorded in DB +```python +# Add logging +import logging +logging.basicConfig(level=logging.INFO) + +def download_image(...): + logging.info(f"Downloading {image_url} to {local_path}") + # ... download code ... + logging.info(f"Saved to {local_path}, size: {os.path.getsize(local_path)} bytes") + return local_path +``` + +### Problem: Duplicate images after refactor +```sql +-- Find duplicates +SELECT lot_id, url, COUNT(*) +FROM images +GROUP BY lot_id, url +HAVING COUNT(*) > 1; + +-- Clean up duplicates (keep newest) +DELETE FROM images +WHERE id NOT IN ( + SELECT MAX(id) + FROM images + GROUP BY lot_id, url +); +``` + +## πŸ“ˆ Performance Comparison + +| Metric | Before (Monitor Downloads) | After (Scraper Downloads) | +|--------|---------------------------|---------------------------| +| **Image records** | 57,376,293 | ~16,807 | +| **Duplicates** | 57,359,486 (99.97%!) | 0 | +| **Network I/O** | Monitor process | Scraper process | +| **Disk usage** | 0 (URLs only) | ~1.6GB (actual files) | +| **Processing speed** | 500ms/image (download + detect) | 100ms/image (detect only) | +| **Error handling** | Complex (download failures) | Simple (files exist) | + +## πŸŽ“ Code Examples by Language + +### Python (Most Likely) +See **Step 2** above for complete implementation. + +## πŸ“š References + +- **Current Scraper Architecture**: `wiki/ARCHITECTURE-TROOSTWIJK-SCRAPER.md` +- **Database Schema**: `wiki/DATABASE_ARCHITECTURE.md` +- **Monitor Changes**: See commit history for `ImageProcessingService.java`, `DatabaseService.java` + +## βœ… Success Criteria + +You'll know the refactor is successful when: + +1. βœ… Database query `SELECT COUNT(*) FROM images` returns ~16,807 (not 57M+) +2. βœ… All images have `downloaded = 1` and `local_path IS NOT NULL` +3. βœ… No duplicate records: `SELECT lot_id, url, COUNT(*) ... HAVING COUNT(*) > 1` returns 0 rows +4. βœ… Monitor logs show "Found N images needing detection" with reasonable numbers +5. βœ… Files exist at paths in `local_path` column +6. βœ… Monitor process speed increases (100ms vs 500ms per image) + +--- + +**Questions?** Check the troubleshooting section or inspect the monitor's updated code in: +- `src/main/java/auctiora/ImageProcessingService.java` +- `src/main/java/auctiora/DatabaseService.java:695-719` diff --git a/docker-compose.yml b/docker-compose.yml index 8d816d7..56c0829 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -28,7 +28,7 @@ services: - AUCTION_WORKFLOW_CLOSING_ALERTS_CRON=0 */5 * * * ? volumes: - # Mount database and images directory + # Mount database and images directory1 - shared-auction-data:/mnt/okcomputer/output labels: diff --git a/src/main/java/auctiora/AuctionMonitorProducer.java b/src/main/java/auctiora/AuctionMonitorProducer.java index 5acd888..4c06517 100644 --- a/src/main/java/auctiora/AuctionMonitorProducer.java +++ b/src/main/java/auctiora/AuctionMonitorProducer.java @@ -68,10 +68,9 @@ public class AuctionMonitorProducer { @Singleton public ImageProcessingService produceImageProcessingService( DatabaseService db, - ObjectDetectionService detector, - RateLimitedHttpClient httpClient) { + ObjectDetectionService detector) { LOG.infof("Initializing ImageProcessingService"); - return new ImageProcessingService(db, detector, httpClient); + return new ImageProcessingService(db, detector); } } diff --git a/src/main/java/auctiora/DatabaseService.java b/src/main/java/auctiora/DatabaseService.java index 4dc32a6..3bd32e4 100644 --- a/src/main/java/auctiora/DatabaseService.java +++ b/src/main/java/auctiora/DatabaseService.java @@ -73,7 +73,8 @@ public class DatabaseService { FOREIGN KEY (sale_id) REFERENCES auctions(auction_id) )"""); - // Images table (populated by this process) + // Images table (populated by external scraper with URLs and local_path) + // This process only adds labels via object detection stmt.execute(""" CREATE TABLE IF NOT EXISTS images ( id INTEGER PRIMARY KEY AUTOINCREMENT, @@ -82,6 +83,7 @@ public class DatabaseService { local_path TEXT, labels TEXT, processed_at INTEGER, + downloaded INTEGER DEFAULT 0, FOREIGN KEY (lot_id) REFERENCES lots(lot_id) )"""); @@ -258,6 +260,7 @@ public class DatabaseService { var hasLabels = false; var hasLocalPath = false; var hasProcessedAt = false; + var hasDownloaded = false; while (rs.next()) { var colName = rs.getString("name"); @@ -265,6 +268,7 @@ public class DatabaseService { case "labels" -> hasLabels = true; case "local_path" -> hasLocalPath = true; case "processed_at" -> hasProcessedAt = true; + case "downloaded" -> hasDownloaded = true; } } @@ -280,6 +284,10 @@ public class DatabaseService { log.info("Migrating schema: Adding 'processed_at' column to images table"); stmt.execute("ALTER TABLE images ADD COLUMN processed_at INTEGER"); } + if (!hasDownloaded) { + log.info("Migrating schema: Adding 'downloaded' column to images table"); + stmt.execute("ALTER TABLE images ADD COLUMN downloaded INTEGER DEFAULT 0"); + } } catch (SQLException e) { // Table might not exist yet, which is fine log.debug("Could not check images table schema: " + e.getMessage()); @@ -462,19 +470,35 @@ public class DatabaseService { } /** - * Inserts a new image record with object detection labels + * Updates the labels field for an image after object detection */ - synchronized void insertImage(long lotId, String url, String filePath, List labels) throws SQLException { - var sql = "INSERT INTO images (lot_id, url, local_path, labels, processed_at) VALUES (?, ?, ?, ?, ?)"; + synchronized void updateImageLabels(int imageId, List labels) throws SQLException { + var sql = "UPDATE images SET labels = ?, processed_at = ? WHERE id = ?"; try (var conn = DriverManager.getConnection(this.url); var ps = conn.prepareStatement(sql)) { - ps.setLong(1, lotId); - ps.setString(2, url); - ps.setString(3, filePath); - ps.setString(4, String.join(",", labels)); - ps.setLong(5, Instant.now().getEpochSecond()); + ps.setString(1, String.join(",", labels)); + ps.setLong(2, Instant.now().getEpochSecond()); + ps.setInt(3, imageId); ps.executeUpdate(); } } + + /** + * Gets the labels for a specific image + */ + synchronized List getImageLabels(int imageId) throws SQLException { + var sql = "SELECT labels FROM images WHERE id = ?"; + try (var conn = DriverManager.getConnection(this.url); var ps = conn.prepareStatement(sql)) { + ps.setInt(1, imageId); + var rs = ps.executeQuery(); + if (rs.next()) { + var labelsStr = rs.getString("labels"); + if (labelsStr != null && !labelsStr.isEmpty()) { + return List.of(labelsStr.split(",")); + } + } + } + return List.of(); + } /** * Retrieves images for a specific lot @@ -671,46 +695,34 @@ public class DatabaseService { } /** - * Imports image URLs from scraper's schema. - * The scraper populates the images table with URLs but doesn't download them. - * This method retrieves undownloaded images for processing. + * Gets images that have been downloaded by the scraper but need object detection. + * Only returns images that have local_path set but no labels yet. * - * @return List of image URLs that need to be downloaded + * @return List of images needing object detection */ - synchronized List getUnprocessedImagesFromScraper() throws SQLException { - List images = new ArrayList<>(); + synchronized List getImagesNeedingDetection() throws SQLException { + List images = new ArrayList<>(); var sql = """ - SELECT i.lot_id, i.url, l.auction_id + SELECT i.id, i.lot_id, i.local_path FROM images i - LEFT JOIN lots l ON i.lot_id = l.lot_id - WHERE i.downloaded = 0 OR i.local_path IS NULL + WHERE i.local_path IS NOT NULL + AND i.local_path != '' + AND (i.labels IS NULL OR i.labels = '') """; - + try (var conn = DriverManager.getConnection(url); var stmt = conn.createStatement()) { var rs = stmt.executeQuery(sql); while (rs.next()) { - var lotIdStr = rs.getString("lot_id"); - var auctionIdStr = rs.getString("auction_id"); - - var lotId = ScraperDataAdapter.extractNumericId(lotIdStr); - var saleId = ScraperDataAdapter.extractNumericId(auctionIdStr); - - // Skip images with invalid IDs (0 indicates parsing failed) - if (lotId == 0L || saleId == 0L) { - log.debug("Skipping image with invalid ID: lot_id={}, sale_id={}", lotId, saleId); - continue; - } - - images.add(new ImageImportRecord( - lotId, - saleId, - rs.getString("url") + images.add(new ImageDetectionRecord( + rs.getInt("id"), + rs.getLong("lot_id"), + rs.getString("local_path") )); } } catch (SQLException e) { - log.info("ℹ️ No unprocessed images found in scraper format"); + log.info("ℹ️ No images needing detection found"); } - + return images; } @@ -718,9 +730,9 @@ public class DatabaseService { * Simple record for image data from database */ record ImageRecord(int id, long lotId, String url, String filePath, String labels) { } - + /** - * Record for importing images from scraper format + * Record for images that need object detection processing */ - record ImageImportRecord(long lotId, long saleId, String url) { } + record ImageDetectionRecord(int id, long lotId, String filePath) { } } diff --git a/src/main/java/auctiora/ImageProcessingService.java b/src/main/java/auctiora/ImageProcessingService.java index 28f5d21..bbc02c4 100644 --- a/src/main/java/auctiora/ImageProcessingService.java +++ b/src/main/java/auctiora/ImageProcessingService.java @@ -12,120 +12,78 @@ import java.util.List; /** * Service responsible for processing images from the IMAGES table. - * Downloads images, performs object detection, and updates the database. + * Performs object detection on already-downloaded images and updates the database. * - * This separates image processing concerns from scraping, allowing this project - * to focus on enriching data scraped by the external process. + * NOTE: Image downloading is handled by the external scraper process. + * This service only performs object detection on images that already have local_path set. */ @Slf4j class ImageProcessingService { - private final RateLimitedHttpClient httpClient; private final DatabaseService db; private final ObjectDetectionService detector; - ImageProcessingService(DatabaseService db, ObjectDetectionService detector, RateLimitedHttpClient httpClient) { - this.httpClient = httpClient; + ImageProcessingService(DatabaseService db, ObjectDetectionService detector) { this.db = db; this.detector = detector; } - - /** - * Downloads an image from the given URL to local storage. - * Images are organized by saleId/lotId for easy management. - * - * @param imageUrl remote image URL - * @param saleId sale identifier - * @param lotId lot identifier - * @return absolute path to saved file or null on failure - */ - String downloadImage(String imageUrl, long saleId, long lotId) { - try { - var response = httpClient.sendGetBytes(imageUrl); - if (response != null && response.statusCode() == 200) { - // Use environment variable for cross-platform compatibility - var imagesPath = System.getenv().getOrDefault("AUCTION_IMAGES_PATH", "/mnt/okcomputer/output/images"); - var baseDir = Paths.get(imagesPath); - var dir = baseDir.resolve(String.valueOf(saleId)).resolve(String.valueOf(lotId)); - Files.createDirectories(dir); - - // Extract filename from URL - var fileName = imageUrl.substring(imageUrl.lastIndexOf('/') + 1); - // Remove query parameters if present - int queryIndex = fileName.indexOf('?'); - if (queryIndex > 0) { - fileName = fileName.substring(0, queryIndex); - } - var dest = dir.resolve(fileName); - - Files.write(dest, response.body()); - return dest.toAbsolutePath().toString(); - } - } catch (IOException | InterruptedException e) { - System.err.println("Failed to download image " + imageUrl + ": " + e.getMessage()); - if (e instanceof InterruptedException) { - Thread.currentThread().interrupt(); - } - } catch (Exception e) { - throw new RuntimeException(e); - } - return null; - } - /** - * Processes images for a specific lot: downloads and runs object detection. + * Processes a single image: runs object detection and updates labels in database. * - * @param lotId lot identifier - * @param saleId sale identifier - * @param imageUrls list of image URLs to process + * @param imageId database ID of the image record + * @param localPath local file path to the downloaded image + * @param lotId lot identifier (for logging) + * @return true if processing succeeded */ - void processImagesForLot(long lotId, long saleId, List imageUrls) { - log.info(" Processing {} images for lot {}", imageUrls.size(), lotId); - - for (var imgUrl : imageUrls) { - var fileName = downloadImage(imgUrl, saleId, lotId); - - if (fileName != null) { - // Run object detection - var labels = detector.detectObjects(fileName); - - // Save to database - try { - db.insertImage(lotId, imgUrl, fileName, labels); - - if (!labels.isEmpty()) { - log.info(" Detected: {}", String.join(", ", labels)); - } - } catch (SQLException e) { - System.err.println(" Failed to save image to database: " + e.getMessage()); - } + boolean processImage(int imageId, String localPath, long lotId) { + try { + // Run object detection on the local file + var labels = detector.detectObjects(localPath); + + // Update the database with detected labels + db.updateImageLabels(imageId, labels); + + if (!labels.isEmpty()) { + log.info(" Lot {}: Detected {}", lotId, String.join(", ", labels)); } + + return true; + } catch (Exception e) { + log.error(" Failed to process image {}: {}", imageId, e.getMessage()); + return false; } } /** * Batch processes all pending images in the database. - * Useful for processing images after the external scraper has populated lot data. + * Only processes images that have been downloaded by the scraper but haven't had object detection run yet. */ void processPendingImages() { log.info("Processing pending images..."); - + try { - var lots = db.getAllLots(); - log.info("Found {} lots to check for images", lots.size()); - - for (var lot : lots) { - // Check if images already processed for this lot - var existingImages = db.getImagesForLot(lot.lotId()); - - if (existingImages.isEmpty()) { - log.info(" Lot {} has no images yet - needs external scraper data", lot.lotId()); + var pendingImages = db.getImagesNeedingDetection(); + log.info("Found {} images needing object detection", pendingImages.size()); + + var processed = 0; + var detected = 0; + + for (var image : pendingImages) { + if (processImage(image.id(), image.filePath(), image.lotId())) { + processed++; + // Re-fetch to check if labels were found + var labels = db.getImageLabels(image.id()); + if (labels != null && !labels.isEmpty()) { + detected++; + } } } - + + log.info("Processed {} images, detected objects in {}", processed, detected); + } catch (SQLException e) { - System.err.println("Error processing pending images: " + e.getMessage()); + log.error("Error processing pending images: {}", e.getMessage()); } } } diff --git a/src/main/java/auctiora/QuarkusWorkflowScheduler.java b/src/main/java/auctiora/QuarkusWorkflowScheduler.java index f306ee0..ed9cdb0 100644 --- a/src/main/java/auctiora/QuarkusWorkflowScheduler.java +++ b/src/main/java/auctiora/QuarkusWorkflowScheduler.java @@ -54,9 +54,9 @@ public class QuarkusWorkflowScheduler { var lots = db.importLotsFromScraper(); LOG.infof(" β†’ Imported %d lots", lots.size()); - // Import image URLs - var images = db.getUnprocessedImagesFromScraper(); - LOG.infof(" β†’ Found %d unprocessed images", images.size()); + // Check for images needing detection + var images = db.getImagesNeedingDetection(); + LOG.infof(" β†’ Found %d images needing detection", images.size()); var duration = System.currentTimeMillis() - start; LOG.infof(" βœ“ Scraper import completed in %dms", duration); @@ -78,63 +78,53 @@ public class QuarkusWorkflowScheduler { /** * Workflow 2: Process Pending Images * Cron: Every 1 hour (0 0 * * * ?) - * Purpose: Download images and run object detection + * Purpose: Run object detection on images already downloaded by scraper */ @Scheduled(cron = "{auction.workflow.image-processing.cron}", identity = "image-processing") void processImages() { try { LOG.info("πŸ–ΌοΈ [WORKFLOW 2] Processing pending images..."); var start = System.currentTimeMillis(); - - // Get unprocessed images - var unprocessedImages = db.getUnprocessedImagesFromScraper(); - - if (unprocessedImages.isEmpty()) { + + // Get images that have been downloaded but need object detection + var pendingImages = db.getImagesNeedingDetection(); + + if (pendingImages.isEmpty()) { LOG.info(" β†’ No pending images to process"); return; } - - LOG.infof(" β†’ Processing %d images", unprocessedImages.size()); - + + LOG.infof(" β†’ Processing %d images", pendingImages.size()); + var processed = 0; var detected = 0; - - for (var imageRecord : unprocessedImages) { + + for (var image : pendingImages) { try { - // Download image - var filePath = imageProcessor.downloadImage( - imageRecord.url(), - imageRecord.saleId(), - imageRecord.lotId() - ); - - if (filePath != null) { - // Run object detection - var labels = detector.detectObjects(filePath); - - // Save to database - db.insertImage(imageRecord.lotId(), imageRecord.url(), - filePath, labels); - + // Run object detection on already-downloaded image + if (imageProcessor.processImage(image.id(), image.filePath(), image.lotId())) { processed++; - if (!labels.isEmpty()) { + + // Check if objects were detected + var labels = db.getImageLabels(image.id()); + if (labels != null && !labels.isEmpty()) { detected++; - + // Send notification for interesting detections if (labels.size() >= 3) { notifier.sendNotification( String.format("Lot %d: Detected %s", - imageRecord.lotId(), + image.lotId(), String.join(", ", labels)), "Objects Detected", 0 - ); + ); } } } - - // Rate limiting - Thread.sleep(500); + + // Rate limiting (lighter since no network I/O) + Thread.sleep(100); } catch (Exception e) { LOG.warnf(" ⚠️ Failed to process image: %s", e.getMessage()); diff --git a/src/main/java/auctiora/TroostwijkMonitor.java b/src/main/java/auctiora/TroostwijkMonitor.java index 90f6763..e04ff2d 100644 --- a/src/main/java/auctiora/TroostwijkMonitor.java +++ b/src/main/java/auctiora/TroostwijkMonitor.java @@ -43,7 +43,7 @@ public class TroostwijkMonitor { db = new DatabaseService(databasePath); notifier = new NotificationService(notificationConfig); detector = new ObjectDetectionService(yoloCfgPath, yoloWeightsPath, classNamesPath); - imageProcessor = new ImageProcessingService(db, detector, httpClient); + imageProcessor = new ImageProcessingService(db, detector); db.ensureSchema(); } diff --git a/src/main/java/auctiora/WorkflowOrchestrator.java b/src/main/java/auctiora/WorkflowOrchestrator.java index 0458402..b9c2c40 100644 --- a/src/main/java/auctiora/WorkflowOrchestrator.java +++ b/src/main/java/auctiora/WorkflowOrchestrator.java @@ -42,8 +42,7 @@ public class WorkflowOrchestrator { this.notifier = new NotificationService(notificationConfig); this.detector = new ObjectDetectionService(yoloCfg, yoloWeights, yoloClasses); - var httpClient = new RateLimitedHttpClient(); - this.imageProcessor = new ImageProcessingService(db, detector, httpClient); + this.imageProcessor = new ImageProcessingService(db, detector); this.monitor = new TroostwijkMonitor(databasePath, notificationConfig, yoloCfg, yoloWeights, yoloClasses); @@ -100,9 +99,9 @@ public class WorkflowOrchestrator { var lots = db.importLotsFromScraper(); log.info(" β†’ Imported {} lots", lots.size()); - // Import image URLs - var images = db.getUnprocessedImagesFromScraper(); - log.info(" β†’ Found {} unprocessed images", images.size()); + // Check for images needing detection + var images = db.getImagesNeedingDetection(); + log.info(" β†’ Found {} images needing detection", images.size()); var duration = System.currentTimeMillis() - start; log.info(" βœ“ Scraper import completed in {}ms\n", duration); @@ -127,7 +126,7 @@ public class WorkflowOrchestrator { /** * Workflow 2: Process Pending Images * Frequency: Every 1 hour - * Purpose: Download images and run object detection + * Purpose: Run object detection on images already downloaded by scraper */ private void scheduleImageProcessing() { scheduler.scheduleAtFixedRate(() -> { @@ -135,55 +134,45 @@ public class WorkflowOrchestrator { log.info("πŸ–ΌοΈ [WORKFLOW 2] Processing pending images..."); var start = System.currentTimeMillis(); - // Get unprocessed images - var unprocessedImages = db.getUnprocessedImagesFromScraper(); - - if (unprocessedImages.isEmpty()) { + // Get images that have been downloaded but need object detection + var pendingImages = db.getImagesNeedingDetection(); + + if (pendingImages.isEmpty()) { log.info(" β†’ No pending images to process\n"); return; } - - log.info(" β†’ Processing {} images", unprocessedImages.size()); - + + log.info(" β†’ Processing {} images", pendingImages.size()); + var processed = 0; var detected = 0; - - for (var imageRecord : unprocessedImages) { + + for (var image : pendingImages) { try { - // Download image - var filePath = imageProcessor.downloadImage( - imageRecord.url(), - imageRecord.saleId(), - imageRecord.lotId() - ); - - if (filePath != null) { - // Run object detection - var labels = detector.detectObjects(filePath); - - // Save to database - db.insertImage(imageRecord.lotId(), imageRecord.url(), - filePath, labels); - + // Run object detection on already-downloaded image + if (imageProcessor.processImage(image.id(), image.filePath(), image.lotId())) { processed++; - if (!labels.isEmpty()) { + + // Check if objects were detected + var labels = db.getImageLabels(image.id()); + if (labels != null && !labels.isEmpty()) { detected++; - + // Send notification for interesting detections if (labels.size() >= 3) { notifier.sendNotification( String.format("Lot %d: Detected %s", - imageRecord.lotId(), + image.lotId(), String.join(", ", labels)), "Objects Detected", 0 - ); + ); } } } - - // Rate limiting - Thread.sleep(500); + + // Rate limiting (lighter since no network I/O) + Thread.sleep(100); } catch (Exception e) { log.info(" ⚠\uFE0F Failed to process image: {}", e.getMessage()); diff --git a/troostwijk.db b/troostwijk.db deleted file mode 100644 index e69de29..0000000