SCRAPER_REFACTOR_GUIDE.md — new file (399 lines)
@@ -0,0 +1,399 @@
# Scraper Refactor Guide - Image Download Integration

## 🎯 Objective

Refactor the Troostwijk scraper to **download and store images locally**, eliminating the 57M+ duplicate image problem in the monitoring process.

## 📋 Current vs. New Architecture

### **Before** (Current Architecture)
```
┌──────────────┐         ┌──────────────┐         ┌──────────────┐
│   Scraper    │────────▶│   Database   │◀────────│   Monitor    │
│              │         │              │         │              │
│ Stores URLs  │         │ images table │         │ Downloads +  │
│ downloaded=0 │         │              │         │  Detection   │
└──────────────┘         └──────────────┘         └──────────────┘
                                                         │
                                                         ▼
                                                 57M+ duplicates!
```

### **After** (New Architecture)
```
┌──────────────┐         ┌──────────────┐         ┌──────────────┐
│   Scraper    │────────▶│   Database   │◀────────│   Monitor    │
│              │         │              │         │              │
│ Downloads +  │         │ images table │         │  Detection   │
│ Stores path  │         │ local_path ✓ │         │    Only      │
│ downloaded=1 │         │              │         │              │
└──────────────┘         └──────────────┘         └──────────────┘
                                                         │
                                                         ▼
                                                  No duplicates!
```

## 🗄️ Database Schema Changes

### Current Schema (ARCHITECTURE-TROOSTWIJK-SCRAPER.md:113-122)
```sql
CREATE TABLE images (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    lot_id TEXT,
    url TEXT,
    local_path TEXT,       -- Currently NULL
    downloaded INTEGER     -- Currently 0
    -- Missing: processed_at, labels (added by monitor)
);
```

### Required Schema (Already Compatible!)
```sql
CREATE TABLE images (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    lot_id TEXT,
    url TEXT,
    local_path TEXT,        -- ✅ SET by scraper after download
    downloaded INTEGER,     -- ✅ SET to 1 by scraper after download
    labels TEXT,            -- ⚠️ SET by monitor (object detection)
    processed_at INTEGER,   -- ⚠️ SET by monitor (timestamp)
    FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
);
```

**Good News**: The scraper's schema already has `local_path` and `downloaded` columns! You just need to populate them.
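
You can confirm this on your own database before touching any code; SQLite's `PRAGMA table_info` lists the columns, and `local_path` and `downloaded` should both appear:

```sql
-- Inspect the images table; expect local_path and downloaded among the columns
PRAGMA table_info(images);
```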

## 🔧 Implementation Steps

### **Step 1: Enable Image Downloading in Configuration**

**File**: Your scraper's config file (e.g., `config.py` or environment variables)

```python
# Current setting
DOWNLOAD_IMAGES = False  # ❌ Change this!

# New setting
DOWNLOAD_IMAGES = True  # ✅ Enable downloads

# Image storage path
IMAGES_DIR = "/mnt/okcomputer/output/images"  # Or your preferred path
```

### **Step 2: Update Image Download Logic**

Based on ARCHITECTURE-TROOSTWIJK-SCRAPER.md:211-228, you already have the structure. Here's what needs to change:

**Current Code** (Conceptual):
```python
# Phase 3: Scrape lot details
def scrape_lot(lot_url):
    lot_data = parse_lot_page(lot_url)

    # Save lot to database
    db.insert_lot(lot_data)

    # Save image URLs to database (NOT DOWNLOADED)
    for img_url in lot_data['images']:
        db.execute("""
            INSERT INTO images (lot_id, url, downloaded)
            VALUES (?, ?, 0)
        """, (lot_data['lot_id'], img_url))
```

**New Code** (Required):
```python
import os
import time
from pathlib import Path
from urllib.parse import urlparse

import requests


def scrape_lot(lot_url):
    lot_data = parse_lot_page(lot_url)

    # Save lot to database
    db.insert_lot(lot_data)

    # Download and save images
    for idx, img_url in enumerate(lot_data['images'], start=1):
        try:
            # Download image
            local_path = download_image(img_url, lot_data['lot_id'], idx)

            # Insert with local_path and downloaded=1
            # (ON CONFLICT requires the unique index from Step 3)
            db.execute("""
                INSERT INTO images (lot_id, url, local_path, downloaded)
                VALUES (?, ?, ?, 1)
                ON CONFLICT(lot_id, url) DO UPDATE SET
                    local_path = excluded.local_path,
                    downloaded = 1
            """, (lot_data['lot_id'], img_url, local_path))

            # Rate limiting (0.5s between downloads)
            time.sleep(0.5)

        except Exception as e:
            print(f"Failed to download {img_url}: {e}")
            # Still insert a record, but mark it as not downloaded;
            # DO NOTHING keeps an existing row intact on re-scrape
            db.execute("""
                INSERT INTO images (lot_id, url, downloaded)
                VALUES (?, ?, 0)
                ON CONFLICT(lot_id, url) DO NOTHING
            """, (lot_data['lot_id'], img_url))


def download_image(image_url, lot_id, index):
    """
    Downloads an image and saves it to an organized directory structure.

    Args:
        image_url: Remote URL of the image
        lot_id: Lot identifier (e.g., "A1-28505-5")
        index: Image sequence number (1, 2, 3, ...)

    Returns:
        Absolute path to the saved file
    """
    # Create directory structure: /images/{lot_id}/
    images_dir = Path(os.getenv('IMAGES_DIR', '/mnt/okcomputer/output/images'))
    lot_dir = images_dir / lot_id
    lot_dir.mkdir(parents=True, exist_ok=True)

    # Determine file extension from the URL path (ignoring query parameters)
    ext = Path(urlparse(image_url).path).suffix or '.jpg'
    filename = f"{index:03d}{ext}"  # 001.jpg, 002.jpg, etc.
    local_path = lot_dir / filename

    # Download with timeout
    response = requests.get(image_url, timeout=10)
    response.raise_for_status()

    # Save to disk
    with open(local_path, 'wb') as f:
        f.write(response.content)

    return str(local_path.absolute())
```

### **Step 3: Add Unique Constraint to Prevent Duplicates**

**Migration SQL**:
```sql
-- Add unique constraint to prevent duplicate image records
CREATE UNIQUE INDEX IF NOT EXISTS idx_images_unique
ON images(lot_id, url);
```

Add this to your scraper's schema initialization:

```python
import sqlite3


def init_database():
    conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
    cursor = conn.cursor()

    # Existing table creation...
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS images (...)
    """)

    # Add unique constraint (NEW)
    cursor.execute("""
        CREATE UNIQUE INDEX IF NOT EXISTS idx_images_unique
        ON images(lot_id, url)
    """)

    conn.commit()
    conn.close()
```
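
One caveat: if the table already contains duplicate `(lot_id, url)` rows — the very problem this guide fixes — `CREATE UNIQUE INDEX` will fail with a constraint error. Run a one-time cleanup first (the same dedupe statement used in Troubleshooting below):

```sql
-- One-time cleanup: keep only the newest record per (lot_id, url),
-- so the unique index can be created afterwards
DELETE FROM images
WHERE id NOT IN (
    SELECT MAX(id)
    FROM images
    GROUP BY lot_id, url
);
```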

### **Step 4: Handle Image Download Failures Gracefully**

```python
def download_with_retry(image_url, lot_id, index, max_retries=3):
    """Downloads an image with retry logic."""
    for attempt in range(max_retries):
        try:
            return download_image(image_url, lot_id, index)
        except requests.exceptions.RequestException:
            if attempt == max_retries - 1:
                print(f"Failed after {max_retries} attempts: {image_url}")
                return None  # Return None on failure
            print(f"Retry {attempt + 1}/{max_retries} for {image_url}")
            time.sleep(2 ** attempt)  # Exponential backoff
```
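
To wire this in, the `download_image` call in `scrape_lot` (Step 2) can be swapped for the retry wrapper; a sketch of the changed loop body, using the same upsert as Step 2:

```python
# Inside the image loop of scrape_lot (Step 2)
local_path = download_with_retry(img_url, lot_data['lot_id'], idx)

if local_path is not None:
    db.execute("""
        INSERT INTO images (lot_id, url, local_path, downloaded)
        VALUES (?, ?, ?, 1)
        ON CONFLICT(lot_id, url) DO UPDATE SET
            local_path = excluded.local_path,
            downloaded = 1
    """, (lot_data['lot_id'], img_url, local_path))
else:
    # Record the failure so the row can be retried on a later scrape
    db.execute("""
        INSERT INTO images (lot_id, url, downloaded)
        VALUES (?, ?, 0)
        ON CONFLICT(lot_id, url) DO NOTHING
    """, (lot_data['lot_id'], img_url))
```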

### **Step 5: Update Database Queries**

Make sure your INSERT uses `INSERT ... ON CONFLICT` to handle re-scraping:

```python
# Good: Handles re-scraping without duplicates
db.execute("""
    INSERT INTO images (lot_id, url, local_path, downloaded)
    VALUES (?, ?, ?, 1)
    ON CONFLICT(lot_id, url) DO UPDATE SET
        local_path = excluded.local_path,
        downloaded = 1
""", (lot_id, img_url, local_path))

# Bad: Raises a constraint error on re-scrape
# (or, without the Step 3 index, silently creates duplicates)
db.execute("""
    INSERT INTO images (lot_id, url, local_path, downloaded)
    VALUES (?, ?, ?, 1)
""", (lot_id, img_url, local_path))
```

## 📊 Expected Outcomes

### Before Refactor
```sql
SELECT COUNT(*) FROM images WHERE downloaded = 0;
-- Result: 57,376,293 (57M+ undownloaded!)

SELECT COUNT(*) FROM images WHERE local_path IS NOT NULL;
-- Result: 0 (no files downloaded)
```

### After Refactor
```sql
SELECT COUNT(*) FROM images WHERE downloaded = 1;
-- Result: ~16,807 (one per actual lot image)

SELECT COUNT(*) FROM images WHERE local_path IS NOT NULL;
-- Result: ~16,807 (all downloaded images have paths)

SELECT COUNT(*) FROM (SELECT DISTINCT lot_id, url FROM images);
-- Result: ~16,807 (no duplicates!)
```

## 🚀 Deployment Checklist

### Pre-Deployment
- [ ] Back up current database: `cp cache.db cache.db.backup`
- [ ] Verify disk space: at least 10GB free for images
- [ ] Test download function on 5 sample lots (see the sketch after this list)
- [ ] Verify `IMAGES_DIR` path exists and is writable
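
A minimal smoke-test sketch for that third item, assuming the `download_image` function from Step 2; `sample_lots` stands in for however your scraper fetches a few parsed lots:

```python
def smoke_test_downloads(sample_lots):
    """Download images for up to 5 lots and report per-URL results."""
    ok, failed = 0, 0
    for lot in sample_lots[:5]:
        for idx, img_url in enumerate(lot['images'], start=1):
            try:
                path = download_image(img_url, lot['lot_id'], idx)
                print(f"OK  {img_url} -> {path}")
                ok += 1
            except Exception as e:
                print(f"ERR {img_url}: {e}")
                failed += 1
    print(f"{ok} downloaded, {failed} failed")
```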

### Deployment
- [ ] Update configuration: `DOWNLOAD_IMAGES = True`
- [ ] Run schema migration to add unique index
- [ ] Deploy updated scraper code
- [ ] Monitor first 100 lots for errors

### Post-Deployment Verification
```sql
-- Check download success rate
SELECT
    COUNT(*) as total_images,
    SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded,
    SUM(CASE WHEN downloaded = 0 THEN 1 ELSE 0 END) as failed,
    ROUND(100.0 * SUM(downloaded) / COUNT(*), 2) as success_rate
FROM images;

-- Check for duplicates (should return 0 rows)
SELECT lot_id, url, COUNT(*) as dup_count
FROM images
GROUP BY lot_id, url
HAVING COUNT(*) > 1;

-- Verify downloaded rows have paths recorded
SELECT COUNT(*) FROM images
WHERE downloaded = 1
  AND local_path IS NOT NULL
  AND local_path != '';
```
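
The SQL above only confirms that paths were *recorded*; to verify the files actually exist on disk, a small cross-check sketch (standard library only, assuming the default `cache.db` location):

```python
import sqlite3
from pathlib import Path

# Cross-check: every downloaded=1 row should point at an existing file
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
rows = conn.execute(
    "SELECT id, local_path FROM images WHERE downloaded = 1"
).fetchall()
missing = [(img_id, p) for img_id, p in rows if not p or not Path(p).is_file()]
print(f"{len(rows)} downloaded records, {len(missing)} missing files")
conn.close()
```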

## 🔍 Monitoring Process Impact

The monitoring process (auctiora) will automatically:
- ✅ Stop downloading images (network I/O eliminated)
- ✅ Only run object detection on `local_path` files
- ✅ Query: `WHERE local_path IS NOT NULL AND (labels IS NULL OR labels = '')`
- ✅ Update only the `labels` and `processed_at` columns

**No changes needed in the monitoring process!** It has already been updated to work with scraper-downloaded images.

## 🐛 Troubleshooting

### Problem: "No space left on device"
```bash
# Check disk usage
df -h /mnt/okcomputer/output/images

# Estimate needed space: ~100KB per image
# 16,807 images × 100KB ≈ 1.6GB
```

### Problem: "Permission denied" when writing images
```bash
# Fix permissions
chmod 755 /mnt/okcomputer/output/images
chown -R scraper_user:scraper_group /mnt/okcomputer/output/images
```

### Problem: Images downloading but not recorded in DB
```python
# Add logging
import logging
logging.basicConfig(level=logging.INFO)

def download_image(...):
    logging.info(f"Downloading {image_url} to {local_path}")
    # ... download code ...
    logging.info(f"Saved to {local_path}, size: {os.path.getsize(local_path)} bytes")
    return local_path
```

### Problem: Duplicate images after refactor
```sql
-- Find duplicates
SELECT lot_id, url, COUNT(*)
FROM images
GROUP BY lot_id, url
HAVING COUNT(*) > 1;

-- Clean up duplicates (keep newest)
DELETE FROM images
WHERE id NOT IN (
    SELECT MAX(id)
    FROM images
    GROUP BY lot_id, url
);
```

## 📈 Performance Comparison

| Metric | Before (Monitor Downloads) | After (Scraper Downloads) |
|--------|----------------------------|---------------------------|
| **Image records** | 57,376,293 | ~16,807 |
| **Duplicates** | 57,359,486 (99.97%!) | 0 |
| **Network I/O** | Monitor process | Scraper process |
| **Disk usage** | 0 (URLs only) | ~1.6GB (actual files) |
| **Processing speed** | 500ms/image (download + detect) | 100ms/image (detect only) |
| **Error handling** | Complex (download failures) | Simple (files exist) |

## 🎓 Code Examples by Language

### Python (Most Likely)
See **Step 2** above for the complete implementation.

## 📚 References

- **Current Scraper Architecture**: `wiki/ARCHITECTURE-TROOSTWIJK-SCRAPER.md`
- **Database Schema**: `wiki/DATABASE_ARCHITECTURE.md`
- **Monitor Changes**: See commit history for `ImageProcessingService.java`, `DatabaseService.java`

## ✅ Success Criteria

You'll know the refactor is successful when:

1. ✅ Database query `SELECT COUNT(*) FROM images` returns ~16,807 (not 57M+)
2. ✅ All images have `downloaded = 1` and `local_path IS NOT NULL`
3. ✅ No duplicate records: `SELECT lot_id, url, COUNT(*) ... HAVING COUNT(*) > 1` returns 0 rows
4. ✅ Monitor logs show "Found N images needing detection" with reasonable numbers
5. ✅ Files exist at the paths in the `local_path` column
6. ✅ Monitor process speed increases (100ms vs 500ms per image)

---

**Questions?** Check the troubleshooting section or inspect the monitor's updated code in:
- `src/main/java/auctiora/ImageProcessingService.java`
- `src/main/java/auctiora/DatabaseService.java:695-719`

@@ -28,7 +28,7 @@ services:
       - AUCTION_WORKFLOW_CLOSING_ALERTS_CRON=0 */5 * * * ?
 
     volumes:
-      # Mount database and images directory
+      # Mount database and images directory1
       - shared-auction-data:/mnt/okcomputer/output
 
     labels:

@@ -68,10 +68,9 @@ public class AuctionMonitorProducer {
     @Singleton
     public ImageProcessingService produceImageProcessingService(
             DatabaseService db,
-            ObjectDetectionService detector,
-            RateLimitedHttpClient httpClient) {
+            ObjectDetectionService detector) {
 
         LOG.infof("Initializing ImageProcessingService");
-        return new ImageProcessingService(db, detector, httpClient);
+        return new ImageProcessingService(db, detector);
     }
 }

@@ -73,7 +73,8 @@ public class DatabaseService {
                 FOREIGN KEY (sale_id) REFERENCES auctions(auction_id)
             )""");
 
-        // Images table (populated by this process)
+        // Images table (populated by external scraper with URLs and local_path)
+        // This process only adds labels via object detection
         stmt.execute("""
             CREATE TABLE IF NOT EXISTS images (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -82,6 +83,7 @@ public class DatabaseService {
                 local_path TEXT,
                 labels TEXT,
                 processed_at INTEGER,
+                downloaded INTEGER DEFAULT 0,
                 FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
             )""");

@@ -258,6 +260,7 @@ public class DatabaseService {
             var hasLabels = false;
             var hasLocalPath = false;
             var hasProcessedAt = false;
+            var hasDownloaded = false;
 
             while (rs.next()) {
                 var colName = rs.getString("name");
@@ -265,6 +268,7 @@ public class DatabaseService {
                     case "labels" -> hasLabels = true;
                     case "local_path" -> hasLocalPath = true;
                     case "processed_at" -> hasProcessedAt = true;
+                    case "downloaded" -> hasDownloaded = true;
                 }
             }

@@ -280,6 +284,10 @@ public class DatabaseService {
                 log.info("Migrating schema: Adding 'processed_at' column to images table");
                 stmt.execute("ALTER TABLE images ADD COLUMN processed_at INTEGER");
             }
+            if (!hasDownloaded) {
+                log.info("Migrating schema: Adding 'downloaded' column to images table");
+                stmt.execute("ALTER TABLE images ADD COLUMN downloaded INTEGER DEFAULT 0");
+            }
         } catch (SQLException e) {
             // Table might not exist yet, which is fine
             log.debug("Could not check images table schema: " + e.getMessage());

@@ -462,19 +470,35 @@ public class DatabaseService {
     }
 
     /**
-     * Inserts a new image record with object detection labels
+     * Updates the labels field for an image after object detection
      */
-    synchronized void insertImage(long lotId, String url, String filePath, List<String> labels) throws SQLException {
-        var sql = "INSERT INTO images (lot_id, url, local_path, labels, processed_at) VALUES (?, ?, ?, ?, ?)";
+    synchronized void updateImageLabels(int imageId, List<String> labels) throws SQLException {
+        var sql = "UPDATE images SET labels = ?, processed_at = ? WHERE id = ?";
         try (var conn = DriverManager.getConnection(this.url); var ps = conn.prepareStatement(sql)) {
-            ps.setLong(1, lotId);
-            ps.setString(2, url);
-            ps.setString(3, filePath);
-            ps.setString(4, String.join(",", labels));
-            ps.setLong(5, Instant.now().getEpochSecond());
+            ps.setString(1, String.join(",", labels));
+            ps.setLong(2, Instant.now().getEpochSecond());
+            ps.setInt(3, imageId);
             ps.executeUpdate();
         }
     }
 
+    /**
+     * Gets the labels for a specific image
+     */
+    synchronized List<String> getImageLabels(int imageId) throws SQLException {
+        var sql = "SELECT labels FROM images WHERE id = ?";
+        try (var conn = DriverManager.getConnection(this.url); var ps = conn.prepareStatement(sql)) {
+            ps.setInt(1, imageId);
+            var rs = ps.executeQuery();
+            if (rs.next()) {
+                var labelsStr = rs.getString("labels");
+                if (labelsStr != null && !labelsStr.isEmpty()) {
+                    return List.of(labelsStr.split(","));
+                }
+            }
+        }
+        return List.of();
+    }
+
     /**
      * Retrieves images for a specific lot

@@ -671,46 +695,34 @@ public class DatabaseService {
     }
 
     /**
-     * Imports image URLs from scraper's schema.
-     * The scraper populates the images table with URLs but doesn't download them.
-     * This method retrieves undownloaded images for processing.
+     * Gets images that have been downloaded by the scraper but need object detection.
+     * Only returns images that have local_path set but no labels yet.
      *
-     * @return List of image URLs that need to be downloaded
+     * @return List of images needing object detection
      */
-    synchronized List<ImageImportRecord> getUnprocessedImagesFromScraper() throws SQLException {
-        List<ImageImportRecord> images = new ArrayList<>();
+    synchronized List<ImageDetectionRecord> getImagesNeedingDetection() throws SQLException {
+        List<ImageDetectionRecord> images = new ArrayList<>();
         var sql = """
-                SELECT i.lot_id, i.url, l.auction_id
+                SELECT i.id, i.lot_id, i.local_path
                 FROM images i
-                LEFT JOIN lots l ON i.lot_id = l.lot_id
-                WHERE i.downloaded = 0 OR i.local_path IS NULL
+                WHERE i.local_path IS NOT NULL
+                  AND i.local_path != ''
+                  AND (i.labels IS NULL OR i.labels = '')
                 """;
 
         try (var conn = DriverManager.getConnection(url); var stmt = conn.createStatement()) {
             var rs = stmt.executeQuery(sql);
             while (rs.next()) {
-                var lotIdStr = rs.getString("lot_id");
-                var auctionIdStr = rs.getString("auction_id");
-
-                var lotId = ScraperDataAdapter.extractNumericId(lotIdStr);
-                var saleId = ScraperDataAdapter.extractNumericId(auctionIdStr);
-
-                // Skip images with invalid IDs (0 indicates parsing failed)
-                if (lotId == 0L || saleId == 0L) {
-                    log.debug("Skipping image with invalid ID: lot_id={}, sale_id={}", lotId, saleId);
-                    continue;
-                }
-
-                images.add(new ImageImportRecord(
-                    lotId,
-                    saleId,
-                    rs.getString("url")
+                images.add(new ImageDetectionRecord(
+                    rs.getInt("id"),
+                    rs.getLong("lot_id"),
+                    rs.getString("local_path")
                 ));
             }
         } catch (SQLException e) {
-            log.info("ℹ️ No unprocessed images found in scraper format");
+            log.info("ℹ️ No images needing detection found");
         }
 
         return images;
     }

@@ -718,9 +730,9 @@ public class DatabaseService {
      * Simple record for image data from database
      */
     record ImageRecord(int id, long lotId, String url, String filePath, String labels) { }
 
     /**
-     * Record for importing images from scraper format
+     * Record for images that need object detection processing
      */
-    record ImageImportRecord(long lotId, long saleId, String url) { }
+    record ImageDetectionRecord(int id, long lotId, String filePath) { }
 }

@@ -12,120 +12,78 @@ import java.util.List;
 
 /**
  * Service responsible for processing images from the IMAGES table.
- * Downloads images, performs object detection, and updates the database.
+ * Performs object detection on already-downloaded images and updates the database.
  *
- * This separates image processing concerns from scraping, allowing this project
- * to focus on enriching data scraped by the external process.
+ * NOTE: Image downloading is handled by the external scraper process.
+ * This service only performs object detection on images that already have local_path set.
  */
 @Slf4j
 class ImageProcessingService {
 
-    private final RateLimitedHttpClient httpClient;
     private final DatabaseService db;
     private final ObjectDetectionService detector;
 
-    ImageProcessingService(DatabaseService db, ObjectDetectionService detector, RateLimitedHttpClient httpClient) {
-        this.httpClient = httpClient;
+    ImageProcessingService(DatabaseService db, ObjectDetectionService detector) {
         this.db = db;
         this.detector = detector;
     }
 
     /**
-     * Downloads an image from the given URL to local storage.
-     * Images are organized by saleId/lotId for easy management.
-     *
-     * @param imageUrl remote image URL
-     * @param saleId   sale identifier
-     * @param lotId    lot identifier
-     * @return absolute path to saved file or null on failure
-     */
-    String downloadImage(String imageUrl, long saleId, long lotId) {
-        try {
-            var response = httpClient.sendGetBytes(imageUrl);
-
-            if (response != null && response.statusCode() == 200) {
-                // Use environment variable for cross-platform compatibility
-                var imagesPath = System.getenv().getOrDefault("AUCTION_IMAGES_PATH", "/mnt/okcomputer/output/images");
-                var baseDir = Paths.get(imagesPath);
-                var dir = baseDir.resolve(String.valueOf(saleId)).resolve(String.valueOf(lotId));
-                Files.createDirectories(dir);
-
-                // Extract filename from URL
-                var fileName = imageUrl.substring(imageUrl.lastIndexOf('/') + 1);
-                // Remove query parameters if present
-                int queryIndex = fileName.indexOf('?');
-                if (queryIndex > 0) {
-                    fileName = fileName.substring(0, queryIndex);
-                }
-                var dest = dir.resolve(fileName);
-
-                Files.write(dest, response.body());
-                return dest.toAbsolutePath().toString();
-            }
-        } catch (IOException | InterruptedException e) {
-            System.err.println("Failed to download image " + imageUrl + ": " + e.getMessage());
-            if (e instanceof InterruptedException) {
-                Thread.currentThread().interrupt();
-            }
-        } catch (Exception e) {
-            throw new RuntimeException(e);
-        }
-        return null;
-    }
-
-    /**
-     * Processes images for a specific lot: downloads and runs object detection.
+     * Processes a single image: runs object detection and updates labels in database.
      *
-     * @param lotId     lot identifier
-     * @param saleId    sale identifier
-     * @param imageUrls list of image URLs to process
+     * @param imageId   database ID of the image record
+     * @param localPath local file path to the downloaded image
+     * @param lotId     lot identifier (for logging)
+     * @return true if processing succeeded
      */
-    void processImagesForLot(long lotId, long saleId, List<String> imageUrls) {
-        log.info("  Processing {} images for lot {}", imageUrls.size(), lotId);
-
-        for (var imgUrl : imageUrls) {
-            var fileName = downloadImage(imgUrl, saleId, lotId);
-
-            if (fileName != null) {
-                // Run object detection
-                var labels = detector.detectObjects(fileName);
-
-                // Save to database
-                try {
-                    db.insertImage(lotId, imgUrl, fileName, labels);
-
-                    if (!labels.isEmpty()) {
-                        log.info("    Detected: {}", String.join(", ", labels));
-                    }
-                } catch (SQLException e) {
-                    System.err.println("    Failed to save image to database: " + e.getMessage());
-                }
+    boolean processImage(int imageId, String localPath, long lotId) {
+        try {
+            // Run object detection on the local file
+            var labels = detector.detectObjects(localPath);
+
+            // Update the database with detected labels
+            db.updateImageLabels(imageId, labels);
+
+            if (!labels.isEmpty()) {
+                log.info("    Lot {}: Detected {}", lotId, String.join(", ", labels));
             }
+
+            return true;
+        } catch (Exception e) {
+            log.error("    Failed to process image {}: {}", imageId, e.getMessage());
+            return false;
         }
     }
 
     /**
      * Batch processes all pending images in the database.
-     * Useful for processing images after the external scraper has populated lot data.
+     * Only processes images that have been downloaded by the scraper but haven't had object detection run yet.
      */
     void processPendingImages() {
         log.info("Processing pending images...");
 
         try {
-            var lots = db.getAllLots();
-            log.info("Found {} lots to check for images", lots.size());
-
-            for (var lot : lots) {
-                // Check if images already processed for this lot
-                var existingImages = db.getImagesForLot(lot.lotId());
-
-                if (existingImages.isEmpty()) {
-                    log.info("  Lot {} has no images yet - needs external scraper data", lot.lotId());
+            var pendingImages = db.getImagesNeedingDetection();
+            log.info("Found {} images needing object detection", pendingImages.size());
+
+            var processed = 0;
+            var detected = 0;
+
+            for (var image : pendingImages) {
+                if (processImage(image.id(), image.filePath(), image.lotId())) {
+                    processed++;
+                    // Re-fetch to check if labels were found
+                    var labels = db.getImageLabels(image.id());
+                    if (labels != null && !labels.isEmpty()) {
+                        detected++;
+                    }
                 }
             }
 
+            log.info("Processed {} images, detected objects in {}", processed, detected);
+
         } catch (SQLException e) {
-            System.err.println("Error processing pending images: " + e.getMessage());
+            log.error("Error processing pending images: {}", e.getMessage());
         }
     }
 }

@@ -54,9 +54,9 @@ public class QuarkusWorkflowScheduler {
             var lots = db.importLotsFromScraper();
             LOG.infof("   → Imported %d lots", lots.size());
 
-            // Import image URLs
-            var images = db.getUnprocessedImagesFromScraper();
-            LOG.infof("   → Found %d unprocessed images", images.size());
+            // Check for images needing detection
+            var images = db.getImagesNeedingDetection();
+            LOG.infof("   → Found %d images needing detection", images.size());
 
             var duration = System.currentTimeMillis() - start;
             LOG.infof("   ✓ Scraper import completed in %dms", duration);

@@ -78,63 +78,53 @@ public class QuarkusWorkflowScheduler {
     /**
      * Workflow 2: Process Pending Images
      * Cron: Every 1 hour (0 0 * * * ?)
-     * Purpose: Download images and run object detection
+     * Purpose: Run object detection on images already downloaded by scraper
      */
     @Scheduled(cron = "{auction.workflow.image-processing.cron}", identity = "image-processing")
     void processImages() {
         try {
             LOG.info("🖼️ [WORKFLOW 2] Processing pending images...");
             var start = System.currentTimeMillis();
 
-            // Get unprocessed images
-            var unprocessedImages = db.getUnprocessedImagesFromScraper();
-
-            if (unprocessedImages.isEmpty()) {
+            // Get images that have been downloaded but need object detection
+            var pendingImages = db.getImagesNeedingDetection();
+
+            if (pendingImages.isEmpty()) {
                 LOG.info("   → No pending images to process");
                 return;
             }
 
-            LOG.infof("   → Processing %d images", unprocessedImages.size());
+            LOG.infof("   → Processing %d images", pendingImages.size());
 
             var processed = 0;
             var detected = 0;
 
-            for (var imageRecord : unprocessedImages) {
+            for (var image : pendingImages) {
                 try {
-                    // Download image
-                    var filePath = imageProcessor.downloadImage(
-                        imageRecord.url(),
-                        imageRecord.saleId(),
-                        imageRecord.lotId()
-                    );
-
-                    if (filePath != null) {
-                        // Run object detection
-                        var labels = detector.detectObjects(filePath);
-
-                        // Save to database
-                        db.insertImage(imageRecord.lotId(), imageRecord.url(),
-                                       filePath, labels);
-
+                    // Run object detection on already-downloaded image
+                    if (imageProcessor.processImage(image.id(), image.filePath(), image.lotId())) {
                         processed++;
-                        if (!labels.isEmpty()) {
+
+                        // Check if objects were detected
+                        var labels = db.getImageLabels(image.id());
+                        if (labels != null && !labels.isEmpty()) {
                             detected++;
 
                             // Send notification for interesting detections
                             if (labels.size() >= 3) {
                                 notifier.sendNotification(
                                     String.format("Lot %d: Detected %s",
-                                        imageRecord.lotId(),
+                                        image.lotId(),
                                         String.join(", ", labels)),
                                     "Objects Detected",
                                     0
                                 );
                             }
                         }
                     }
 
-                    // Rate limiting
-                    Thread.sleep(500);
+                    // Rate limiting (lighter since no network I/O)
+                    Thread.sleep(100);
 
                 } catch (Exception e) {
                     LOG.warnf("   ⚠️ Failed to process image: %s", e.getMessage());

@@ -43,7 +43,7 @@ public class TroostwijkMonitor {
         db = new DatabaseService(databasePath);
         notifier = new NotificationService(notificationConfig);
         detector = new ObjectDetectionService(yoloCfgPath, yoloWeightsPath, classNamesPath);
-        imageProcessor = new ImageProcessingService(db, detector, httpClient);
+        imageProcessor = new ImageProcessingService(db, detector);
 
         db.ensureSchema();
     }

@@ -42,8 +42,7 @@ public class WorkflowOrchestrator {
 
         this.notifier = new NotificationService(notificationConfig);
         this.detector = new ObjectDetectionService(yoloCfg, yoloWeights, yoloClasses);
-        var httpClient = new RateLimitedHttpClient();
-        this.imageProcessor = new ImageProcessingService(db, detector, httpClient);
+        this.imageProcessor = new ImageProcessingService(db, detector);
 
         this.monitor = new TroostwijkMonitor(databasePath, notificationConfig,
                                              yoloCfg, yoloWeights, yoloClasses);

@@ -100,9 +99,9 @@ public class WorkflowOrchestrator {
             var lots = db.importLotsFromScraper();
             log.info("   → Imported {} lots", lots.size());
 
-            // Import image URLs
-            var images = db.getUnprocessedImagesFromScraper();
-            log.info("   → Found {} unprocessed images", images.size());
+            // Check for images needing detection
+            var images = db.getImagesNeedingDetection();
+            log.info("   → Found {} images needing detection", images.size());
 
             var duration = System.currentTimeMillis() - start;
             log.info("   ✓ Scraper import completed in {}ms\n", duration);

@@ -127,7 +126,7 @@ public class WorkflowOrchestrator {
     /**
      * Workflow 2: Process Pending Images
      * Frequency: Every 1 hour
-     * Purpose: Download images and run object detection
+     * Purpose: Run object detection on images already downloaded by scraper
      */
     private void scheduleImageProcessing() {
         scheduler.scheduleAtFixedRate(() -> {

@@ -135,55 +134,45 @@ public class WorkflowOrchestrator {
                 log.info("🖼️ [WORKFLOW 2] Processing pending images...");
                 var start = System.currentTimeMillis();
 
-                // Get unprocessed images
-                var unprocessedImages = db.getUnprocessedImagesFromScraper();
-
-                if (unprocessedImages.isEmpty()) {
+                // Get images that have been downloaded but need object detection
+                var pendingImages = db.getImagesNeedingDetection();
+
+                if (pendingImages.isEmpty()) {
                     log.info("   → No pending images to process\n");
                     return;
                 }
 
-                log.info("   → Processing {} images", unprocessedImages.size());
+                log.info("   → Processing {} images", pendingImages.size());
 
                 var processed = 0;
                 var detected = 0;
 
-                for (var imageRecord : unprocessedImages) {
+                for (var image : pendingImages) {
                     try {
-                        // Download image
-                        var filePath = imageProcessor.downloadImage(
-                            imageRecord.url(),
-                            imageRecord.saleId(),
-                            imageRecord.lotId()
-                        );
-
-                        if (filePath != null) {
-                            // Run object detection
-                            var labels = detector.detectObjects(filePath);
-
-                            // Save to database
-                            db.insertImage(imageRecord.lotId(), imageRecord.url(),
-                                           filePath, labels);
-
+                        // Run object detection on already-downloaded image
+                        if (imageProcessor.processImage(image.id(), image.filePath(), image.lotId())) {
                            processed++;
-                            if (!labels.isEmpty()) {
+
+                            // Check if objects were detected
+                            var labels = db.getImageLabels(image.id());
+                            if (labels != null && !labels.isEmpty()) {
                                 detected++;
 
                                 // Send notification for interesting detections
                                 if (labels.size() >= 3) {
                                     notifier.sendNotification(
                                         String.format("Lot %d: Detected %s",
-                                            imageRecord.lotId(),
+                                            image.lotId(),
                                             String.join(", ", labels)),
                                         "Objects Detected",
                                         0
                                     );
                                 }
                             }
                         }
 
-                        // Rate limiting
-                        Thread.sleep(500);
+                        // Rate limiting (lighter since no network I/O)
+                        Thread.sleep(100);
 
                     } catch (Exception e) {
                         log.info("   ⚠️ Failed to process image: {}", e.getMessage());