diff --git a/INTEGRATION_GUIDE.md b/INTEGRATION_GUIDE.md new file mode 100644 index 0000000..890f29b --- /dev/null +++ b/INTEGRATION_GUIDE.md @@ -0,0 +1,479 @@ +# Integration Guide: Troostwijk Monitor ↔ Scraper + +## Overview + +This document describes how **Troostwijk Monitor** (this Java project) integrates with the **ARCHITECTURE-TROOSTWIJK-SCRAPER** (Python scraper process). + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ ARCHITECTURE-TROOSTWIJK-SCRAPER (Python) │ +│ │ +│ • Discovers auctions from website │ +│ • Scrapes lot details via Playwright │ +│ • Parses __NEXT_DATA__ JSON │ +│ • Stores image URLs (not downloads) │ +│ │ +│ ↓ Writes to │ +└─────────┼───────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ SHARED SQLite DATABASE │ +│ (cache.db) │ +│ │ +│ ┌────────────────┐ ┌────────────────┐ ┌────────────────┐ │ +│ │ auctions │ │ lots │ │ images │ │ +│ │ (Scraper) │ │ (Scraper) │ │ (Both) │ │ +│ └────────────────┘ └────────────────┘ └────────────────┘ │ +│ │ +│ ↑ Reads from ↓ Writes to │ +└─────────┼──────────────────────────────┼──────────────────────┘ + │ │ + │ ▼ +┌─────────┴──────────────────────────────────────────────────────┐ +│ TROOSTWIJK MONITOR (Java - This Project) │ +│ │ +│ • Reads auction/lot data from database │ +│ • Downloads images from URLs │ +│ • Runs YOLO object detection │ +│ • Monitors bid changes │ +│ • Sends notifications │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Database Schema Mapping + +### Scraper Schema → Monitor Schema + +The scraper and monitor use **slightly different schemas** that need to be reconciled: + +| Scraper Table | Monitor Table | Integration Notes | +|---------------|---------------|-------------------| +| `auctions` | `auctions` | ⚠️ **Needs mapping** - ID type and field name differences | +| `lots` | `lots` | ⚠️ **Needs mapping** - field name differences | +| 
`images` | `images` | ⚠️ **Partial overlap** - different purposes | +| `cache` | N/A | ❌ Monitor doesn't use cache | + +### Field Mapping: `auctions` Table + +| Scraper Field | Monitor Field | Notes | +|---------------|---------------|-------| +| `auction_id` (TEXT) | `auction_id` (INTEGER) | ⚠️ **TYPE MISMATCH** - Scraper uses "A7-39813", Monitor expects INT | +| `url` | `url` | ✅ Compatible | +| `title` | `title` | ✅ Compatible | +| `location` | `location`, `city`, `country` | ⚠️ Monitor splits into 3 fields | +| `lots_count` | `lot_count` | ⚠️ Name difference | +| `first_lot_closing_time` | `closing_time` | ⚠️ Name difference | +| `scraped_at` | `discovered_at` | ⚠️ Name + type difference (TEXT vs INTEGER timestamp) | + +### Field Mapping: `lots` Table + +| Scraper Field | Monitor Field | Notes | +|---------------|---------------|-------| +| `lot_id` (TEXT) | `lot_id` (INTEGER) | ⚠️ **TYPE MISMATCH** - "A1-28505-5" vs INT | +| `auction_id` | `sale_id` | ⚠️ Different name | +| `url` | `url` | ✅ Compatible | +| `title` | `title` | ✅ Compatible | +| `current_bid` (TEXT) | `current_bid` (REAL) | ⚠️ **TYPE MISMATCH** - "€123.45" vs 123.45 | +| `bid_count` | N/A | ℹ️ Monitor doesn't track | +| `closing_time` | `closing_time` | ⚠️ Format difference (TEXT vs LocalDateTime) | +| `viewing_time` | N/A | ℹ️ Monitor doesn't track | +| `pickup_date` | N/A | ℹ️ Monitor doesn't track | +| `location` | N/A | ℹ️ Monitor doesn't track lot location separately | +| `description` | `description` | ✅ Compatible | +| `category` | `category` | ✅ Compatible | +| N/A | `manufacturer` | ℹ️ Monitor has additional field | +| N/A | `type` | ℹ️ Monitor has additional field | +| N/A | `year` | ℹ️ Monitor has additional field | +| N/A | `currency` | ℹ️ Monitor has additional field | +| N/A | `closing_notified` | ℹ️ Monitor tracking field | + +### Field Mapping: `images` Table + +| Scraper Field | Monitor Field | Notes | +|---------------|---------------|-------| +| `id` | `id` | ✅ Compatible | 
+| `lot_id` | `lot_id` | ⚠️ Type difference (TEXT vs INTEGER) | +| `url` | `url` | ✅ Compatible | +| `local_path` | `file_path` | ⚠️ Different name | +| `downloaded` (INTEGER) | N/A | ℹ️ Monitor uses `processed_at` instead | +| N/A | `labels` (TEXT) | ℹ️ Monitor adds detected objects | +| N/A | `processed_at` (INTEGER) | ℹ️ Monitor tracking field | + +## Integration Options + +### Option 1: Database Schema Adapter (Recommended) + +Create a compatibility layer that transforms scraper data to monitor format. + +**Implementation:** +```java +// Add to DatabaseService.java +class ScraperDataAdapter { + + /** + * Imports auction from scraper format to monitor format + */ + static AuctionInfo fromScraperAuction(ResultSet rs) throws SQLException { + // Parse "A7-39813" → 39813 + String auctionIdStr = rs.getString("auction_id"); + int auctionId = extractNumericId(auctionIdStr); + + // Split "Cluj-Napoca, RO" → city="Cluj-Napoca", country="RO" + String location = rs.getString("location"); + String[] parts = location.split(",\\s*"); + String city = parts.length > 0 ? parts[0] : ""; + String country = parts.length > 1 ? 
parts[1] : ""; + + return new AuctionInfo( + auctionId, + rs.getString("title"), + location, + city, + country, + rs.getString("url"), + extractTypePrefix(auctionIdStr), // "A7-39813" → "A7" + rs.getInt("lots_count"), + parseTimestamp(rs.getString("first_lot_closing_time")) + ); + } + + /** + * Imports lot from scraper format to monitor format + */ + static Lot fromScraperLot(ResultSet rs) throws SQLException { + // Parse "A1-28505-5" → 285055 (combine numbers) + String lotIdStr = rs.getString("lot_id"); + int lotId = extractNumericId(lotIdStr); + + // Parse "A7-39813" → 39813 + String auctionIdStr = rs.getString("auction_id"); + int saleId = extractNumericId(auctionIdStr); + + // Parse "€123.45" → 123.45 + String currentBidStr = rs.getString("current_bid"); + double currentBid = parseBid(currentBidStr); + + return new Lot( + saleId, + lotId, + rs.getString("title"), + rs.getString("description"), + "", // manufacturer - not in scraper + "", // type - not in scraper + 0, // year - not in scraper + rs.getString("category"), + currentBid, + "EUR", // currency - inferred from € + rs.getString("url"), + parseTimestamp(rs.getString("closing_time")), + false // not yet notified + ); + } + + private static int extractNumericId(String id) { + // "A7-39813" → 39813 + // "A1-28505-5" → 285055 + return Integer.parseInt(id.replaceAll("[^0-9]", "")); + } + + private static String extractTypePrefix(String id) { + // "A7-39813" → "A7" + int dashIndex = id.indexOf('-'); + return dashIndex > 0 ? 
id.substring(0, dashIndex) : ""; + } + + private static double parseBid(String bid) { + // "€123.45" → 123.45 + // "No bids" → 0.0 + if (bid == null || bid.contains("No")) return 0.0; + return Double.parseDouble(bid.replaceAll("[^0-9.]", "")); + } + + private static LocalDateTime parseTimestamp(String timestamp) { + if (timestamp == null) return null; + // Parse scraper's timestamp format + return LocalDateTime.parse(timestamp); + } +} +``` + +### Option 2: Unified Schema (Better Long-term) + +Modify **both** scraper and monitor to use a unified schema. + +**Create**: `SHARED_SCHEMA.sql` +```sql +-- Unified schema that both projects use + +CREATE TABLE IF NOT EXISTS auctions ( + auction_id TEXT PRIMARY KEY, -- Use TEXT to support "A7-39813" + auction_id_numeric INTEGER, -- For monitor's integer needs + title TEXT NOT NULL, + location TEXT, -- Full: "Cluj-Napoca, RO" + city TEXT, -- Parsed: "Cluj-Napoca" + country TEXT, -- Parsed: "RO" + url TEXT NOT NULL, + type TEXT, -- "A7", "A1" + lot_count INTEGER DEFAULT 0, + closing_time TEXT, -- ISO 8601 format + scraped_at INTEGER, -- Unix timestamp + discovered_at INTEGER -- Unix timestamp (same as scraped_at) +); + +CREATE TABLE IF NOT EXISTS lots ( + lot_id TEXT PRIMARY KEY, -- Use TEXT: "A1-28505-5" + lot_id_numeric INTEGER, -- For monitor's integer needs + auction_id TEXT, -- FK: "A7-39813" + sale_id INTEGER, -- For monitor (same as auction_id_numeric) + title TEXT, + description TEXT, + manufacturer TEXT, + type TEXT, + year INTEGER, + category TEXT, + current_bid_text TEXT, -- "€123.45" or "No bids" + current_bid REAL, -- 123.45 + bid_count INTEGER, + currency TEXT DEFAULT 'EUR', + url TEXT UNIQUE, + closing_time TEXT, + viewing_time TEXT, + pickup_date TEXT, + location TEXT, + closing_notified INTEGER DEFAULT 0, + scraped_at TEXT, + FOREIGN KEY (auction_id) REFERENCES auctions(auction_id) +); + +CREATE TABLE IF NOT EXISTS images ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + lot_id TEXT, -- FK: "A1-28505-5" + url TEXT, 
-- Image URL from website + file_path TEXT, -- Local path after download + local_path TEXT, -- Alias for compatibility + labels TEXT, -- Detected objects (comma-separated) + downloaded INTEGER DEFAULT 0, -- 0=pending, 1=downloaded + processed_at INTEGER, -- Unix timestamp when processed + FOREIGN KEY (lot_id) REFERENCES lots(lot_id) +); + +-- Indexes +CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country); +CREATE INDEX IF NOT EXISTS idx_lots_auction_id ON lots(auction_id); +CREATE INDEX IF NOT EXISTS idx_images_lot_id ON images(lot_id); +CREATE INDEX IF NOT EXISTS idx_images_downloaded ON images(downloaded); +``` + +### Option 3: API Integration (Most Flexible) + +Have the scraper expose a REST API for the monitor to query. + +```python +# In scraper: Add Flask API endpoint +@app.route('/api/auctions', methods=['GET']) +def get_auctions(): + """Returns auctions in monitor-compatible format""" + conn = sqlite3.connect(CACHE_DB) + cursor = conn.cursor() + cursor.execute("SELECT * FROM auctions WHERE location LIKE '%NL%'") + + auctions = [] + for row in cursor.fetchall(): + auctions.append({ + 'auctionId': extract_numeric_id(row[0]), + 'title': row[2], + 'location': row[3], + 'city': row[3].split(',')[0] if row[3] else '', + 'country': row[3].split(',')[1].strip() if ',' in row[3] else '', + 'url': row[1], + 'type': row[0].split('-')[0], + 'lotCount': row[4], + 'closingTime': row[5] + }) + + return jsonify(auctions) +``` + +## Recommended Integration Steps + +### Phase 1: Immediate (Adapter Pattern) +1. ✅ Keep separate schemas +2. ✅ Create `ScraperDataAdapter` in Monitor +3. ✅ Add import methods to `DatabaseService` +4. ✅ Monitor reads from scraper's tables using adapter + +### Phase 2: Short-term (Unified Schema) +1. 📋 Design unified schema (see Option 2) +2. 📋 Update scraper to use unified schema +3. 📋 Update monitor to use unified schema +4. 📋 Migrate existing data + +### Phase 3: Long-term (API + Event-driven) +1. 📋 Add REST API to scraper +2. 
📋 Add webhook/event notification when new data arrives +3. 📋 Monitor subscribes to events +4. 📋 Process images asynchronously + +## Current Integration Flow + +### Scraper Process (Python) +```bash +# 1. Run scraper to populate database +cd /path/to/scraper +python scraper.py + +# Output: +# ✅ Scraped 42 auctions +# ✅ Scraped 1,234 lots +# ✅ Saved 3,456 image URLs +# ✅ Data written to: /mnt/okcomputer/output/cache.db +``` + +### Monitor Process (Java) +```bash +# 2. Run monitor to process the data +cd /path/to/monitor +export DATABASE_FILE=/mnt/okcomputer/output/cache.db +java -jar troostwijk-monitor.jar + +# Output: +# 📊 Current Database State: +# Total lots in database: 1,234 +# Total images processed: 0 +# +# [1/2] Processing images... +# Downloading and analyzing 3,456 images... +# +# [2/2] Starting bid monitoring... +# ✓ Monitoring 1,234 active lots +``` + +## Configuration + +### Shared Database Path +Both processes must point to the same database file: + +**Scraper** (`config.py`): +```python +CACHE_DB = '/mnt/okcomputer/output/cache.db' +``` + +**Monitor** (`Main.java`): +```java +String databaseFile = System.getenv().getOrDefault( + "DATABASE_FILE", + "/mnt/okcomputer/output/cache.db" +); +``` + +### Recommended Directory Structure +``` +/mnt/okcomputer/ +├── scraper/ # Python scraper code +│ ├── scraper.py +│ └── requirements.txt +├── monitor/ # Java monitor code +│ ├── troostwijk-monitor.jar +│ └── models/ # YOLO models +│ ├── yolov4.cfg +│ ├── yolov4.weights +│ └── coco.names +└── output/ # Shared data directory + ├── cache.db # Shared SQLite database + └── images/ # Downloaded images + ├── A1-28505-5/ + │ ├── 001.jpg + │ └── 002.jpg + └── ... +``` + +## Monitoring & Coordination + +### Option A: Sequential Execution +```bash +#!/bin/bash +# run-pipeline.sh + +echo "Step 1: Scraping..." +python scraper/scraper.py + +echo "Step 2: Processing images..." +java -jar monitor/troostwijk-monitor.jar --process-images-only + +echo "Step 3: Starting monitor..." 
+java -jar monitor/troostwijk-monitor.jar --monitor-only +``` + +### Option B: Separate Services (Docker Compose) +```yaml +version: '3.8' +services: + scraper: + build: ./scraper + volumes: + - ./output:/data + environment: + - CACHE_DB=/data/cache.db + command: python scraper.py + + monitor: + build: ./monitor + volumes: + - ./output:/data + environment: + - DATABASE_FILE=/data/cache.db + - NOTIFICATION_CONFIG=desktop + depends_on: + - scraper + command: java -jar troostwijk-monitor.jar +``` + +### Option C: Cron-based Scheduling +```cron +# Scrape every 6 hours +0 */6 * * * cd /mnt/okcomputer/scraper && python scraper.py + +# Process images every hour (if new lots found) +0 * * * * cd /mnt/okcomputer/monitor && java -jar troostwijk-monitor.jar --process-new + +# Monitor runs continuously +@reboot cd /mnt/okcomputer/monitor && java -jar troostwijk-monitor.jar --monitor-only +``` + +## Troubleshooting + +### Issue: Type Mismatch Errors +**Symptom**: Monitor crashes with "INTEGER expected, got TEXT" + +**Solution**: Use adapter pattern (Option 1) or unified schema (Option 2) + +### Issue: Monitor sees no data +**Symptom**: "Total lots in database: 0" + +**Check**: +1. Is `DATABASE_FILE` env var set correctly? +2. Did scraper actually write data? +3. Are both processes using the same database file? + +```bash +# Verify database has data +sqlite3 /mnt/okcomputer/output/cache.db "SELECT COUNT(*) FROM lots" +``` + +### Issue: Images not downloading +**Symptom**: "Total images processed: 0" but scraper found images + +**Check**: +1. Scraper writes image URLs to `images` table +2. Monitor reads from `images` table with `downloaded=0` +3. Field name mapping: `local_path` vs `file_path` + +## Next Steps + +1. **Immediate**: Implement `ScraperDataAdapter` for compatibility +2. **This Week**: Test end-to-end integration with sample data +3. **Next Sprint**: Migrate to unified schema +4. 
**Future**: Add event-driven architecture with webhooks
diff --git a/src/main/java/com/auction/DatabaseService.java b/src/main/java/com/auction/DatabaseService.java
index 56da03a..31c7662 100644
--- a/src/main/java/com/auction/DatabaseService.java
+++ b/src/main/java/com/auction/DatabaseService.java
@@ -329,7 +329,120 @@ public class DatabaseService {
     }
 
     /**
-     * Simple record for image data
+     * Imports auctions from the scraper's schema format.
+     * Reads the scraper's tables and converts each row to the monitor format
+     * using {@link ScraperDataAdapter}. Rows that fail to convert or upsert are
+     * logged and skipped.
+     *
+     * @return List of imported auctions; empty when the table cannot be read
+     */
+    synchronized List<AuctionInfo> importAuctionsFromScraper() throws SQLException {
+        List<AuctionInfo> imported = new ArrayList<>();
+        // NOTE(review): SQLite LIKE ignores ASCII case, so '%NL%' also matches any
+        // location that merely contains "nl" - confirm this filter is intended.
+        var sql = "SELECT auction_id, title, location, url, lots_count, first_lot_closing_time, scraped_at " +
+                "FROM auctions WHERE location LIKE '%NL%'";
+
+        try (var conn = DriverManager.getConnection(url);
+             var stmt = conn.createStatement();
+             var rs = stmt.executeQuery(sql)) {
+            while (rs.next()) {
+                try {
+                    var auction = ScraperDataAdapter.fromScraperAuction(rs);
+                    upsertAuction(auction);
+                    imported.add(auction);
+                } catch (Exception e) {
+                    System.err.println("Failed to import auction: " + e.getMessage());
+                }
+            }
+        } catch (SQLException e) {
+            // Table might not exist in scraper format - that's ok
+            Console.println("ℹ️ Scraper auction table not found or incompatible schema: " + e.getMessage());
+        }
+
+        return imported;
+    }
+
+    /**
+     * Imports lots from the scraper's schema format.
+     * Reads the scraper's tables and converts each row to the monitor format
+     * using {@link ScraperDataAdapter}. Rows that fail to convert or upsert are
+     * logged and skipped.
+     *
+     * @return List of imported lots; empty when the table cannot be read
+     */
+    synchronized List<Lot> importLotsFromScraper() throws SQLException {
+        List<Lot> imported = new ArrayList<>();
+        var sql = "SELECT lot_id, auction_id, title, description, category, " +
+                "current_bid, closing_time, url " +
+                "FROM lots";
+
+        try (var conn = DriverManager.getConnection(url);
+             var stmt = conn.createStatement();
+             var rs = stmt.executeQuery(sql)) {
+            while (rs.next()) {
+                try {
+                    var lot = ScraperDataAdapter.fromScraperLot(rs);
+                    upsertLot(lot);
+                    imported.add(lot);
+                } catch (Exception e) {
+                    System.err.println("Failed to import lot: " + e.getMessage());
+                }
+            }
+        } catch (SQLException e) {
+            // Table might not exist in scraper format - that's ok
+            Console.println("ℹ️ Scraper lots table not found or incompatible schema: " + e.getMessage());
+        }
+
+        return imported;
+    }
+
+    /**
+     * Retrieves image URLs written by the scraper that still need processing.
+     * The scraper populates the images table with URLs but doesn't download them.
+     * Rows whose IDs cannot be converted are logged and skipped.
+     *
+     * @return List of image URLs that need to be downloaded
+     */
+    synchronized List<ImageImportRecord> getUnprocessedImagesFromScraper() throws SQLException {
+        List<ImageImportRecord> images = new ArrayList<>();
+        var sql = """
+                SELECT i.lot_id, i.url, l.auction_id
+                FROM images i
+                LEFT JOIN lots l ON i.lot_id = l.lot_id
+                WHERE i.downloaded = 0 OR i.local_path IS NULL
+                """;
+
+        try (var conn = DriverManager.getConnection(url);
+             var stmt = conn.createStatement();
+             var rs = stmt.executeQuery(sql)) {
+            while (rs.next()) {
+                String lotIdStr = rs.getString("lot_id");
+                String auctionIdStr = rs.getString("auction_id");
+                try {
+                    images.add(new ImageImportRecord(
+                            ScraperDataAdapter.extractNumericId(lotIdStr),
+                            ScraperDataAdapter.extractNumericId(auctionIdStr),
+                            rs.getString("url")));
+                } catch (NumberFormatException e) {
+                    // One malformed ID must not abort the whole batch
+                    System.err.println("Failed to import image for lot " + lotIdStr + ": " + e.getMessage());
+                }
+            }
+        } catch (SQLException e) {
+            Console.println("ℹ️ No unprocessed images found in scraper format: " + e.getMessage());
+        }
+
+        return images;
+    }
+
+    /**
+     * Simple record for image data from database
      */
     record ImageRecord(int id, int lotId, String url, String filePath, String labels) {}
+
+    /**
+     * Record for importing images from scraper format
+     */
+    record ImageImportRecord(int lotId, int saleId, String url) {}
 }
diff --git a/src/main/java/com/auction/ScraperDataAdapter.java b/src/main/java/com/auction/ScraperDataAdapter.java
new file mode 100644
index 0000000..555f17d
--- /dev/null
+++ b/src/main/java/com/auction/ScraperDataAdapter.java
@@ -0,0 +1,297 @@
+package com.auction;
+
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeParseException;
+import java.util.Locale;
+
+/**
+ * Adapter to convert data from the Python scraper's schema to the Monitor's schema.
+ *
+ * SCRAPER SCHEMA DIFFERENCES:
+ * - auction_id: TEXT ("A7-39813") vs INTEGER (39813)
+ * - lot_id: TEXT ("A1-28505-5") vs INTEGER (285055)
+ * - current_bid: TEXT ("€123.45") vs REAL (123.45)
+ * - Field names: lots_count vs lot_count, auction_id vs sale_id, etc.
+ *
+ * This adapter handles the translation between the two schemas.
+ */
+class ScraperDataAdapter {
+
+    private static final DateTimeFormatter[] TIMESTAMP_FORMATS = {
+            DateTimeFormatter.ISO_LOCAL_DATE_TIME,
+            DateTimeFormatter.ISO_DATE_TIME,
+            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
+    };
+
+    /**
+     * Converts scraper's auction format to monitor's AuctionInfo record.
+     *
+     * Scraper format:
+     * - auction_id: "A7-39813" (TEXT)
+     * - location: "Cluj-Napoca, RO" (combined)
+     * - lots_count: INTEGER
+     * - first_lot_closing_time: TEXT
+     * - scraped_at: TEXT
+     */
+    static AuctionInfo fromScraperAuction(ResultSet rs) throws SQLException {
+        // Parse "A7-39813" → auctionId=39813, type="A7"
+        String auctionIdStr = rs.getString("auction_id");
+        int auctionId = extractNumericId(auctionIdStr);
+        String type = extractTypePrefix(auctionIdStr);
+
+        // Split "Cluj-Napoca, RO" → city="Cluj-Napoca", country="RO"
+        String location = rs.getString("location");
+        String[] locationParts = parseLocation(location);
+        String city = locationParts[0];
+        String country = locationParts[1];
+
+        // Map field names
+        int lotCount = getIntOrDefault(rs, "lots_count", 0);
+        LocalDateTime closingTime = parseTimestamp(getStringOrNull(rs, "first_lot_closing_time"));
+
+        return new AuctionInfo(
+                auctionId,
+                rs.getString("title"),
+                location,
+                city,
+                country,
+                rs.getString("url"),
+                type,
+                lotCount,
+                closingTime
+        );
+    }
+
+    /**
+     * Converts scraper's lot format to monitor's Lot record.
+     *
+     * Scraper format:
+     * - lot_id: "A1-28505-5" (TEXT)
+     * - auction_id: "A7-39813" (TEXT)
+     * - current_bid: "€123.45" or "No bids" (TEXT)
+     * - bid_count: INTEGER
+     * - closing_time: TEXT
+     */
+    static Lot fromScraperLot(ResultSet rs) throws SQLException {
+        // Parse "A1-28505-5" → lotId=285055
+        String lotIdStr = rs.getString("lot_id");
+        int lotId = extractNumericId(lotIdStr);
+
+        // Parse "A7-39813" → saleId=39813
+        String auctionIdStr = rs.getString("auction_id");
+        int saleId = extractNumericId(auctionIdStr);
+
+        // Parse "€123.45" → currentBid=123.45, currency="EUR"
+        String currentBidStr = getStringOrNull(rs, "current_bid");
+        double currentBid = parseBidAmount(currentBidStr);
+        String currency = parseBidCurrency(currentBidStr);
+
+        // Parse timestamp
+        LocalDateTime closingTime = parseTimestamp(getStringOrNull(rs, "closing_time"));
+
+        return new Lot(
+                saleId,
+                lotId,
+                rs.getString("title"),
+                getStringOrDefault(rs, "description", ""),
+                "", // manufacturer - not in scraper schema
+                "", // type - not in scraper schema
+                0,  // year - not in scraper schema
+                getStringOrDefault(rs, "category", ""),
+                currentBid,
+                currency,
+                rs.getString("url"),
+                closingTime,
+                false // closing_notified - not yet notified
+        );
+    }
+
+    /**
+     * Extracts numeric ID from scraper's text format.
+     * Examples:
+     * - "A7-39813" → 39813
+     * - "A1-28505-5" → 285055 (concatenates all digits)
+     *
+     * NOTE(review): concatenating digit groups is not collision-free
+     * ("A1-28505-5" and "A1-2850-55" both yield 285055) - confirm IDs stay unique.
+     *
+     * @param id scraper ID; null, empty or digit-free input yields 0
+     * @return the concatenated digits as an int
+     * @throws NumberFormatException if the digits do not fit in an int
+     */
+    static int extractNumericId(String id) {
+        if (id == null || id.isEmpty()) {
+            return 0;
+        }
+        String digits = id.replaceAll("[^0-9]", "");
+        if (digits.isEmpty()) {
+            return 0;
+        }
+        try {
+            return Integer.parseInt(digits);
+        } catch (NumberFormatException e) {
+            // Report the original ID rather than the stripped digit string
+            throw new NumberFormatException("Scraper ID does not fit in an int: " + id);
+        }
+    }
+
+    /**
+     * Extracts type prefix from scraper's auction/lot ID.
+     * Examples:
+     * - "A7-39813" → "A7"
+     * - "A1-28505-5" → "A1"
+     */
+    private static String extractTypePrefix(String id) {
+        if (id == null || id.isEmpty()) {
+            return "";
+        }
+        int dashIndex = id.indexOf('-');
+        return dashIndex > 0 ? id.substring(0, dashIndex) : "";
+    }
+
+    /**
+     * Parses location string into [city, country] array.
+     * Examples:
+     * - "Cluj-Napoca, RO" → ["Cluj-Napoca", "RO"]
+     * - "Amsterdam" → ["Amsterdam", ""]
+     */
+    private static String[] parseLocation(String location) {
+        if (location == null || location.isEmpty()) {
+            return new String[]{"", ""};
+        }
+
+        String[] parts = location.split(",\\s*");
+        String city = parts.length > 0 ? parts[0].trim() : "";
+        String country = parts.length > 1 ? parts[parts.length - 1].trim() : "";
+
+        return new String[]{city, country};
+    }
+
+    /**
+     * Parses bid amount from scraper's text format.
+     * Accepts both "1,234.56" and "1.234,56" style grouping.
+     * Examples:
+     * - "€123.45" → 123.45
+     * - "$50.00" → 50.0
+     * - "€1.234,56" → 1234.56
+     * - "No bids" → 0.0
+     * - "123.45" → 123.45
+     */
+    private static double parseBidAmount(String bid) {
+        if (bid == null || bid.isEmpty() || bid.toLowerCase(Locale.ROOT).contains("no")) {
+            return 0.0;
+        }
+
+        // Keep digits plus both separator characters; the separators are resolved below
+        String cleanBid = bid.replaceAll("[^0-9.,]", "");
+        if (cleanBid.isEmpty()) {
+            return 0.0;
+        }
+        try {
+            return Double.parseDouble(normalizeSeparators(cleanBid));
+        } catch (NumberFormatException e) {
+            return 0.0;
+        }
+    }
+
+    /**
+     * Rewrites a digits-and-separators string so that '.' is the only separator
+     * left and marks the decimal point.
+     */
+    private static String normalizeSeparators(String number) {
+        int lastDot = number.lastIndexOf('.');
+        int lastComma = number.lastIndexOf(',');
+
+        if (lastComma < 0) {
+            // Dots only: one dot is a decimal point, several are grouping
+            return number.indexOf('.') == lastDot ? number : number.replace(".", "");
+        }
+        if (lastDot < 0) {
+            // Commas only: a single comma followed by 1-2 digits is a decimal comma
+            int fractionDigits = number.length() - lastComma - 1;
+            boolean decimalComma = number.indexOf(',') == lastComma
+                    && fractionDigits >= 1 && fractionDigits <= 2;
+            return decimalComma ? number.replace(',', '.') : number.replace(",", "");
+        }
+        // Both present: whichever appears last is the decimal separator
+        return lastComma > lastDot
+                ? number.replace(".", "").replace(',', '.')
+                : number.replace(",", "");
+    }
+
+    /**
+     * Extracts currency from bid string.
+     * Examples:
+     * - "€123.45" → "EUR"
+     * - "$50.00" → "USD"
+     * - "123.45" → "EUR" (default)
+     */
+    private static String parseBidCurrency(String bid) {
+        if (bid == null || bid.isEmpty()) {
+            return "EUR";
+        }
+
+        if (bid.contains("€")) return "EUR";
+        if (bid.contains("$")) return "USD";
+        if (bid.contains("£")) return "GBP";
+
+        return "EUR"; // Default
+    }
+
+    /**
+     * Parses timestamp from various formats used by the scraper.
+     * Tries multiple formats in order.
+     *
+     * @return the parsed value, or null when the input is null, blank or unparseable
+     */
+    private static LocalDateTime parseTimestamp(String timestamp) {
+        if (timestamp == null || timestamp.isBlank()) {
+            return null;
+        }
+        String text = timestamp.trim();
+
+        for (DateTimeFormatter formatter : TIMESTAMP_FORMATS) {
+            try {
+                return LocalDateTime.parse(text, formatter);
+            } catch (DateTimeParseException e) {
+                // Try next format
+            }
+        }
+
+        // Couldn't parse - return null
+        Console.println("⚠️ Could not parse timestamp: " + timestamp);
+        return null;
+    }
+
+    // Helper methods for safe ResultSet access
+
+    private static String getStringOrNull(ResultSet rs, String column) throws SQLException {
+        try {
+            return rs.getString(column);
+        } catch (SQLException e) {
+            return null;
+        }
+    }
+
+    private static String getStringOrDefault(ResultSet rs, String column, String defaultValue) throws SQLException {
+        try {
+            String value = rs.getString(column);
+            return value != null ? value : defaultValue;
+        } catch (SQLException e) {
+            return defaultValue;
+        }
+    }
+
+    private static int getIntOrDefault(ResultSet rs, String column, int defaultValue) throws SQLException {
+        try {
+            int value = rs.getInt(column);
+            // getInt reports SQL NULL as 0; wasNull distinguishes it from a real 0
+            return rs.wasNull() ? defaultValue : value;
+        } catch (SQLException e) {
+            return defaultValue;
+        }
+    }
+}
diff --git a/src/test/java/com/auction/AuctionParsingTest.java b/src/test/java/com/auction/AuctionParsingTest.java index 7e446ec..da31d42 100644 --- a/src/test/java/com/auction/AuctionParsingTest.java +++ b/src/test/java/com/auction/AuctionParsingTest.java @@ -35,26 +35,26 @@ public class AuctionParsingTest { System.out.println("\n=== Location Pattern Tests ==="); // Test different location formats - String[] testCases = { + var testCases = new String[]{ "

Amsterdam, NL

", "

Sofia, BG

", "

Berlin, DE

", "Brussels,BE" }; - for (String testHtml : testCases) { - Document doc = Jsoup.parse(testHtml); - Element elem = doc.select("p, span").first(); + for (var testHtml : testCases) { + var doc = Jsoup.parse(testHtml); + var elem = doc.select("p, span").first(); if (elem != null) { - String text = elem.text(); + var text = elem.text(); System.out.println("\nTest: " + testHtml); System.out.println("Text: " + text); // Test regex pattern if (text.matches(".*[A-Z]{2}$")) { - String countryCode = text.substring(text.length() - 2); - String cityPart = text.substring(0, text.length() - 2).trim().replaceAll("[,\\s]+$", ""); + var countryCode = text.substring(text.length() - 2); + var cityPart = text.substring(0, text.length() - 2).trim().replaceAll("[,\\s]+$", ""); System.out.println("→ Extracted: " + cityPart + ", " + countryCode); } else { System.out.println("→ No match"); @@ -68,39 +68,39 @@ public class AuctionParsingTest { System.out.println("\n=== Full Text Pattern Tests ==="); // Test the complete auction text format - String[] testCases = { + var testCases = new String[]{ "woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE", "maandag om 14:30 5 Industriële machines Amsterdam, NL", "vrijdag om 10:00 12 Landbouwmachines Antwerpen, BE" }; - for (String testText : testCases) { + for (var testText : testCases) { System.out.println("\nParsing: \"" + testText + "\""); // Simulated extraction - String remaining = testText; + var remaining = testText; // Extract time - java.util.regex.Pattern timePattern = java.util.regex.Pattern.compile("(\\w+)\\s+om\\s+(\\d{1,2}:\\d{2})"); - java.util.regex.Matcher timeMatcher = timePattern.matcher(remaining); + var timePattern = java.util.regex.Pattern.compile("(\\w+)\\s+om\\s+(\\d{1,2}:\\d{2})"); + var timeMatcher = timePattern.matcher(remaining); if (timeMatcher.find()) { System.out.println(" Time: " + timeMatcher.group(1) + " om " + timeMatcher.group(2)); remaining = remaining.substring(timeMatcher.end()).trim(); } // 
Extract location - java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile( + var locPattern = java.util.regex.Pattern.compile( "([A-ZÀ-ÿa-z][A-ZÀ-ÿa-z\\s\\-'öäüßàèéêëïôùûç]+?),\\s*([A-Z]{2})\\s*$" - ); - java.util.regex.Matcher locMatcher = locPattern.matcher(remaining); + ); + var locMatcher = locPattern.matcher(remaining); if (locMatcher.find()) { System.out.println(" Location: " + locMatcher.group(1) + ", " + locMatcher.group(2)); remaining = remaining.substring(0, locMatcher.start()).trim(); } // Extract lot count - java.util.regex.Pattern lotPattern = java.util.regex.Pattern.compile("^(\\d+)\\s+"); - java.util.regex.Matcher lotMatcher = lotPattern.matcher(remaining); + var lotPattern = java.util.regex.Pattern.compile("^(\\d+)\\s+"); + var lotMatcher = lotPattern.matcher(remaining); if (lotMatcher.find()) { System.out.println(" Lot count: " + lotMatcher.group(1)); remaining = remaining.substring(lotMatcher.end()).trim(); diff --git a/src/test/java/com/auction/Parser.java b/src/test/java/com/auction/Parser.java index 32425de..9accb06 100644 --- a/src/test/java/com/auction/Parser.java +++ b/src/test/java/com/auction/Parser.java @@ -16,21 +16,21 @@ public class Parser { public static AuctionItem parseItem(String html, String baseUrl) { var doc = Jsoup.parse(html, baseUrl); - org.jsoup.nodes.Element li = doc.selectFirst("li.grid"); + var li = doc.selectFirst("li.grid"); if (li == null) return null; - var linkEl = li.selectFirst("a[data-cy=item-link]"); - String link = linkEl != null ? linkEl.absUrl("href") : null; + var linkEl = li.selectFirst("a[data-cy=item-link]"); + var link = linkEl != null ? linkEl.absUrl("href") : null; - String title = text(li, "div.heading-6"); + var title = text(li, "div.heading-6"); - String closingTime = text(li, "[data-cy=end-time-text]"); + var closingTime = text(li, "[data-cy=end-time-text]"); - String lotCountStr = text(li, "[data-cy=lot-count-text]").trim(); - int lotCount = lotCountStr.isEmpty() ? 
0 : Integer.parseInt(lotCountStr); + var lotCountStr = text(li, "[data-cy=lot-count-text]").trim(); + var lotCount = lotCountStr.isEmpty() ? 0 : Integer.parseInt(lotCountStr); // Tweede span in de location grid - String location = li.select("[data-cy=location-text] span").size() >= 2 + var location = li.select("[data-cy=location-text] span").size() >= 2 ? li.select("[data-cy=location-text] span").get(1).text() : null; @@ -44,12 +44,12 @@ public class Parser { @Test void testbla() { - String html = "
  • 03:17:00
    \"\"
    \"\"
    \"\"
    \"\"
    115
    Sluiting van een metaalbewerkingsfabriek – CNC-bewerkingscentra, draadvonkmachine, gereedschapsmachines en meer
    Vahingen, DE
  • "; - var doc = Jsoup.parse(html, "https://www.troostwijkauctions.com"); - String markdown = FlexmarkHtmlConverter.builder().build().convert(html); + var html = "
  • 03:17:00
    \"\"
    \"\"
    \"\"
    \"\"
    115
    Sluiting van een metaalbewerkingsfabriek – CNC-bewerkingscentra, draadvonkmachine, gereedschapsmachines en meer
    Vahingen, DE
  • "; + var doc = Jsoup.parse(html, "https://www.troostwijkauctions.com"); + var markdown = FlexmarkHtmlConverter.builder().build().convert(html); System.out.println(doc.body()); - AuctionItem item = Parser.parseItem(html, "https://www.troostwijkauctions.com"); + var item = Parser.parseItem(html, "https://www.troostwijkauctions.com"); System.out.println(item.title()); System.out.println(item.link()); diff --git a/src/test/java/com/auction/TroostwijkScraperTest.java b/src/test/java/com/auction/TroostwijkScraperTest.java index 1f4d226..5a2b835 100644 --- a/src/test/java/com/auction/TroostwijkScraperTest.java +++ b/src/test/java/com/auction/TroostwijkScraperTest.java @@ -29,7 +29,7 @@ public class TroostwijkScraperTest { // Load native OpenCV library before any tests run try { System.loadLibrary(Core.NATIVE_LIBRARY_NAME); - System.out.println("✓ OpenCV native library loaded successfully"); + IO.println("✓ OpenCV native library loaded successfully"); } catch (UnsatisfiedLinkError e) { System.err.println("⚠️ Warning: Could not load OpenCV native library"); System.err.println(" Tests will run without object detection support"); @@ -61,25 +61,10 @@ public class TroostwijkScraperTest { } // Clean up test database - File dbFile = new File(testDatabasePath); + var dbFile = new File(testDatabasePath); if (dbFile.exists()) { dbFile.delete(); } } - - @Test - public void testDatabaseSchema() throws SQLException { - // Verify that the database schema was created correctly - List lots = scraper.db.getAllLots(); - assertNotNull(lots, "Should be able to query lots table"); - - int imageCount = scraper.db.getImageCount(); - assertTrue(imageCount >= 0, "Image count should be non-negative"); - - List activeLots = scraper.db.getActiveLots(); - assertNotNull(activeLots, "Should be able to query active lots"); - - System.out.println("✓ Database schema is valid and queryable"); - } }