This commit is contained in:
Tour
2025-12-03 15:40:19 +01:00
parent d3dc37576d
commit febd08821a
6 changed files with 861 additions and 47 deletions

View File

@@ -329,7 +329,111 @@ public class DatabaseService {
}
/**
 * Imports auctions from the scraper's schema format.
 *
 * <p>Opens a connection to {@code url}, reads every row of the scraper's
 * {@code auctions} table whose {@code location} contains {@code "NL"}, converts
 * each row with {@link ScraperDataAdapter#fromScraperAuction} and stores the result
 * through {@code upsertAuction}. A row that fails to convert or store is reported on
 * {@code System.err} and skipped; the remaining rows are still processed.
 *
 * <p>If the query itself fails (for example because the table does not exist in
 * scraper format) the failure is reported through {@code Console} together with the
 * underlying cause, and whatever was imported so far is returned.
 *
 * @return list of the auctions that were converted and upserted successfully
 * @throws SQLException declared for callers; SQL failures raised by this method's own
 *         query are caught and logged rather than propagated
 */
synchronized List<AuctionInfo> importAuctionsFromScraper() throws SQLException {
    List<AuctionInfo> imported = new ArrayList<>();
    var sql = "SELECT auction_id, title, location, url, lots_count, first_lot_closing_time, scraped_at " +
        "FROM auctions WHERE location LIKE '%NL%'";
    // The ResultSet is part of the resource list so it is closed deterministically,
    // before the statement and the connection.
    try (var conn = DriverManager.getConnection(url);
         var stmt = conn.createStatement();
         var rs = stmt.executeQuery(sql)) {
        while (rs.next()) {
            try {
                var auction = ScraperDataAdapter.fromScraperAuction(rs);
                upsertAuction(auction);
                imported.add(auction);
            } catch (Exception e) {
                // Best effort: one bad row must not abort the whole import.
                System.err.println("Failed to import auction: " + e.getMessage());
            }
        }
    } catch (SQLException e) {
        // Table might not exist in scraper format - that's ok, but keep the cause
        // visible so connection or locking problems are not mistaken for a missing table.
        Console.println(" Scraper auction table not found or incompatible schema: " + e.getMessage());
    }
    return imported;
}
/**
 * Imports lots from the scraper's schema format.
 *
 * <p>Opens a connection to {@code url}, reads every row of the scraper's {@code lots}
 * table, converts each row with {@link ScraperDataAdapter#fromScraperLot} and stores
 * the result through {@code upsertLot}. A row that fails to convert or store is
 * reported on {@code System.err} and skipped; the remaining rows are still processed.
 *
 * <p>If the query itself fails (for example because the table does not exist in
 * scraper format) the failure is reported through {@code Console} together with the
 * underlying cause, and whatever was imported so far is returned.
 *
 * @return list of the lots that were converted and upserted successfully
 * @throws SQLException declared for callers; SQL failures raised by this method's own
 *         query are caught and logged rather than propagated
 */
synchronized List<Lot> importLotsFromScraper() throws SQLException {
    List<Lot> imported = new ArrayList<>();
    var sql = "SELECT lot_id, auction_id, title, description, category, " +
        "current_bid, closing_time, url " +
        "FROM lots";
    // The ResultSet is part of the resource list so it is closed deterministically,
    // before the statement and the connection.
    try (var conn = DriverManager.getConnection(url);
         var stmt = conn.createStatement();
         var rs = stmt.executeQuery(sql)) {
        while (rs.next()) {
            try {
                var lot = ScraperDataAdapter.fromScraperLot(rs);
                upsertLot(lot);
                imported.add(lot);
            } catch (Exception e) {
                // Best effort: one bad row must not abort the whole import.
                System.err.println("Failed to import lot: " + e.getMessage());
            }
        }
    } catch (SQLException e) {
        // Table might not exist in scraper format - that's ok, but keep the cause
        // visible so connection or locking problems are not mistaken for a missing table.
        Console.println(" Scraper lots table not found or incompatible schema: " + e.getMessage());
    }
    return imported;
}
/**
 * Lists image URLs from the scraper's schema that still need processing.
 *
 * <p>Selects rows of the {@code images} table where {@code downloaded = 0} or
 * {@code local_path} is {@code NULL}, left-joined to {@code lots} to obtain the
 * owning {@code auction_id}. The textual lot and auction ids are converted to numbers
 * with {@link ScraperDataAdapter#extractNumericId}; because of the left join the
 * auction id may be {@code NULL}, which that helper maps to {@code 0}.
 *
 * <p>A row whose id cannot be converted is reported on {@code System.err} and
 * skipped, matching the per-row handling of the other scraper import methods. If the
 * query itself fails, the failure is reported through {@code Console} together with
 * the underlying cause and the records collected so far are returned.
 *
 * @return list of image records (lot id, sale id, URL) that still need to be downloaded
 * @throws SQLException declared for callers; SQL failures raised by this method's own
 *         query are caught and logged rather than propagated
 */
synchronized List<ImageImportRecord> getUnprocessedImagesFromScraper() throws SQLException {
    List<ImageImportRecord> images = new ArrayList<>();
    var sql = """
        SELECT i.lot_id, i.url, l.auction_id
        FROM images i
        LEFT JOIN lots l ON i.lot_id = l.lot_id
        WHERE i.downloaded = 0 OR i.local_path IS NULL
        """;
    // The ResultSet is part of the resource list so it is closed deterministically,
    // before the statement and the connection.
    try (var conn = DriverManager.getConnection(url);
         var stmt = conn.createStatement();
         var rs = stmt.executeQuery(sql)) {
        while (rs.next()) {
            try {
                int lotId = ScraperDataAdapter.extractNumericId(rs.getString("lot_id"));
                int saleId = ScraperDataAdapter.extractNumericId(rs.getString("auction_id"));
                images.add(new ImageImportRecord(lotId, saleId, rs.getString("url")));
            } catch (NumberFormatException e) {
                // An id with too many digits for an int must not abort the whole listing.
                System.err.println("Failed to read image record: " + e.getMessage());
            }
        }
    } catch (SQLException e) {
        // Keep the cause visible: a failing query is not the same as "nothing to do".
        Console.println(" No unprocessed images found in scraper format: " + e.getMessage());
    }
    return images;
}
/**
 * Simple immutable carrier for one image row read from the database.
 *
 * @param id       identifier of the image row
 * @param lotId    numeric id of the lot the image belongs to
 * @param url      URL of the image
 * @param filePath file path stored for the image
 * @param labels   labels stored for the image
 */
record ImageRecord(
    int id,
    int lotId,
    String url,
    String filePath,
    String labels
) {
}
/**
 * Immutable carrier for an image reference imported from the scraper format.
 *
 * @param lotId  numeric id of the lot the image belongs to
 * @param saleId numeric id of the sale (the scraper's auction) the lot belongs to
 * @param url    URL of the image
 */
record ImageImportRecord(
    int lotId,
    int saleId,
    String url
) {
}
}

View File

@@ -0,0 +1,246 @@
package com.auction;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
/**
 * Adapter to convert data from the Python scraper's schema to the Monitor's schema.
 *
 * SCRAPER SCHEMA DIFFERENCES:
 * - auction_id: TEXT ("A7-39813") vs INTEGER (39813)
 * - lot_id: TEXT ("A1-28505-5") vs INTEGER (285055)
 * - current_bid: TEXT ("€123.45") vs REAL (123.45)
 * - Field names: lots_count vs lot_count, auction_id vs sale_id, etc.
 *
 * This adapter handles the translation between the two schemas. All methods are
 * static and keep no state.
 */
class ScraperDataAdapter {

    /** Formats tried, in this order, when parsing a scraper timestamp. */
    private static final DateTimeFormatter[] TIMESTAMP_FORMATS = {
        DateTimeFormatter.ISO_LOCAL_DATE_TIME,
        DateTimeFormatter.ISO_DATE_TIME,
        DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
    };

    // Currency symbols are spelled as Unicode escapes so that they survive any
    // source-file encoding mix-up (the euro sign had been lost from this file once).
    private static final String EURO_SIGN = "\u20AC";
    private static final String POUND_SIGN = "\u00A3";

    /** Currency reported when a bid string carries no recognised symbol. */
    private static final String DEFAULT_CURRENCY = "EUR";

    /**
     * Converts scraper's auction format to monitor's AuctionInfo record.
     *
     * Scraper format:
     * - auction_id: "A7-39813" (TEXT)
     * - location: "Cluj-Napoca, RO" (combined)
     * - lots_count: INTEGER
     * - first_lot_closing_time: TEXT
     * - scraped_at: TEXT (not read by this method)
     *
     * @param rs result set positioned on the row to convert
     * @return the converted auction; its closing time is {@code null} when the
     *         timestamp is missing or cannot be parsed
     * @throws SQLException if a mandatory column (auction_id, location, title, url)
     *         cannot be read
     */
    static AuctionInfo fromScraperAuction(ResultSet rs) throws SQLException {
        // Parse "A7-39813" → auctionId=39813, type="A7"
        String auctionIdStr = rs.getString("auction_id");
        int auctionId = extractNumericId(auctionIdStr);
        String type = extractTypePrefix(auctionIdStr);

        // Split "Cluj-Napoca, RO" → city="Cluj-Napoca", country="RO"
        String location = rs.getString("location");
        String[] locationParts = parseLocation(location);
        String city = locationParts[0];
        String country = locationParts[1];

        // Map field names; optional columns fall back to defaults when unreadable
        int lotCount = getIntOrDefault(rs, "lots_count", 0);
        LocalDateTime closingTime = parseTimestamp(getStringOrNull(rs, "first_lot_closing_time"));

        return new AuctionInfo(
            auctionId,
            rs.getString("title"),
            location,
            city,
            country,
            rs.getString("url"),
            type,
            lotCount,
            closingTime
        );
    }

    /**
     * Converts scraper's lot format to monitor's Lot record.
     *
     * Scraper format:
     * - lot_id: "A1-28505-5" (TEXT)
     * - auction_id: "A7-39813" (TEXT)
     * - current_bid: "€123.45" or "No bids" (TEXT)
     * - bid_count: INTEGER (not read by this method)
     * - closing_time: TEXT
     *
     * @param rs result set positioned on the row to convert
     * @return the converted lot; manufacturer, type and year are left empty/zero
     *         because the scraper schema does not provide them
     * @throws SQLException if a mandatory column (lot_id, auction_id, title, url)
     *         cannot be read
     */
    static Lot fromScraperLot(ResultSet rs) throws SQLException {
        // Parse "A1-28505-5" → lotId=285055
        String lotIdStr = rs.getString("lot_id");
        int lotId = extractNumericId(lotIdStr);

        // Parse "A7-39813" → saleId=39813
        String auctionIdStr = rs.getString("auction_id");
        int saleId = extractNumericId(auctionIdStr);

        // Parse "€123.45" → currentBid=123.45, currency="EUR"
        String currentBidStr = getStringOrNull(rs, "current_bid");
        double currentBid = parseBidAmount(currentBidStr);
        String currency = parseBidCurrency(currentBidStr);

        // Parse timestamp
        LocalDateTime closingTime = parseTimestamp(getStringOrNull(rs, "closing_time"));

        return new Lot(
            saleId,
            lotId,
            rs.getString("title"),
            getStringOrDefault(rs, "description", ""),
            "", // manufacturer - not in scraper schema
            "", // type - not in scraper schema
            0, // year - not in scraper schema
            getStringOrDefault(rs, "category", ""),
            currentBid,
            currency,
            rs.getString("url"),
            closingTime,
            false // closing_notified - not yet notified
        );
    }

    /**
     * Extracts numeric ID from scraper's text format.
     *
     * <p>The leading type prefix (the part before the first dash, e.g. "A7") is
     * dropped when it is not purely numeric, so that digits belonging to the prefix
     * do not leak into the id. All digits of the remainder are concatenated.
     *
     * Examples:
     * - "A7-39813" → 39813
     * - "A1-28505-5" → 285055 (concatenates all digits after the prefix)
     * - "39813" → 39813
     * - null, "" or a value without digits → 0
     *
     * @param id scraper id text, may be {@code null}
     * @return the numeric id, or 0 when no digits are present
     * @throws NumberFormatException if the collected digits do not fit in an {@code int}
     */
    static int extractNumericId(String id) {
        if (id == null || id.isEmpty()) {
            return 0;
        }

        String numericPart = id;
        int dashIndex = id.indexOf('-');
        if (dashIndex > 0 && !isAllDigits(id.substring(0, dashIndex))) {
            numericPart = id.substring(dashIndex + 1);
        }

        StringBuilder digits = new StringBuilder(numericPart.length());
        for (int i = 0; i < numericPart.length(); i++) {
            char c = numericPart.charAt(i);
            if (c >= '0' && c <= '9') {
                digits.append(c);
            }
        }
        return digits.length() == 0 ? 0 : Integer.parseInt(digits.toString());
    }

    /** Returns true when {@code text} is non-empty and consists only of ASCII digits. */
    private static boolean isAllDigits(String text) {
        if (text.isEmpty()) {
            return false;
        }
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c < '0' || c > '9') {
                return false;
            }
        }
        return true;
    }

    /**
     * Extracts type prefix from scraper's auction/lot ID.
     * Examples:
     * - "A7-39813" → "A7"
     * - "A1-28505-5" → "A1"
     *
     * @param id scraper id text, may be {@code null}
     * @return the text before the first dash, or "" when there is no dash, the dash is
     *         the first character, or the id is null/empty
     */
    private static String extractTypePrefix(String id) {
        if (id == null || id.isEmpty()) {
            return "";
        }
        int dashIndex = id.indexOf('-');
        return dashIndex > 0 ? id.substring(0, dashIndex) : "";
    }

    /**
     * Parses location string into [city, country] array.
     * Examples:
     * - "Cluj-Napoca, RO" → ["Cluj-Napoca", "RO"]
     * - "Amsterdam" → ["Amsterdam", ""]
     *
     * <p>With more than two comma-separated parts the first is taken as the city and
     * the last as the country.
     *
     * @param location combined location text, may be {@code null}
     * @return a two-element array {city, country}; missing parts are ""
     */
    private static String[] parseLocation(String location) {
        if (location == null || location.isEmpty()) {
            return new String[]{"", ""};
        }
        String[] parts = location.split(",\\s*");
        String city = parts.length > 0 ? parts[0].trim() : "";
        String country = parts.length > 1 ? parts[parts.length - 1].trim() : "";
        return new String[]{city, country};
    }

    /**
     * Parses bid amount from scraper's text format.
     * Examples:
     * - "€123.45" → 123.45
     * - "$50.00" → 50.0
     * - "No bids" → 0.0
     * - "123.45" → 123.45
     *
     * @param bid bid text, may be {@code null}
     * @return the amount, or 0.0 when the text is null, empty, contains "no"
     *         (case-insensitive) or does not reduce to a parseable number
     */
    private static double parseBidAmount(String bid) {
        if (bid == null || bid.isEmpty() || bid.toLowerCase().contains("no")) {
            return 0.0;
        }
        try {
            // Remove all non-numeric characters except decimal point
            String cleanBid = bid.replaceAll("[^0-9.]", "");
            return cleanBid.isEmpty() ? 0.0 : Double.parseDouble(cleanBid);
        } catch (NumberFormatException e) {
            // e.g. more than one decimal point left after cleaning
            return 0.0;
        }
    }

    /**
     * Extracts currency from bid string.
     * Examples:
     * - "€123.45" → "EUR"
     * - "$50.00" → "USD"
     * - "£10" → "GBP"
     * - "123.45" → "EUR" (default)
     *
     * @param bid bid text, may be {@code null}
     * @return ISO currency code for the first recognised symbol, "EUR" otherwise
     */
    private static String parseBidCurrency(String bid) {
        if (bid == null || bid.isEmpty()) {
            return DEFAULT_CURRENCY;
        }
        if (bid.contains(EURO_SIGN)) {
            return "EUR";
        }
        if (bid.contains("$")) {
            return "USD";
        }
        if (bid.contains(POUND_SIGN)) {
            return "GBP";
        }
        return DEFAULT_CURRENCY;
    }

    /**
     * Parses timestamp from various formats used by the scraper.
     * Tries multiple formats in order.
     *
     * @param timestamp timestamp text, may be {@code null}
     * @return the parsed date-time, or {@code null} when the text is null, empty or
     *         matches none of the known formats (a warning is printed in that case)
     */
    private static LocalDateTime parseTimestamp(String timestamp) {
        if (timestamp == null || timestamp.isEmpty()) {
            return null;
        }
        for (DateTimeFormatter formatter : TIMESTAMP_FORMATS) {
            try {
                return LocalDateTime.parse(timestamp, formatter);
            } catch (DateTimeParseException ignored) {
                // Try next format
            }
        }
        // Couldn't parse - return null
        Console.println("⚠️ Could not parse timestamp: " + timestamp);
        return null;
    }

    // Helper methods for safe ResultSet access: a column that cannot be read
    // (for example because it is absent from the query) yields the fallback value.

    /** Reads a string column, returning {@code null} when it cannot be read. */
    private static String getStringOrNull(ResultSet rs, String column) throws SQLException {
        try {
            return rs.getString(column);
        } catch (SQLException e) {
            return null;
        }
    }

    /** Reads a string column, returning {@code defaultValue} when it is NULL or unreadable. */
    private static String getStringOrDefault(ResultSet rs, String column, String defaultValue) throws SQLException {
        try {
            String value = rs.getString(column);
            return value != null ? value : defaultValue;
        } catch (SQLException e) {
            return defaultValue;
        }
    }

    /** Reads an int column, returning {@code defaultValue} when it cannot be read. */
    private static int getIntOrDefault(ResultSet rs, String column, int defaultValue) throws SQLException {
        try {
            return rs.getInt(column);
        } catch (SQLException e) {
            return defaultValue;
        }
    }
}

View File

@@ -35,26 +35,26 @@ public class AuctionParsingTest {
System.out.println("\n=== Location Pattern Tests ===");
// Test different location formats
String[] testCases = {
var testCases = new String[]{
"<p>Amsterdam, NL</p>",
"<p class=\"flex truncate\"><span class=\"w-full truncate\">Sofia,<!-- --> </span>BG</p>",
"<p>Berlin, DE</p>",
"<span>Brussels,</span>BE"
};
for (String testHtml : testCases) {
Document doc = Jsoup.parse(testHtml);
Element elem = doc.select("p, span").first();
for (var testHtml : testCases) {
var doc = Jsoup.parse(testHtml);
var elem = doc.select("p, span").first();
if (elem != null) {
String text = elem.text();
var text = elem.text();
System.out.println("\nTest: " + testHtml);
System.out.println("Text: " + text);
// Test regex pattern
if (text.matches(".*[A-Z]{2}$")) {
String countryCode = text.substring(text.length() - 2);
String cityPart = text.substring(0, text.length() - 2).trim().replaceAll("[,\\s]+$", "");
var countryCode = text.substring(text.length() - 2);
var cityPart = text.substring(0, text.length() - 2).trim().replaceAll("[,\\s]+$", "");
System.out.println("→ Extracted: " + cityPart + ", " + countryCode);
} else {
System.out.println("→ No match");
@@ -68,39 +68,39 @@ public class AuctionParsingTest {
System.out.println("\n=== Full Text Pattern Tests ===");
// Test the complete auction text format
String[] testCases = {
var testCases = new String[]{
"woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE",
"maandag om 14:30 5 Industriële machines Amsterdam, NL",
"vrijdag om 10:00 12 Landbouwmachines Antwerpen, BE"
};
for (String testText : testCases) {
for (var testText : testCases) {
System.out.println("\nParsing: \"" + testText + "\"");
// Simulated extraction
String remaining = testText;
var remaining = testText;
// Extract time
java.util.regex.Pattern timePattern = java.util.regex.Pattern.compile("(\\w+)\\s+om\\s+(\\d{1,2}:\\d{2})");
java.util.regex.Matcher timeMatcher = timePattern.matcher(remaining);
var timePattern = java.util.regex.Pattern.compile("(\\w+)\\s+om\\s+(\\d{1,2}:\\d{2})");
var timeMatcher = timePattern.matcher(remaining);
if (timeMatcher.find()) {
System.out.println(" Time: " + timeMatcher.group(1) + " om " + timeMatcher.group(2));
remaining = remaining.substring(timeMatcher.end()).trim();
}
// Extract location
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
var locPattern = java.util.regex.Pattern.compile(
"([A-ZÀ-ÿa-z][A-ZÀ-ÿa-z\\s\\-'öäüßàèéêëïôùûç]+?),\\s*([A-Z]{2})\\s*$"
);
java.util.regex.Matcher locMatcher = locPattern.matcher(remaining);
);
var locMatcher = locPattern.matcher(remaining);
if (locMatcher.find()) {
System.out.println(" Location: " + locMatcher.group(1) + ", " + locMatcher.group(2));
remaining = remaining.substring(0, locMatcher.start()).trim();
}
// Extract lot count
java.util.regex.Pattern lotPattern = java.util.regex.Pattern.compile("^(\\d+)\\s+");
java.util.regex.Matcher lotMatcher = lotPattern.matcher(remaining);
var lotPattern = java.util.regex.Pattern.compile("^(\\d+)\\s+");
var lotMatcher = lotPattern.matcher(remaining);
if (lotMatcher.find()) {
System.out.println(" Lot count: " + lotMatcher.group(1));
remaining = remaining.substring(lotMatcher.end()).trim();

File diff suppressed because one or more lines are too long

View File

@@ -29,7 +29,7 @@ public class TroostwijkScraperTest {
// Load native OpenCV library before any tests run
try {
System.loadLibrary(Core.NATIVE_LIBRARY_NAME);
System.out.println("✓ OpenCV native library loaded successfully");
IO.println("✓ OpenCV native library loaded successfully");
} catch (UnsatisfiedLinkError e) {
System.err.println("⚠️ Warning: Could not load OpenCV native library");
System.err.println(" Tests will run without object detection support");
@@ -61,25 +61,10 @@ public class TroostwijkScraperTest {
}
// Clean up test database
File dbFile = new File(testDatabasePath);
var dbFile = new File(testDatabasePath);
if (dbFile.exists()) {
dbFile.delete();
}
}
@Test
public void testDatabaseSchema() throws SQLException {
    // Smoke test: each query below only has to succeed against the freshly created schema.
    var database = scraper.db;

    // All lots
    assertNotNull(database.getAllLots(), "Should be able to query lots table");

    // Image count
    assertTrue(database.getImageCount() >= 0, "Image count should be non-negative");

    // Active lots
    assertNotNull(database.getActiveLots(), "Should be able to query active lots");

    System.out.println("✓ Database schema is valid and queryable");
}
}