start
This commit is contained in:
24
src/main/java/com/auction/AuctionInfo.java
Normal file
24
src/main/java/com/auction/AuctionInfo.java
Normal file
@@ -0,0 +1,24 @@
|
||||
package com.auction;
|
||||
|
||||
import java.time.LocalDateTime;
|
||||
/**
|
||||
* Represents auction metadata (veiling informatie)
|
||||
*/
|
||||
public final class AuctionInfo {
|
||||
|
||||
public int auctionId; // Unique auction ID (from URL)
|
||||
public String title; // Auction title
|
||||
public String location; // Location (e.g., "Amsterdam, NL")
|
||||
public String city; // City name
|
||||
public String country; // Country code (e.g., "NL")
|
||||
public String url; // Full auction URL
|
||||
public String type; // Auction type (A1 or A7)
|
||||
public int lotCount; // Number of lots/kavels
|
||||
public LocalDateTime closingTime; // Closing time if available
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("Auction{id=%d, type=%s, title='%s', location='%s', lots=%d, url='%s'}",
|
||||
auctionId, type, title, location, lotCount, url);
|
||||
}
|
||||
}
|
||||
165
src/main/java/com/auction/CacheDatabase.java
Normal file
165
src/main/java/com/auction/CacheDatabase.java
Normal file
@@ -0,0 +1,165 @@
|
||||
package com.auction;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.PreparedStatement;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
import java.time.Instant;
|
||||
/**
 * SQLite-based caching system for HTML pages with expiration support.
 *
 * <p>Pages are stored in a single {@code page_cache} table keyed by URL, together
 * with the epoch-second timestamps at which they were cached and at which they
 * expire. Call {@link #initialize()} before using the cache. Until then, and
 * again after {@link #close()}, reads behave as cache misses and writes are
 * dropped, with a message on {@code System.err}.
 *
 * <p>All operations synchronize on this instance because they share one JDBC
 * connection.
 */
class CacheDatabase implements AutoCloseable {

    private final String dbPath;
    private Connection connection;

    /**
     * Creates a cache backed by the SQLite file at {@code dbPath}. No I/O happens
     * until {@link #initialize()} is called.
     *
     * @param dbPath path of the SQLite database file
     */
    public CacheDatabase(String dbPath) {
        this.dbPath = dbPath;
    }

    /**
     * Initialize database and create schema. Creates the parent directory of the
     * database file if needed, opens the connection, creates the table and index
     * if they do not exist, and removes entries that have already expired.
     *
     * @throws SQLException if the connection cannot be opened or the schema cannot be created
     * @throws IOException  if the cache directory cannot be created
     */
    public synchronized void initialize() throws SQLException, IOException {
        // Create cache directory if it doesn't exist
        var cacheDir = Paths.get(dbPath).getParent();
        if (cacheDir != null) {
            Files.createDirectories(cacheDir);
        }

        // A repeated initialize() must not leak the previously opened connection.
        close();
        connection = DriverManager.getConnection("jdbc:sqlite:" + dbPath);

        // Create cache table with URL as primary key
        var createTable = "CREATE TABLE IF NOT EXISTS page_cache (\n" +
                " url TEXT PRIMARY KEY,\n" +
                " html TEXT NOT NULL,\n" +
                " cached_at INTEGER NOT NULL,\n" +
                " expires_at INTEGER NOT NULL\n" +
                ")\n";

        try (var stmt = connection.createStatement()) {
            stmt.execute(createTable);
            // Create index on expires_at for efficient cleanup
            stmt.execute("CREATE INDEX IF NOT EXISTS idx_expires_at ON page_cache(expires_at)");
        }

        // Clean up expired entries on initialization
        cleanupExpired();

        System.out.println("✓ Cache database initialized");
    }

    /**
     * Get cached HTML for a URL if it exists and hasn't expired.
     *
     * @param url The URL to look up
     * @return Cached HTML, or {@code null} if not found, expired, the cache is not
     *         initialized, or the lookup failed
     */
    public synchronized String get(String url) {
        if (isUnavailable("Cache read error")) {
            return null;
        }
        var sql = "SELECT html FROM page_cache WHERE url = ? AND expires_at > ?";

        try (var ps = connection.prepareStatement(sql)) {
            ps.setString(1, url);
            ps.setLong(2, Instant.now().getEpochSecond());

            try (var rs = ps.executeQuery()) {
                if (rs.next()) {
                    return rs.getString("html");
                }
            }
        } catch (SQLException e) {
            System.err.println("Cache read error: " + e.getMessage());
        }

        return null;
    }

    /**
     * Store HTML in cache with expiration time, replacing any existing entry for
     * the same URL. Failures are reported on {@code System.err} and otherwise ignored.
     *
     * @param url             The URL to cache
     * @param html            The HTML content
     * @param expirationHours Hours until cache expires
     */
    public synchronized void put(String url, String html, long expirationHours) {
        if (isUnavailable("Cache write error")) {
            return;
        }
        var sql = "INSERT OR REPLACE INTO page_cache (url, html, cached_at, expires_at)\n" +
                "VALUES (?, ?, ?, ?)\n";

        var now = Instant.now().getEpochSecond();
        var expiresAt = now + (expirationHours * 3600);

        try (var ps = connection.prepareStatement(sql)) {
            ps.setString(1, url);
            ps.setString(2, html);
            ps.setLong(3, now);
            ps.setLong(4, expiresAt);
            ps.executeUpdate();
        } catch (SQLException e) {
            System.err.println("Cache write error: " + e.getMessage());
        }
    }

    /**
     * Remove expired cache entries (those whose expiry is at or before the current time).
     */
    public synchronized void cleanupExpired() {
        if (isUnavailable("Cache cleanup error")) {
            return;
        }
        var sql = "DELETE FROM page_cache WHERE expires_at <= ?";

        try (var ps = connection.prepareStatement(sql)) {
            ps.setLong(1, Instant.now().getEpochSecond());
            var deleted = ps.executeUpdate();
            if (deleted > 0) {
                System.out.println("✓ Cleaned up " + deleted + " expired cache entries");
            }
        } catch (SQLException e) {
            System.err.println("Cache cleanup error: " + e.getMessage());
        }
    }

    /**
     * Print cache statistics (total, valid and expired entry counts and the summed
     * HTML length in KB) to {@code System.out}.
     */
    public synchronized void printStats() {
        if (isUnavailable("Cache stats error")) {
            return;
        }
        var sql = "SELECT COUNT(*) as total, " +
                "SUM(CASE WHEN expires_at > ? THEN 1 ELSE 0 END) as valid, " +
                "SUM(LENGTH(html)) as total_size " +
                "FROM page_cache";

        try (var ps = connection.prepareStatement(sql)) {
            ps.setLong(1, Instant.now().getEpochSecond());

            try (var rs = ps.executeQuery()) {
                if (rs.next()) {
                    var total = rs.getInt("total");
                    var valid = rs.getInt("valid");
                    var size = rs.getLong("total_size");

                    System.out.println("\n=== Cache Statistics ===");
                    System.out.println("Total entries: " + total);
                    System.out.println("Valid entries: " + valid);
                    System.out.println("Expired entries: " + (total - valid));
                    System.out.println("Total size: " + (size / 1024) + " KB");
                }
            }
        } catch (SQLException e) {
            System.err.println("Cache stats error: " + e.getMessage());
        }
    }

    /**
     * Close database connection. Safe to call repeatedly and before
     * {@link #initialize()}; close failures are reported on {@code System.err}.
     */
    @Override
    public synchronized void close() {
        if (connection != null) {
            try {
                connection.close();
            } catch (SQLException e) {
                System.err.println("Error closing cache database: " + e.getMessage());
            } finally {
                // Drop the reference so later calls take the "not initialized" path.
                connection = null;
            }
        }
    }

    /**
     * Reports on {@code System.err} and returns {@code true} when there is no open
     * connection, so callers can bail out instead of hitting a NullPointerException.
     */
    private boolean isUnavailable(String errorPrefix) {
        if (connection != null) {
            return false;
        }
        System.err.println(errorPrefix + ": cache database not initialized");
        return true;
    }
}
|
||||
303
src/main/java/com/auction/DatabaseService.java
Normal file
303
src/main/java/com/auction/DatabaseService.java
Normal file
@@ -0,0 +1,303 @@
|
||||
package com.auction;
|
||||
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.PreparedStatement;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
import java.time.Instant;
|
||||
import java.time.LocalDateTime;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
/**
|
||||
* Service for persisting auctions, lots, images, and object labels into
|
||||
* a SQLite database. Uses the Xerial JDBC driver which connects to
|
||||
* SQLite via a URL of the form "jdbc:sqlite:path_to_file"【329850066306528†L40-L63】.
|
||||
*/
|
||||
public class DatabaseService {
|
||||
|
||||
private final String url;
|
||||
DatabaseService(String dbPath) {
|
||||
this.url = "jdbc:sqlite:" + dbPath;
|
||||
}
|
||||
/**
|
||||
* Creates tables if they do not already exist. The schema includes
|
||||
* tables for auctions, lots, images, and object labels. This method is
|
||||
* idempotent; it can be called multiple times.
|
||||
*/
|
||||
void ensureSchema() throws SQLException {
|
||||
try (var conn = DriverManager.getConnection(url); var stmt = conn.createStatement()) {
|
||||
// Auctions table (veilingen)
|
||||
stmt.execute("CREATE TABLE IF NOT EXISTS auctions ("
|
||||
+ "auction_id INTEGER PRIMARY KEY,"
|
||||
+ "title TEXT NOT NULL,"
|
||||
+ "location TEXT,"
|
||||
+ "city TEXT,"
|
||||
+ "country TEXT,"
|
||||
+ "url TEXT NOT NULL,"
|
||||
+ "type TEXT,"
|
||||
+ "lot_count INTEGER DEFAULT 0,"
|
||||
+ "closing_time TEXT,"
|
||||
+ "discovered_at INTEGER" // Unix timestamp
|
||||
+ ")");
|
||||
|
||||
// Sales table (legacy - keep for compatibility)
|
||||
stmt.execute("CREATE TABLE IF NOT EXISTS sales ("
|
||||
+ "sale_id INTEGER PRIMARY KEY,"
|
||||
+ "title TEXT,"
|
||||
+ "location TEXT,"
|
||||
+ "closing_time TEXT"
|
||||
+ ")");
|
||||
|
||||
// Lots table
|
||||
stmt.execute("CREATE TABLE IF NOT EXISTS lots ("
|
||||
+ "lot_id INTEGER PRIMARY KEY,"
|
||||
+ "sale_id INTEGER,"
|
||||
+ "title TEXT,"
|
||||
+ "description TEXT,"
|
||||
+ "manufacturer TEXT,"
|
||||
+ "type TEXT,"
|
||||
+ "year INTEGER,"
|
||||
+ "category TEXT,"
|
||||
+ "current_bid REAL,"
|
||||
+ "currency TEXT,"
|
||||
+ "url TEXT,"
|
||||
+ "closing_time TEXT,"
|
||||
+ "closing_notified INTEGER DEFAULT 0,"
|
||||
+ "FOREIGN KEY (sale_id) REFERENCES auctions(auction_id)"
|
||||
+ ")");
|
||||
|
||||
// Images table
|
||||
stmt.execute("CREATE TABLE IF NOT EXISTS images ("
|
||||
+ "id INTEGER PRIMARY KEY AUTOINCREMENT,"
|
||||
+ "lot_id INTEGER,"
|
||||
+ "url TEXT,"
|
||||
+ "file_path TEXT,"
|
||||
+ "labels TEXT,"
|
||||
+ "FOREIGN KEY (lot_id) REFERENCES lots(lot_id)"
|
||||
+ ")");
|
||||
|
||||
// Create indexes for better query performance
|
||||
stmt.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country)");
|
||||
stmt.execute("CREATE INDEX IF NOT EXISTS idx_lots_sale_id ON lots(sale_id)");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Inserts or updates an auction record
|
||||
*/
|
||||
synchronized void upsertAuction(AuctionInfo auction) throws SQLException {
|
||||
var sql = "INSERT INTO auctions (auction_id, title, location, city, country, url, type, lot_count, closing_time, discovered_at)"
|
||||
+ " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
|
||||
+ " ON CONFLICT(auction_id) DO UPDATE SET "
|
||||
+ "title = excluded.title, location = excluded.location, city = excluded.city, "
|
||||
+ "country = excluded.country, url = excluded.url, type = excluded.type, "
|
||||
+ "lot_count = excluded.lot_count, closing_time = excluded.closing_time";
|
||||
|
||||
try (var conn = DriverManager.getConnection(url); var ps = conn.prepareStatement(sql)) {
|
||||
ps.setInt(1, auction.auctionId);
|
||||
ps.setString(2, auction.title);
|
||||
ps.setString(3, auction.location);
|
||||
ps.setString(4, auction.city);
|
||||
ps.setString(5, auction.country);
|
||||
ps.setString(6, auction.url);
|
||||
ps.setString(7, auction.type);
|
||||
ps.setInt(8, auction.lotCount);
|
||||
ps.setString(9, auction.closingTime != null ? auction.closingTime.toString() : null);
|
||||
ps.setLong(10, Instant.now().getEpochSecond());
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves all auctions from the database
|
||||
*/
|
||||
synchronized List<AuctionInfo> getAllAuctions() throws SQLException {
|
||||
List<AuctionInfo> auctions = new ArrayList<>();
|
||||
var sql = "SELECT auction_id, title, location, city, country, url, type, lot_count, closing_time FROM auctions";
|
||||
|
||||
try (var conn = DriverManager.getConnection(url); var stmt = conn.createStatement()) {
|
||||
var rs = stmt.executeQuery(sql);
|
||||
while (rs.next()) {
|
||||
var auction = new AuctionInfo();
|
||||
auction.auctionId = rs.getInt("auction_id");
|
||||
auction.title = rs.getString("title");
|
||||
auction.location = rs.getString("location");
|
||||
auction.city = rs.getString("city");
|
||||
auction.country = rs.getString("country");
|
||||
auction.url = rs.getString("url");
|
||||
auction.type = rs.getString("type");
|
||||
auction.lotCount = rs.getInt("lot_count");
|
||||
var closing = rs.getString("closing_time");
|
||||
if (closing != null) {
|
||||
auction.closingTime = LocalDateTime.parse(closing);
|
||||
}
|
||||
auctions.add(auction);
|
||||
}
|
||||
}
|
||||
return auctions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves auctions by country code
|
||||
*/
|
||||
synchronized List<AuctionInfo> getAuctionsByCountry(String countryCode) throws SQLException {
|
||||
List<AuctionInfo> auctions = new ArrayList<>();
|
||||
var sql = "SELECT auction_id, title, location, city, country, url, type, lot_count, closing_time "
|
||||
+ "FROM auctions WHERE country = ?";
|
||||
|
||||
try (var conn = DriverManager.getConnection(url); var ps = conn.prepareStatement(sql)) {
|
||||
ps.setString(1, countryCode);
|
||||
var rs = ps.executeQuery();
|
||||
while (rs.next()) {
|
||||
var auction = new AuctionInfo();
|
||||
auction.auctionId = rs.getInt("auction_id");
|
||||
auction.title = rs.getString("title");
|
||||
auction.location = rs.getString("location");
|
||||
auction.city = rs.getString("city");
|
||||
auction.country = rs.getString("country");
|
||||
auction.url = rs.getString("url");
|
||||
auction.type = rs.getString("type");
|
||||
auction.lotCount = rs.getInt("lot_count");
|
||||
var closing = rs.getString("closing_time");
|
||||
if (closing != null) {
|
||||
auction.closingTime = LocalDateTime.parse(closing);
|
||||
}
|
||||
auctions.add(auction);
|
||||
}
|
||||
}
|
||||
return auctions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Inserts or updates a lot record. Uses INSERT OR REPLACE to
|
||||
* implement upsert semantics so that existing rows are replaced.
|
||||
*/
|
||||
synchronized void upsertLot(Lot lot) throws SQLException {
|
||||
var sql = "INSERT INTO lots (lot_id, sale_id, title, description, manufacturer, type, year, category, current_bid, currency, url, closing_time, closing_notified)"
|
||||
+ " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
|
||||
+ " ON CONFLICT(lot_id) DO UPDATE SET "
|
||||
+ "sale_id = excluded.sale_id, title = excluded.title, description = excluded.description, "
|
||||
+ "manufacturer = excluded.manufacturer, type = excluded.type, year = excluded.year, category = excluded.category, "
|
||||
+ "current_bid = excluded.current_bid, currency = excluded.currency, url = excluded.url, closing_time = excluded.closing_time";
|
||||
try (var conn = DriverManager.getConnection(url); var ps = conn.prepareStatement(sql)) {
|
||||
ps.setInt(1, lot.lotId);
|
||||
ps.setInt(2, lot.saleId);
|
||||
ps.setString(3, lot.title);
|
||||
ps.setString(4, lot.description);
|
||||
ps.setString(5, lot.manufacturer);
|
||||
ps.setString(6, lot.type);
|
||||
ps.setInt(7, lot.year);
|
||||
ps.setString(8, lot.category);
|
||||
ps.setDouble(9, lot.currentBid);
|
||||
ps.setString(10, lot.currency);
|
||||
ps.setString(11, lot.url);
|
||||
ps.setString(12, lot.closingTime != null ? lot.closingTime.toString() : null);
|
||||
ps.setInt(13, lot.closingNotified ? 1 : 0);
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Inserts a new image record. Each image is associated with a lot and
|
||||
* stores both the original URL and the local file path. Detected
|
||||
* labels are stored as a comma separated string.
|
||||
*/
|
||||
synchronized void insertImage(int lotId, String url, String filePath, List<String> labels) throws SQLException {
|
||||
var sql = "INSERT INTO images (lot_id, url, file_path, labels) VALUES (?, ?, ?, ?)";
|
||||
try (var conn = DriverManager.getConnection(this.url); var ps = conn.prepareStatement(sql)) {
|
||||
ps.setInt(1, lotId);
|
||||
ps.setString(2, url);
|
||||
ps.setString(3, filePath);
|
||||
ps.setString(4, String.join(",", labels));
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves all lots that are still active (i.e., have a closing time
|
||||
* in the future or unknown). Only these lots need to be monitored.
|
||||
*/
|
||||
synchronized List<Lot> getActiveLots() throws SQLException {
|
||||
List<Lot> list = new ArrayList<>();
|
||||
var sql = "SELECT lot_id, sale_id, current_bid, currency, closing_time, closing_notified FROM lots";
|
||||
try (var conn = DriverManager.getConnection(url); var stmt = conn.createStatement()) {
|
||||
var rs = stmt.executeQuery(sql);
|
||||
while (rs.next()) {
|
||||
var lot = new Lot();
|
||||
lot.lotId = rs.getInt("lot_id");
|
||||
lot.saleId = rs.getInt("sale_id");
|
||||
lot.currentBid = rs.getDouble("current_bid");
|
||||
lot.currency = rs.getString("currency");
|
||||
var closing = rs.getString("closing_time");
|
||||
lot.closingNotified = rs.getInt("closing_notified") != 0;
|
||||
if (closing != null) {
|
||||
lot.closingTime = LocalDateTime.parse(closing);
|
||||
}
|
||||
list.add(lot);
|
||||
}
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves all lots from the database.
|
||||
*/
|
||||
synchronized List<Lot> getAllLots() throws SQLException {
|
||||
List<Lot> list = new ArrayList<>();
|
||||
var sql = "SELECT lot_id, sale_id, title, current_bid, currency FROM lots";
|
||||
try (var conn = DriverManager.getConnection(url); var stmt = conn.createStatement()) {
|
||||
var rs = stmt.executeQuery(sql);
|
||||
while (rs.next()) {
|
||||
var lot = new Lot();
|
||||
lot.lotId = rs.getInt("lot_id");
|
||||
lot.saleId = rs.getInt("sale_id");
|
||||
lot.title = rs.getString("title");
|
||||
lot.currentBid = rs.getDouble("current_bid");
|
||||
lot.currency = rs.getString("currency");
|
||||
list.add(lot);
|
||||
}
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the total number of images in the database.
|
||||
*/
|
||||
synchronized int getImageCount() throws SQLException {
|
||||
var sql = "SELECT COUNT(*) as count FROM images";
|
||||
try (var conn = DriverManager.getConnection(url); var stmt = conn.createStatement()) {
|
||||
var rs = stmt.executeQuery(sql);
|
||||
if (rs.next()) {
|
||||
return rs.getInt("count");
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates the current bid of a lot after a bid refresh.
|
||||
*/
|
||||
synchronized void updateLotCurrentBid(Lot lot) throws SQLException {
|
||||
try (var conn = DriverManager.getConnection(url); var ps = conn.prepareStatement(
|
||||
"UPDATE lots SET current_bid = ? WHERE lot_id = ?")) {
|
||||
ps.setDouble(1, lot.currentBid);
|
||||
ps.setInt(2, lot.lotId);
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates the closingNotified flag of a lot (set to 1 when we have
|
||||
* warned the user about its imminent closure).
|
||||
*/
|
||||
synchronized void updateLotNotificationFlags(Lot lot) throws SQLException {
|
||||
try (var conn = DriverManager.getConnection(url); var ps = conn.prepareStatement(
|
||||
"UPDATE lots SET closing_notified = ? WHERE lot_id = ?")) {
|
||||
ps.setInt(1, lot.closingNotified ? 1 : 0);
|
||||
ps.setInt(2, lot.lotId);
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
29
src/main/java/com/auction/Lot.java
Normal file
29
src/main/java/com/auction/Lot.java
Normal file
@@ -0,0 +1,29 @@
|
||||
package com.auction;
|
||||
|
||||
import java.time.LocalDateTime;
|
||||
/**
|
||||
* Simple POJO representing a lot (kavel) in an auction. It keeps track
|
||||
* of the sale it belongs to, current bid and closing time. The method
|
||||
* minutesUntilClose computes how many minutes remain until the lot closes.
|
||||
*/
|
||||
final class Lot {
|
||||
|
||||
int saleId;
|
||||
int lotId;
|
||||
String title;
|
||||
String description;
|
||||
String manufacturer;
|
||||
String type;
|
||||
int year;
|
||||
String category;
|
||||
double currentBid;
|
||||
String currency;
|
||||
String url;
|
||||
LocalDateTime closingTime; // null if unknown
|
||||
boolean closingNotified;
|
||||
|
||||
long minutesUntilClose() {
|
||||
if (closingTime == null) return Long.MAX_VALUE;
|
||||
return java.time.Duration.between(LocalDateTime.now(), closingTime).toMinutes();
|
||||
}
|
||||
}
|
||||
@@ -1,23 +1,82 @@
|
||||
package com.auction;
|
||||
|
||||
import org.opencv.core.Core;
|
||||
import java.util.List;
|
||||
public class Main {
|
||||
public static void main(String[] args) {
|
||||
public static void main2(String[] args) {
|
||||
// If arguments are passed, this is likely a one-off command via dokku run
|
||||
// Just exit immediately to allow the command to run
|
||||
if (args.length > 0) {
|
||||
System.out.println("Command mode - exiting to allow shell commands");
|
||||
IO.println("Command mode - exiting to allow shell commands");
|
||||
return;
|
||||
}
|
||||
|
||||
System.out.println("Starting Troostwijk Auction Scraper...");
|
||||
System.out.println("Container is running and healthy.");
|
||||
IO.println("Starting Troostwijk Auction Scraper...");
|
||||
IO.println("Container is running and healthy.");
|
||||
|
||||
// Keep container alive
|
||||
try {
|
||||
Thread.sleep(Long.MAX_VALUE);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
System.out.println("Container interrupted, exiting.");
|
||||
IO.println("Container interrupted, exiting.");
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Entry point. Configure database location, notification settings, and
|
||||
* YOLO model paths here before running. Once started the scraper
|
||||
* discovers Dutch auctions, scrapes lots, and begins monitoring.
|
||||
*/
|
||||
public static void main(String[] args) throws Exception {
|
||||
IO.println("=== Troostwijk Auction Scraper ===\n");
|
||||
|
||||
// Configuration parameters (replace with your own values)
|
||||
String databaseFile = "troostwijk.db";
|
||||
|
||||
// Notification configuration - choose one:
|
||||
// Option 1: Desktop notifications only (free, no setup required)
|
||||
String notificationConfig = System.getenv().getOrDefault("NOTIFICATION_CONFIG", "desktop");
|
||||
|
||||
// Option 2: Desktop + Email via Gmail (free, requires Gmail app password)
|
||||
// Format: "smtp:username:appPassword:toEmail"
|
||||
// Example: "smtp:your.email@gmail.com:abcd1234efgh5678:recipient@example.com"
|
||||
// Get app password: Google Account > Security > 2-Step Verification > App passwords
|
||||
|
||||
// YOLO model paths (optional - scraper works without object detection)
|
||||
String yoloCfg = "models/yolov4.cfg";
|
||||
String yoloWeights = "models/yolov4.weights";
|
||||
String yoloClasses = "models/coco.names";
|
||||
|
||||
// Load native OpenCV library
|
||||
System.loadLibrary(Core.NATIVE_LIBRARY_NAME);
|
||||
|
||||
IO.println("Initializing scraper...");
|
||||
TroostwijkScraper scraper = new TroostwijkScraper(databaseFile, notificationConfig, "",
|
||||
yoloCfg, yoloWeights, yoloClasses);
|
||||
|
||||
// Step 1: Discover auctions in NL
|
||||
IO.println("\n[1/3] Discovering Dutch auctions...");
|
||||
List<Integer> auctions = scraper.discoverDutchAuctions();
|
||||
IO.println("✓ Found " + auctions.size() + " auctions: " + auctions);
|
||||
|
||||
// Step 2: Fetch lots for each auction
|
||||
IO.println("\n[2/3] Fetching lot details...");
|
||||
int totalAuctions = auctions.size();
|
||||
int currentAuction = 0;
|
||||
for (int saleId : auctions) {
|
||||
currentAuction++;
|
||||
IO.println(" [Page " + currentAuction + "] Fetching auctions...");
|
||||
IO.println(" [" + currentAuction + "/" + totalAuctions + "] Processing sale " + saleId + "...");
|
||||
scraper.fetchLotsForSale(saleId);
|
||||
}
|
||||
|
||||
// Show database summary
|
||||
IO.println("\n📊 Database Summary:");
|
||||
scraper.printDatabaseStats();
|
||||
|
||||
// Step 3: Start monitoring bids and closures
|
||||
IO.println("\n[3/3] Starting monitoring service...");
|
||||
scraper.scheduleMonitoring();
|
||||
IO.println("✓ Monitoring active. Press Ctrl+C to stop.\n");
|
||||
}
|
||||
}
|
||||
|
||||
156
src/main/java/com/auction/NotificationService.java
Normal file
156
src/main/java/com/auction/NotificationService.java
Normal file
@@ -0,0 +1,156 @@
|
||||
package com.auction;
|
||||
|
||||
import javax.mail.Authenticator;
|
||||
import javax.mail.Message.RecipientType;
|
||||
import javax.mail.PasswordAuthentication;
|
||||
import javax.mail.Session;
|
||||
import javax.mail.Transport;
|
||||
import javax.mail.internet.InternetAddress;
|
||||
import javax.mail.internet.MimeMessage;
|
||||
import java.awt.SystemTray;
|
||||
import java.awt.Toolkit;
|
||||
import java.awt.TrayIcon;
|
||||
import java.awt.TrayIcon.MessageType;
|
||||
import java.util.Date;
|
||||
import java.util.Properties;
|
||||
/**
|
||||
* Service for sending notifications via desktop notifications and/or email.
|
||||
* Supports free notification methods:
|
||||
* 1. Desktop notifications (Windows/Linux/macOS system tray)
|
||||
* 2. Email via Gmail SMTP (free, requires app password)
|
||||
*
|
||||
* Configuration:
|
||||
* - For email: Set notificationEmail to your Gmail address
|
||||
* - Enable 2FA in Gmail and create an App Password
|
||||
* - Use format "smtp:username:appPassword:toEmail" for credentials
|
||||
* - Or use "desktop" for desktop-only notifications
|
||||
*/
|
||||
class NotificationService {
|
||||
|
||||
private final boolean useDesktop;
|
||||
private final boolean useEmail;
|
||||
private final String smtpUsername;
|
||||
private final String smtpPassword;
|
||||
private final String toEmail;
|
||||
|
||||
/**
|
||||
* Creates a notification service.
|
||||
*
|
||||
* @param config "desktop" for desktop only, or "smtp:username:password:toEmail" for email
|
||||
* @param unusedParam Kept for compatibility (can pass empty string)
|
||||
*/
|
||||
NotificationService(String config, String unusedParam) {
|
||||
|
||||
if ("desktop".equalsIgnoreCase(config)) {
|
||||
this.useDesktop = true;
|
||||
this.useEmail = false;
|
||||
this.smtpUsername = null;
|
||||
this.smtpPassword = null;
|
||||
this.toEmail = null;
|
||||
} else if (config.startsWith("smtp:")) {
|
||||
var parts = config.split(":", 4);
|
||||
if (parts.length != 4) {
|
||||
throw new IllegalArgumentException("Email config must be 'smtp:username:password:toEmail'");
|
||||
}
|
||||
this.useDesktop = true; // Always include desktop
|
||||
this.useEmail = true;
|
||||
this.smtpUsername = parts[1];
|
||||
this.smtpPassword = parts[2];
|
||||
this.toEmail = parts[3];
|
||||
} else {
|
||||
throw new IllegalArgumentException("Config must be 'desktop' or 'smtp:username:password:toEmail'");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends notification via configured channels.
|
||||
*
|
||||
* @param message The message body
|
||||
* @param title Message title
|
||||
* @param priority Priority level (0=normal, 1=high)
|
||||
*/
|
||||
void sendNotification(String message, String title, int priority) {
|
||||
if (useDesktop) {
|
||||
sendDesktopNotification(title, message, priority);
|
||||
}
|
||||
if (useEmail) {
|
||||
sendEmailNotification(title, message, priority);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends a desktop notification using system tray.
|
||||
* Works on Windows, macOS, and Linux with desktop environments.
|
||||
*/
|
||||
private void sendDesktopNotification(String title, String message, int priority) {
|
||||
try {
|
||||
if (SystemTray.isSupported()) {
|
||||
var tray = SystemTray.getSystemTray();
|
||||
var image = Toolkit.getDefaultToolkit()
|
||||
.createImage(new byte[0]); // Empty image
|
||||
|
||||
var trayIcon = new TrayIcon(image, "Troostwijk Scraper");
|
||||
trayIcon.setImageAutoSize(true);
|
||||
|
||||
var messageType = priority > 0
|
||||
? MessageType.WARNING
|
||||
: MessageType.INFO;
|
||||
|
||||
tray.add(trayIcon);
|
||||
trayIcon.displayMessage(title, message, messageType);
|
||||
|
||||
// Remove icon after 2 seconds to avoid clutter
|
||||
Thread.sleep(2000);
|
||||
tray.remove(trayIcon);
|
||||
|
||||
IO.println("Desktop notification sent: " + title);
|
||||
} else {
|
||||
IO.println("Desktop notifications not supported, logging: " + title + " - " + message);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
System.err.println("Desktop notification failed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends email notification via Gmail SMTP (free).
|
||||
* Uses Gmail's SMTP server with app password authentication.
|
||||
*/
|
||||
private void sendEmailNotification(String title, String message, int priority) {
|
||||
try {
|
||||
var props = new Properties();
|
||||
props.put("mail.smtp.auth", "true");
|
||||
props.put("mail.smtp.starttls.enable", "true");
|
||||
props.put("mail.smtp.host", "smtp.gmail.com");
|
||||
props.put("mail.smtp.port", "587");
|
||||
props.put("mail.smtp.ssl.trust", "smtp.gmail.com");
|
||||
|
||||
var session = Session.getInstance(props,
|
||||
new Authenticator() {
|
||||
|
||||
protected PasswordAuthentication getPasswordAuthentication() {
|
||||
return new PasswordAuthentication(smtpUsername, smtpPassword);
|
||||
}
|
||||
});
|
||||
|
||||
var msg = new MimeMessage(session);
|
||||
msg.setFrom(new InternetAddress(smtpUsername));
|
||||
msg.setRecipients(RecipientType.TO,
|
||||
InternetAddress.parse(toEmail));
|
||||
msg.setSubject("[Troostwijk] " + title);
|
||||
msg.setText(message);
|
||||
msg.setSentDate(new Date());
|
||||
|
||||
if (priority > 0) {
|
||||
msg.setHeader("X-Priority", "1");
|
||||
msg.setHeader("Importance", "High");
|
||||
}
|
||||
|
||||
Transport.send(msg);
|
||||
IO.println("Email notification sent: " + title);
|
||||
|
||||
} catch (Exception e) {
|
||||
System.err.println("Email notification failed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
140
src/main/java/com/auction/ObjectDetectionService.java
Normal file
140
src/main/java/com/auction/ObjectDetectionService.java
Normal file
@@ -0,0 +1,140 @@
|
||||
package com.auction;
|
||||
|
||||
import org.opencv.core.Mat;
|
||||
import org.opencv.core.Scalar;
|
||||
import org.opencv.core.Size;
|
||||
import org.opencv.dnn.Dnn;
|
||||
import org.opencv.dnn.Net;
|
||||
import org.opencv.imgcodecs.Imgcodecs;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import static org.opencv.dnn.Dnn.DNN_BACKEND_OPENCV;
|
||||
import static org.opencv.dnn.Dnn.DNN_TARGET_CPU;
|
||||
/**
|
||||
* Service for performing object detection on images using OpenCV's DNN
|
||||
* module. The DNN module can load pre‑trained models from several
|
||||
* frameworks (Darknet, TensorFlow, ONNX, etc.). Here
|
||||
* we load a YOLO model (Darknet) by specifying the configuration and
|
||||
* weights files. For each image we run a forward pass and return a
|
||||
* list of detected class labels.
|
||||
*
|
||||
* If model files are not found, the service operates in disabled mode
|
||||
* and returns empty lists.
|
||||
*/
|
||||
class ObjectDetectionService {
|
||||
|
||||
private final Net net;
|
||||
private final List<String> classNames;
|
||||
private final boolean enabled;
|
||||
|
||||
ObjectDetectionService(String cfgPath, String weightsPath, String classNamesPath) throws IOException {
|
||||
// Check if model files exist
|
||||
var cfgFile = Paths.get(cfgPath);
|
||||
var weightsFile = Paths.get(weightsPath);
|
||||
var classNamesFile = Paths.get(classNamesPath);
|
||||
|
||||
if (!Files.exists(cfgFile) || !Files.exists(weightsFile) || !Files.exists(classNamesFile)) {
|
||||
IO.println("⚠️ Object detection disabled: YOLO model files not found");
|
||||
IO.println(" Expected files:");
|
||||
IO.println(" - " + cfgPath);
|
||||
IO.println(" - " + weightsPath);
|
||||
IO.println(" - " + classNamesPath);
|
||||
IO.println(" Scraper will continue without image analysis.");
|
||||
this.enabled = false;
|
||||
this.net = null;
|
||||
this.classNames = new ArrayList<>();
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
// Load network
|
||||
this.net = Dnn.readNetFromDarknet(cfgPath, weightsPath);
|
||||
this.net.setPreferableBackend(DNN_BACKEND_OPENCV);
|
||||
this.net.setPreferableTarget(DNN_TARGET_CPU);
|
||||
// Load class names (one per line)
|
||||
this.classNames = Files.readAllLines(classNamesFile);
|
||||
this.enabled = true;
|
||||
IO.println("✓ Object detection enabled with YOLO");
|
||||
} catch (Exception e) {
|
||||
System.err.println("⚠️ Object detection disabled: " + e.getMessage());
|
||||
throw new IOException("Failed to initialize object detection", e);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Detects objects in the given image file and returns a list of
|
||||
* human‑readable labels. Only detections above a confidence
|
||||
* threshold are returned. For brevity this method omits drawing
|
||||
* bounding boxes. See the OpenCV DNN documentation for details on
|
||||
* post‑processing【784097309529506†L324-L344】.
|
||||
*
|
||||
* @param imagePath absolute path to the image
|
||||
* @return list of detected class names (empty if detection disabled)
|
||||
*/
|
||||
List<String> detectObjects(String imagePath) {
|
||||
if (!enabled) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
List<String> labels = new ArrayList<>();
|
||||
var image = Imgcodecs.imread(imagePath);
|
||||
if (image.empty()) return labels;
|
||||
// Create a 4D blob from the image
|
||||
var blob = Dnn.blobFromImage(image, 1.0 / 255.0, new Size(416, 416), new Scalar(0, 0, 0), true, false);
|
||||
net.setInput(blob);
|
||||
List<Mat> outs = new ArrayList<>();
|
||||
var outNames = getOutputLayerNames(net);
|
||||
net.forward(outs, outNames);
|
||||
// Post‑process: for each detection compute score and choose class
|
||||
var confThreshold = 0.5f;
|
||||
for (var out : outs) {
|
||||
for (var i = 0; i < out.rows(); i++) {
|
||||
var data = out.get(i, 0);
|
||||
if (data == null) continue;
|
||||
// The first 5 numbers are bounding box, then class scores
|
||||
var scores = new double[classNames.size()];
|
||||
System.arraycopy(data, 5, scores, 0, scores.length);
|
||||
var classId = argMax(scores);
|
||||
var confidence = scores[classId];
|
||||
if (confidence > confThreshold) {
|
||||
var label = classNames.get(classId);
|
||||
if (!labels.contains(label)) {
|
||||
labels.add(label);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return labels;
|
||||
}
|
||||
/**
|
||||
* Returns the indexes of the output layers in the network. YOLO
|
||||
* automatically discovers its output layers; other models may require
|
||||
* manually specifying them【784097309529506†L356-L365】.
|
||||
*/
|
||||
private List<String> getOutputLayerNames(Net net) {
|
||||
List<String> names = new ArrayList<>();
|
||||
var outLayers = net.getUnconnectedOutLayers().toList();
|
||||
var layersNames = net.getLayerNames();
|
||||
for (var i : outLayers) {
|
||||
names.add(layersNames.get(i - 1));
|
||||
}
|
||||
return names;
|
||||
}
|
||||
/**
|
||||
* Returns the index of the maximum value in the array.
|
||||
*/
|
||||
private int argMax(double[] array) {
|
||||
var best = 0;
|
||||
var max = array[0];
|
||||
for (var i = 1; i < array.length; i++) {
|
||||
if (array[i] > max) {
|
||||
max = array[i];
|
||||
best = i;
|
||||
}
|
||||
}
|
||||
return best;
|
||||
}
|
||||
}
|
||||
@@ -1,645 +0,0 @@
|
||||
package com.auction;
|
||||
|
||||
import com.microsoft.playwright.*;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.microsoft.playwright.options.WaitUntilState;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.*;
|
||||
import java.sql.*;
|
||||
import java.time.Instant;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* TroostwijkAuctionExtractor
|
||||
*
|
||||
* Extracts auction listings from https://www.troostwijkauctions.com/auctions
|
||||
* using Playwright for Java (headless browser automation).
|
||||
*
|
||||
* Features:
|
||||
* - Uses Playwright for Java to load JavaScript-rendered content
|
||||
* - Iterates through all pages of auction listings
|
||||
* - Rate limiting: 200ms between each page request
|
||||
* - Caches visited pages in SQLite database with expiration times
|
||||
* - Extracts auction metadata: ID, title, location, URL
|
||||
*
|
||||
* Dependencies (Maven):
|
||||
* <dependency>
|
||||
* <groupId>com.microsoft.playwright</groupId>
|
||||
* <artifactId>playwright</artifactId>
|
||||
* <version>1.40.0</version>
|
||||
* </dependency>
|
||||
* <dependency>
|
||||
* <groupId>com.fasterxml.jackson.core</groupId>
|
||||
* <artifactId>jackson-databind</artifactId>
|
||||
* <version>2.17.0</version>
|
||||
* </dependency>
|
||||
* <dependency>
|
||||
* <groupId>org.xerial</groupId>
|
||||
* <artifactId>sqlite-jdbc</artifactId>
|
||||
* <version>3.45.1.0</version>
|
||||
* </dependency>
|
||||
*
|
||||
* After adding dependency, run: mvn exec:java -e -D exec.mainClass=com.microsoft.playwright.CLI -D exec.args="install"
|
||||
* This downloads the browser binaries needed by Playwright.
|
||||
*/
|
||||
public class TroostwijkAuctionExtractor {
|
||||
|
||||
private static final String AUCTIONS_BASE_URL = "https://www.troostwijkauctions.com/auctions";
|
||||
private static final int RATE_LIMIT_MS = 200;
|
||||
private static final String CACHE_DB_PATH = "cache/page_cache.db";
|
||||
private static final long CACHE_EXPIRATION_HOURS = 24; // Cache expires after 24 hours
|
||||
|
||||
private final ObjectMapper objectMapper;
|
||||
private final boolean useCache;
|
||||
private final CacheDatabase cacheDb;
|
||||
private final int maxPageVisits; // Maximum number of pages to fetch (0 = unlimited)
|
||||
private int pageVisitCount; // Counter for actual page fetches (not from cache)
|
||||
private Playwright playwright;
|
||||
private Browser browser;
|
||||
|
||||
/**
|
||||
* Represents an auction listing
|
||||
*/
|
||||
public static class Auction {
|
||||
public int id;
|
||||
public String title;
|
||||
public String location;
|
||||
public String url;
|
||||
public String type; // e.g. "A1" or "A7"
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("Auction{id=%d, type=%s, title='%s', location='%s', url='%s'}",
|
||||
id, type, title, location, url);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Creates an extractor and, when caching is requested, opens and
 * initializes the page cache database.
 *
 * @param useCache      whether visited pages are cached in the database
 * @param maxPageVisits upper bound on real page fetches (0 = unlimited)
 * @throws SQLException if the cache database cannot be initialized
 * @throws IOException  if the cache database cannot be initialized
 */
public TroostwijkAuctionExtractor(boolean useCache, int maxPageVisits) throws SQLException, IOException {
    this.objectMapper = new ObjectMapper();
    this.useCache = useCache;
    this.maxPageVisits = maxPageVisits;
    this.pageVisitCount = 0;

    if (useCache) {
        CacheDatabase database = new CacheDatabase(CACHE_DB_PATH);
        this.cacheDb = database;
        database.initialize();
    } else {
        this.cacheDb = null;
    }
}
|
||||
|
||||
/**
 * Convenience constructor that places no limit on the number of page fetches.
 *
 * @param useCache whether visited pages are cached in the database
 * @throws SQLException if the cache database cannot be initialized
 * @throws IOException  if the cache database cannot be initialized
 */
public TroostwijkAuctionExtractor(boolean useCache) throws SQLException, IOException {
    // A limit of 0 is interpreted as "unlimited" by the two-argument constructor.
    this(useCache, 0);
}
|
||||
|
||||
/**
|
||||
* Initializes Playwright and browser instance
|
||||
* Call this before extracting auctions
|
||||
*/
|
||||
public void initialize() {
|
||||
System.out.println("Initializing Playwright browser...");
|
||||
this.playwright = Playwright.create();
|
||||
this.browser = playwright.chromium().launch(new BrowserType.LaunchOptions()
|
||||
.setHeadless(true)
|
||||
.setArgs(Arrays.asList("--no-sandbox", "--disable-setuid-sandbox")));
|
||||
System.out.println("✓ Browser ready");
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes browser and Playwright instance
|
||||
* Call this when done extracting
|
||||
*/
|
||||
public void close() {
|
||||
if (browser != null) {
|
||||
browser.close();
|
||||
}
|
||||
if (playwright != null) {
|
||||
playwright.close();
|
||||
}
|
||||
if (cacheDb != null) {
|
||||
cacheDb.close();
|
||||
}
|
||||
System.out.println("✓ Browser and cache closed");
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts all auctions from all pages
|
||||
*
|
||||
* @return List of all discovered auctions
|
||||
*/
|
||||
public List<Auction> extractAllAuctions() throws InterruptedException {
|
||||
if (browser == null) {
|
||||
throw new IllegalStateException("Browser not initialized. Call initialize() first.");
|
||||
}
|
||||
|
||||
List<Auction> allAuctions = new ArrayList<>();
|
||||
int pageNumber = 1;
|
||||
boolean hasMorePages = true;
|
||||
|
||||
System.out.println("Starting auction extraction from " + AUCTIONS_BASE_URL);
|
||||
|
||||
while (hasMorePages) {
|
||||
System.out.println("\n[Page " + pageNumber + "] Fetching auctions...");
|
||||
|
||||
// Check cache first
|
||||
String cachedHtml = loadFromCache(pageNumber);
|
||||
String html;
|
||||
|
||||
if (cachedHtml != null) {
|
||||
System.out.println(" ✓ Loaded from cache");
|
||||
html = cachedHtml;
|
||||
} else {
|
||||
// Check if we've reached the maximum page visit limit
|
||||
if (maxPageVisits > 0 && pageVisitCount >= maxPageVisits) {
|
||||
System.out.println(" ⚠️ Reached maximum page visit limit (" + maxPageVisits + "), stopping");
|
||||
break;
|
||||
}
|
||||
|
||||
// Fetch with Playwright
|
||||
html = fetchPageWithPlaywright(pageNumber);
|
||||
pageVisitCount++; // Increment actual page fetch counter
|
||||
|
||||
if (html == null || html.isEmpty()) {
|
||||
System.out.println(" ⚠️ Failed to fetch page, stopping pagination");
|
||||
break;
|
||||
}
|
||||
|
||||
System.out.println(" ✓ Fetched from website (visit " + pageVisitCount +
|
||||
(maxPageVisits > 0 ? "/" + maxPageVisits : "") + ")");
|
||||
|
||||
// Save to cache
|
||||
if (useCache) {
|
||||
saveToCache(pageNumber, html);
|
||||
}
|
||||
|
||||
// Rate limiting - wait 200ms before next request
|
||||
Thread.sleep(RATE_LIMIT_MS);
|
||||
}
|
||||
|
||||
// Parse auctions from HTML
|
||||
List<Auction> pageAuctions = parseAuctionsFromHtml(html);
|
||||
|
||||
if (pageAuctions.isEmpty()) {
|
||||
System.out.println(" ⚠️ No auctions found on page, stopping pagination");
|
||||
hasMorePages = false;
|
||||
} else {
|
||||
System.out.println(" ✓ Found " + pageAuctions.size() + " auctions");
|
||||
allAuctions.addAll(pageAuctions);
|
||||
pageNumber++;
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("\n✓ Total auctions extracted: " + allAuctions.size());
|
||||
return allAuctions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches a single page using Playwright
|
||||
*
|
||||
* @param pageNumber Page number (1-indexed)
|
||||
* @return HTML content of the page
|
||||
*/
|
||||
private String fetchPageWithPlaywright(int pageNumber) {
|
||||
String url = pageNumber == 1
|
||||
? AUCTIONS_BASE_URL
|
||||
: AUCTIONS_BASE_URL + "?page=" + pageNumber;
|
||||
|
||||
try {
|
||||
Page page = browser.newPage();
|
||||
|
||||
// Set user agent
|
||||
page.setExtraHTTPHeaders(Map.of(
|
||||
"User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||
));
|
||||
|
||||
// Navigate to page
|
||||
page.navigate(url, new Page.NavigateOptions()
|
||||
.setTimeout(30000)
|
||||
.setWaitUntil(WaitUntilState.NETWORKIDLE));
|
||||
|
||||
// Wait for auction listings to appear
|
||||
try {
|
||||
page.waitForSelector("a[href^='/a/']", new Page.WaitForSelectorOptions()
|
||||
.setTimeout(10000));
|
||||
} catch (Exception e) {
|
||||
// Continue even if selector not found
|
||||
System.out.println(" ⚠️ Auction selector not found, attempting to parse anyway");
|
||||
}
|
||||
|
||||
// Get HTML content
|
||||
String html = page.content();
|
||||
page.close();
|
||||
|
||||
return html;
|
||||
|
||||
} catch (Exception e) {
|
||||
System.err.println(" ⚠️ Playwright error: " + e.getMessage());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses auction data from HTML content
|
||||
*
|
||||
* @param html HTML content
|
||||
* @return List of parsed auctions
|
||||
*/
|
||||
private List<Auction> parseAuctionsFromHtml(String html) {
|
||||
List<Auction> auctions = new ArrayList<>();
|
||||
|
||||
// Simple regex-based parsing for auction links
|
||||
// Format: <a href="/a/title-A1-12345" or "/a/title-A7-12345"
|
||||
java.util.regex.Pattern linkPattern = java.util.regex.Pattern.compile(
|
||||
"href=\"(/a/[^\"]+A[17]-(\\d+)[^\"]*)\"");
|
||||
java.util.regex.Matcher linkMatcher = linkPattern.matcher(html);
|
||||
|
||||
Set<Integer> seenIds = new HashSet<>();
|
||||
|
||||
while (linkMatcher.find()) {
|
||||
String href = linkMatcher.group(1);
|
||||
int auctionId = Integer.parseInt(linkMatcher.group(2));
|
||||
|
||||
// Avoid duplicates
|
||||
if (seenIds.contains(auctionId)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Extract auction type (A1 or A7)
|
||||
String type = href.contains("A1-") ? "A1" : "A7";
|
||||
|
||||
// Try to find location and title near this link
|
||||
String location = extractLocationNearLink(html, href);
|
||||
String title = extractTitleFromHref(href);
|
||||
|
||||
Auction auction = new Auction();
|
||||
auction.id = auctionId;
|
||||
auction.type = type;
|
||||
auction.title = title;
|
||||
auction.location = location;
|
||||
auction.url = "https://www.troostwijkauctions.com" + href;
|
||||
|
||||
auctions.add(auction);
|
||||
seenIds.add(auctionId);
|
||||
}
|
||||
|
||||
return auctions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts location text near an auction link
|
||||
* Looks for ", NL" or other country codes
|
||||
*/
|
||||
private String extractLocationNearLink(String html, String href) {
|
||||
int hrefPos = html.indexOf(href);
|
||||
if (hrefPos == -1) return "Unknown";
|
||||
|
||||
// Look at 1000 characters before AND after the href for location info
|
||||
int startPos = Math.max(hrefPos - 500, 0);
|
||||
int endPos = Math.min(hrefPos + 1000, html.length());
|
||||
String context = html.substring(startPos, endPos);
|
||||
|
||||
// Pattern 1: Classic format "City, NL"
|
||||
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
|
||||
"([A-Za-z][A-Za-z\\s\\-']+),\\s*([A-Z]{2})(?![A-Za-z])");
|
||||
java.util.regex.Matcher locMatcher = locPattern.matcher(context);
|
||||
|
||||
if (locMatcher.find()) {
|
||||
String location = locMatcher.group(1).trim() + ", " + locMatcher.group(2);
|
||||
System.out.println(" Found location: " + location + " for auction " + href);
|
||||
return location;
|
||||
}
|
||||
|
||||
// Pattern 2: HTML format like "<span>City,<!-- --> </span>NL"
|
||||
// Extract city and country code separately
|
||||
java.util.regex.Pattern htmlPattern = java.util.regex.Pattern.compile(
|
||||
"<span[^>]*>([A-Za-z][A-Za-z\\s\\-',]+?)(?:,)?\\s*(?:<!--.*?-->)?\\s*</span>\\s*([A-Z]{2})(?![A-Za-z])");
|
||||
java.util.regex.Matcher htmlMatcher = htmlPattern.matcher(context);
|
||||
|
||||
if (htmlMatcher.find()) {
|
||||
String city = htmlMatcher.group(1).trim().replaceAll(",$", ""); // Remove trailing comma
|
||||
String country = htmlMatcher.group(2);
|
||||
String location = city + ", " + country;
|
||||
System.out.println(" Found location (HTML): " + location + " for auction " + href);
|
||||
return location;
|
||||
}
|
||||
|
||||
// Pattern 3: Fallback - just find country code after HTML tags
|
||||
java.util.regex.Pattern countryPattern = java.util.regex.Pattern.compile(
|
||||
"(?:-->|</span>|</div>)\\s*([A-Z]{2})(?![A-Za-z])");
|
||||
java.util.regex.Matcher countryMatcher = countryPattern.matcher(context);
|
||||
|
||||
if (countryMatcher.find()) {
|
||||
String country = countryMatcher.group(1);
|
||||
System.out.println(" Found country code: " + country + " for auction " + href);
|
||||
return "Unknown, " + country;
|
||||
}
|
||||
|
||||
System.out.println(" ⚠️ No location found for auction " + href);
|
||||
return "Unknown";
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts human-readable title from URL slug
|
||||
* Converts "some-auction-title-A1-12345" to "Some Auction Title"
|
||||
*/
|
||||
private String extractTitleFromHref(String href) {
|
||||
// Extract everything between "/a/" and "-A1-" or "-A7-"
|
||||
java.util.regex.Pattern titlePattern = java.util.regex.Pattern.compile(
|
||||
"/a/(.+?)-A[17]-");
|
||||
java.util.regex.Matcher titleMatcher = titlePattern.matcher(href);
|
||||
|
||||
if (titleMatcher.find()) {
|
||||
String slug = titleMatcher.group(1);
|
||||
// Convert kebab-case to Title Case
|
||||
String[] words = slug.split("-");
|
||||
StringBuilder title = new StringBuilder();
|
||||
for (String word : words) {
|
||||
if (!word.isEmpty()) {
|
||||
title.append(Character.toUpperCase(word.charAt(0)))
|
||||
.append(word.substring(1))
|
||||
.append(" ");
|
||||
}
|
||||
}
|
||||
return title.toString().trim();
|
||||
}
|
||||
|
||||
return "Untitled Auction";
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads cached HTML for a page from SQLite database
|
||||
* Returns null if not cached or cache has expired
|
||||
*
|
||||
* @param pageNumber Page number
|
||||
* @return Cached HTML or null if not found/expired
|
||||
*/
|
||||
private String loadFromCache(int pageNumber) {
|
||||
if (!useCache || cacheDb == null) return null;
|
||||
|
||||
String url = pageNumber == 1
|
||||
? AUCTIONS_BASE_URL
|
||||
: AUCTIONS_BASE_URL + "?page=" + pageNumber;
|
||||
|
||||
return cacheDb.get(url);
|
||||
}
|
||||
|
||||
/**
|
||||
* Saves HTML to SQLite cache database with expiration time
|
||||
*
|
||||
* @param pageNumber Page number
|
||||
* @param html HTML content
|
||||
*/
|
||||
private void saveToCache(int pageNumber, String html) {
|
||||
if (!useCache || cacheDb == null) return;
|
||||
|
||||
String url = pageNumber == 1
|
||||
? AUCTIONS_BASE_URL
|
||||
: AUCTIONS_BASE_URL + "?page=" + pageNumber;
|
||||
|
||||
cacheDb.put(url, html, CACHE_EXPIRATION_HOURS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Filters auctions by location
|
||||
*
|
||||
* @param auctions List of auctions
|
||||
* @param locationFilter Location string to match (e.g., "NL")
|
||||
* @return Filtered list
|
||||
*/
|
||||
public static List<Auction> filterByLocation(List<Auction> auctions, String locationFilter) {
|
||||
return auctions.stream()
|
||||
.filter(a -> a.location.contains(locationFilter))
|
||||
.toList();
|
||||
}
|
||||
|
||||
/**
|
||||
* Entry point for testing
|
||||
*
|
||||
* Arguments:
|
||||
* --max-visits <number> : Maximum number of page visits (0 = unlimited, default)
|
||||
* --no-cache : Disable caching
|
||||
*/
|
||||
public static void main(String[] args) throws Exception {
|
||||
System.out.println("=== Troostwijk Auction Extractor ===\n");
|
||||
|
||||
// Parse command line arguments
|
||||
boolean useCache = true;
|
||||
int maxVisits = 0; // 0 = unlimited
|
||||
|
||||
for (int i = 0; i < args.length; i++) {
|
||||
switch (args[i]) {
|
||||
case "--max-visits":
|
||||
if (i + 1 < args.length) {
|
||||
maxVisits = Integer.parseInt(args[++i]);
|
||||
System.out.println("Max page visits set to: " + maxVisits);
|
||||
}
|
||||
break;
|
||||
case "--no-cache":
|
||||
useCache = false;
|
||||
System.out.println("Caching disabled");
|
||||
break;
|
||||
case "--help":
|
||||
System.out.println("Usage: java TroostwijkAuctionExtractor [options]");
|
||||
System.out.println("Options:");
|
||||
System.out.println(" --max-visits <n> : Limit actual page fetches to n (0 = unlimited)");
|
||||
System.out.println(" --no-cache : Disable page caching");
|
||||
System.out.println(" --help : Show this help message");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
TroostwijkAuctionExtractor extractor = new TroostwijkAuctionExtractor(useCache, maxVisits);
|
||||
|
||||
try {
|
||||
// Initialize browser
|
||||
extractor.initialize();
|
||||
|
||||
// Extract all auctions
|
||||
List<Auction> allAuctions = extractor.extractAllAuctions();
|
||||
|
||||
// Filter for Dutch auctions only
|
||||
List<Auction> dutchAuctions = filterByLocation(allAuctions, "NL");
|
||||
|
||||
System.out.println("\n=== Results ===");
|
||||
System.out.println("Total auctions found: " + allAuctions.size());
|
||||
System.out.println("Dutch auctions (NL): " + dutchAuctions.size());
|
||||
System.out.println("Actual page visits: " + extractor.pageVisitCount);
|
||||
|
||||
// Display first 10 Dutch auctions
|
||||
System.out.println("\n=== Sample Dutch Auctions ===");
|
||||
dutchAuctions.stream()
|
||||
.limit(10)
|
||||
.forEach(System.out::println);
|
||||
|
||||
} finally {
|
||||
// Always close browser
|
||||
extractor.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* SQLite-based caching system for HTML pages with expiration support
|
||||
*/
|
||||
static class CacheDatabase {
|
||||
private final String dbPath;
|
||||
private Connection connection;
|
||||
|
||||
public CacheDatabase(String dbPath) {
|
||||
this.dbPath = dbPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize database and create schema
|
||||
*/
|
||||
public void initialize() throws SQLException, IOException {
|
||||
// Create cache directory if it doesn't exist
|
||||
Path cacheDir = Paths.get(dbPath).getParent();
|
||||
if (cacheDir != null) {
|
||||
Files.createDirectories(cacheDir);
|
||||
}
|
||||
|
||||
connection = DriverManager.getConnection("jdbc:sqlite:" + dbPath);
|
||||
|
||||
// Create cache table with URL as primary key
|
||||
String createTable = """
|
||||
CREATE TABLE IF NOT EXISTS page_cache (
|
||||
url TEXT PRIMARY KEY,
|
||||
html TEXT NOT NULL,
|
||||
cached_at INTEGER NOT NULL,
|
||||
expires_at INTEGER NOT NULL
|
||||
)
|
||||
""";
|
||||
|
||||
try (Statement stmt = connection.createStatement()) {
|
||||
stmt.execute(createTable);
|
||||
// Create index on expires_at for efficient cleanup
|
||||
stmt.execute("CREATE INDEX IF NOT EXISTS idx_expires_at ON page_cache(expires_at)");
|
||||
}
|
||||
|
||||
// Clean up expired entries on initialization
|
||||
cleanupExpired();
|
||||
|
||||
System.out.println("✓ Cache database initialized");
|
||||
}
|
||||
|
||||
/**
|
||||
* Get cached HTML for a URL if it exists and hasn't expired
|
||||
*
|
||||
* @param url The URL to look up
|
||||
* @return Cached HTML or null if not found/expired
|
||||
*/
|
||||
public synchronized String get(String url) {
|
||||
String sql = "SELECT html FROM page_cache WHERE url = ? AND expires_at > ?";
|
||||
|
||||
try (PreparedStatement ps = connection.prepareStatement(sql)) {
|
||||
ps.setString(1, url);
|
||||
ps.setLong(2, Instant.now().getEpochSecond());
|
||||
|
||||
ResultSet rs = ps.executeQuery();
|
||||
if (rs.next()) {
|
||||
return rs.getString("html");
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
System.err.println("Cache read error: " + e.getMessage());
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Store HTML in cache with expiration time
|
||||
*
|
||||
* @param url The URL to cache
|
||||
* @param html The HTML content
|
||||
* @param expirationHours Hours until cache expires
|
||||
*/
|
||||
public synchronized void put(String url, String html, long expirationHours) {
|
||||
String sql = """
|
||||
INSERT OR REPLACE INTO page_cache (url, html, cached_at, expires_at)
|
||||
VALUES (?, ?, ?, ?)
|
||||
""";
|
||||
|
||||
long now = Instant.now().getEpochSecond();
|
||||
long expiresAt = now + (expirationHours * 3600);
|
||||
|
||||
try (PreparedStatement ps = connection.prepareStatement(sql)) {
|
||||
ps.setString(1, url);
|
||||
ps.setString(2, html);
|
||||
ps.setLong(3, now);
|
||||
ps.setLong(4, expiresAt);
|
||||
ps.executeUpdate();
|
||||
} catch (SQLException e) {
|
||||
System.err.println("Cache write error: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove expired cache entries
|
||||
*/
|
||||
public synchronized void cleanupExpired() {
|
||||
String sql = "DELETE FROM page_cache WHERE expires_at <= ?";
|
||||
|
||||
try (PreparedStatement ps = connection.prepareStatement(sql)) {
|
||||
ps.setLong(1, Instant.now().getEpochSecond());
|
||||
int deleted = ps.executeUpdate();
|
||||
if (deleted > 0) {
|
||||
System.out.println("✓ Cleaned up " + deleted + " expired cache entries");
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
System.err.println("Cache cleanup error: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get cache statistics
|
||||
*/
|
||||
public synchronized void printStats() {
|
||||
String sql = "SELECT COUNT(*) as total, " +
|
||||
"SUM(CASE WHEN expires_at > ? THEN 1 ELSE 0 END) as valid, " +
|
||||
"SUM(LENGTH(html)) as total_size " +
|
||||
"FROM page_cache";
|
||||
|
||||
try (PreparedStatement ps = connection.prepareStatement(sql)) {
|
||||
ps.setLong(1, Instant.now().getEpochSecond());
|
||||
ResultSet rs = ps.executeQuery();
|
||||
|
||||
if (rs.next()) {
|
||||
int total = rs.getInt("total");
|
||||
int valid = rs.getInt("valid");
|
||||
long size = rs.getLong("total_size");
|
||||
|
||||
System.out.println("\n=== Cache Statistics ===");
|
||||
System.out.println("Total entries: " + total);
|
||||
System.out.println("Valid entries: " + valid);
|
||||
System.out.println("Expired entries: " + (total - valid));
|
||||
System.out.println("Total size: " + (size / 1024) + " KB");
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
System.err.println("Cache stats error: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Close database connection
|
||||
*/
|
||||
public void close() {
|
||||
if (connection != null) {
|
||||
try {
|
||||
connection.close();
|
||||
} catch (SQLException e) {
|
||||
System.err.println("Error closing cache database: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user