This commit is contained in:
Tour
2025-12-03 15:09:39 +01:00
parent 7fa3e4a545
commit 853c3cf53e
16 changed files with 1405 additions and 2000 deletions

View File

@@ -14,8 +14,8 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>21</maven.compiler.source>
<maven.compiler.target>21</maven.compiler.target>
<maven.compiler.source>25</maven.compiler.source>
<maven.compiler.target>25</maven.compiler.target>
<jackson.version>2.17.0</jackson.version>
<opencv.version>4.9.0-0</opencv.version>
</properties>
@@ -93,8 +93,8 @@
<artifactId>maven-compiler-plugin</artifactId>
<version>3.11.0</version>
<configuration>
<source>21</source>
<target>21</target>
<source>25</source>
<target>25</target>
</configuration>
</plugin>

View File

@@ -0,0 +1,24 @@
package com.auction;
import java.time.LocalDateTime;
/**
* Represents auction metadata (veiling informatie)
*/
public final class AuctionInfo {
public int auctionId; // Unique auction ID (from URL)
public String title; // Auction title
public String location; // Location (e.g., "Amsterdam, NL")
public String city; // City name
public String country; // Country code (e.g., "NL")
public String url; // Full auction URL
public String type; // Auction type (A1 or A7)
public int lotCount; // Number of lots/kavels
public LocalDateTime closingTime; // Closing time if available
@Override
public String toString() {
return String.format("Auction{id=%d, type=%s, title='%s', location='%s', lots=%d, url='%s'}",
auctionId, type, title, location, lotCount, url);
}
}

View File

@@ -0,0 +1,165 @@
package com.auction;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.time.Instant;
/**
 * SQLite-based cache for HTML pages with per-entry expiration.
 *
 * <p>Entries live in a single {@code page_cache} table keyed by URL. All
 * methods that touch the connection are synchronized on this instance.
 * Read, write, cleanup and statistics calls are best-effort: failures are
 * reported on {@code System.err} and never thrown, and calling them before
 * {@link #initialize()} or after {@link #close()} is reported the same way.
 */
class CacheDatabase implements AutoCloseable {
    private final String dbPath;
    private Connection connection;

    /**
     * @param dbPath file-system path of the SQLite database file
     */
    public CacheDatabase(String dbPath) {
        this.dbPath = dbPath;
    }

    /**
     * Opens the database, creates the schema if needed and purges expired
     * entries. A connection left over from an earlier call is closed first.
     *
     * @throws SQLException if the database cannot be opened or the schema cannot be created
     * @throws IOException  if the parent directory of the database file cannot be created
     */
    public synchronized void initialize() throws SQLException, IOException {
        // Re-initialising must not leak the previous connection.
        close();
        // Create cache directory if it doesn't exist
        var cacheDir = Paths.get(dbPath).getParent();
        if (cacheDir != null) {
            Files.createDirectories(cacheDir);
        }
        var opened = DriverManager.getConnection("jdbc:sqlite:" + dbPath);
        // Create cache table with URL as primary key
        var createTable = "CREATE TABLE IF NOT EXISTS page_cache (\n" +
                " url TEXT PRIMARY KEY,\n" +
                " html TEXT NOT NULL,\n" +
                " cached_at INTEGER NOT NULL,\n" +
                " expires_at INTEGER NOT NULL\n" +
                ")\n";
        try (var stmt = opened.createStatement()) {
            stmt.execute(createTable);
            // Create index on expires_at for efficient cleanup
            stmt.execute("CREATE INDEX IF NOT EXISTS idx_expires_at ON page_cache(expires_at)");
        } catch (SQLException e) {
            // Do not keep a half-initialised connection around.
            try {
                opened.close();
            } catch (SQLException closeError) {
                e.addSuppressed(closeError);
            }
            throw e;
        }
        connection = opened;
        // Clean up expired entries on initialization
        cleanupExpired();
        System.out.println("✓ Cache database initialized");
    }

    /**
     * Get cached HTML for a URL if it exists and hasn't expired.
     *
     * @param url The URL to look up
     * @return Cached HTML, or null if not found, expired, or the lookup failed
     */
    public synchronized String get(String url) {
        if (!isReady("read")) {
            return null;
        }
        var sql = "SELECT html FROM page_cache WHERE url = ? AND expires_at > ?";
        try (var ps = connection.prepareStatement(sql)) {
            ps.setString(1, url);
            ps.setLong(2, Instant.now().getEpochSecond());
            try (var rs = ps.executeQuery()) {
                return rs.next() ? rs.getString("html") : null;
            }
        } catch (SQLException e) {
            System.err.println("Cache read error: " + e.getMessage());
            return null;
        }
    }

    /**
     * Store HTML in cache with expiration time, replacing any existing
     * entry for the same URL.
     *
     * @param url             The URL to cache
     * @param html            The HTML content
     * @param expirationHours Hours until cache expires
     */
    public synchronized void put(String url, String html, long expirationHours) {
        if (!isReady("write")) {
            return;
        }
        var sql = "INSERT OR REPLACE INTO page_cache (url, html, cached_at, expires_at)\n" +
                "VALUES (?, ?, ?, ?)\n";
        var now = Instant.now().getEpochSecond();
        var expiresAt = now + (expirationHours * 3600);
        try (var ps = connection.prepareStatement(sql)) {
            ps.setString(1, url);
            ps.setString(2, html);
            ps.setLong(3, now);
            ps.setLong(4, expiresAt);
            ps.executeUpdate();
        } catch (SQLException e) {
            System.err.println("Cache write error: " + e.getMessage());
        }
    }

    /**
     * Remove expired cache entries (those whose expiry is at or before now).
     */
    public synchronized void cleanupExpired() {
        if (!isReady("cleanup")) {
            return;
        }
        var sql = "DELETE FROM page_cache WHERE expires_at <= ?";
        try (var ps = connection.prepareStatement(sql)) {
            ps.setLong(1, Instant.now().getEpochSecond());
            var deleted = ps.executeUpdate();
            if (deleted > 0) {
                System.out.println("✓ Cleaned up " + deleted + " expired cache entries");
            }
        } catch (SQLException e) {
            System.err.println("Cache cleanup error: " + e.getMessage());
        }
    }

    /**
     * Print cache statistics (total, valid and expired entry counts plus
     * total HTML size in KB) to standard output.
     */
    public synchronized void printStats() {
        if (!isReady("stats")) {
            return;
        }
        var sql = "SELECT COUNT(*) as total, " +
                "SUM(CASE WHEN expires_at > ? THEN 1 ELSE 0 END) as valid, " +
                "SUM(LENGTH(html)) as total_size " +
                "FROM page_cache";
        try (var ps = connection.prepareStatement(sql)) {
            ps.setLong(1, Instant.now().getEpochSecond());
            try (var rs = ps.executeQuery()) {
                if (rs.next()) {
                    var total = rs.getInt("total");
                    var valid = rs.getInt("valid");
                    var size = rs.getLong("total_size");
                    System.out.println("\n=== Cache Statistics ===");
                    System.out.println("Total entries: " + total);
                    System.out.println("Valid entries: " + valid);
                    System.out.println("Expired entries: " + (total - valid));
                    System.out.println("Total size: " + (size / 1024) + " KB");
                }
            }
        } catch (SQLException e) {
            System.err.println("Cache stats error: " + e.getMessage());
        }
    }

    /**
     * Close database connection. Safe to call repeatedly or before
     * {@link #initialize()}; close failures are reported, not thrown.
     */
    @Override
    public synchronized void close() {
        if (connection != null) {
            try {
                connection.close();
            } catch (SQLException e) {
                System.err.println("Error closing cache database: " + e.getMessage());
            } finally {
                // Forget the connection so later calls report "not initialized"
                // instead of operating on a closed handle.
                connection = null;
            }
        }
    }

    /**
     * Reports whether a connection is available; when it is not, logs an
     * error in the same style as the SQL failure messages.
     *
     * @param operation short operation name used in the error message
     * @return true if the database has been initialized and not yet closed
     */
    private boolean isReady(String operation) {
        if (connection == null) {
            System.err.println("Cache " + operation + " error: database not initialized");
            return false;
        }
        return true;
    }
}

View File

@@ -0,0 +1,303 @@
package com.auction;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.time.Instant;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.List;
/**
 * Service for persisting auctions, lots and images (including their
 * detected object labels) into a SQLite database through JDBC. The
 * connection URL has the form "jdbc:sqlite:path_to_file".
 *
 * <p>Every operation opens its own connection and closes it before
 * returning. All data-access methods are synchronized on this instance.
 */
public class DatabaseService {
    /** Column list shared by all auction SELECT statements. */
    private static final String AUCTION_COLUMNS =
            "auction_id, title, location, city, country, url, type, lot_count, closing_time";

    private final String url;

    /**
     * @param dbPath file-system path of the SQLite database file
     */
    DatabaseService(String dbPath) {
        this.url = "jdbc:sqlite:" + dbPath;
    }

    /**
     * Creates tables if they do not already exist. The schema includes
     * tables for auctions, the legacy sales table, lots and images, plus
     * two indexes. This method is idempotent; it can be called multiple
     * times.
     *
     * @throws SQLException if the database cannot be opened or a statement fails
     */
    void ensureSchema() throws SQLException {
        try (var conn = DriverManager.getConnection(url); var stmt = conn.createStatement()) {
            // Auctions table (veilingen)
            stmt.execute("CREATE TABLE IF NOT EXISTS auctions ("
                    + "auction_id INTEGER PRIMARY KEY,"
                    + "title TEXT NOT NULL,"
                    + "location TEXT,"
                    + "city TEXT,"
                    + "country TEXT,"
                    + "url TEXT NOT NULL,"
                    + "type TEXT,"
                    + "lot_count INTEGER DEFAULT 0,"
                    + "closing_time TEXT,"
                    + "discovered_at INTEGER" // Unix timestamp
                    + ")");
            // Sales table (legacy - keep for compatibility)
            stmt.execute("CREATE TABLE IF NOT EXISTS sales ("
                    + "sale_id INTEGER PRIMARY KEY,"
                    + "title TEXT,"
                    + "location TEXT,"
                    + "closing_time TEXT"
                    + ")");
            // Lots table
            stmt.execute("CREATE TABLE IF NOT EXISTS lots ("
                    + "lot_id INTEGER PRIMARY KEY,"
                    + "sale_id INTEGER,"
                    + "title TEXT,"
                    + "description TEXT,"
                    + "manufacturer TEXT,"
                    + "type TEXT,"
                    + "year INTEGER,"
                    + "category TEXT,"
                    + "current_bid REAL,"
                    + "currency TEXT,"
                    + "url TEXT,"
                    + "closing_time TEXT,"
                    + "closing_notified INTEGER DEFAULT 0,"
                    + "FOREIGN KEY (sale_id) REFERENCES auctions(auction_id)"
                    + ")");
            // Images table
            stmt.execute("CREATE TABLE IF NOT EXISTS images ("
                    + "id INTEGER PRIMARY KEY AUTOINCREMENT,"
                    + "lot_id INTEGER,"
                    + "url TEXT,"
                    + "file_path TEXT,"
                    + "labels TEXT,"
                    + "FOREIGN KEY (lot_id) REFERENCES lots(lot_id)"
                    + ")");
            // Create indexes for better query performance
            stmt.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country)");
            stmt.execute("CREATE INDEX IF NOT EXISTS idx_lots_sale_id ON lots(sale_id)");
        }
    }

    /**
     * Inserts or updates an auction record. On a conflicting auction_id
     * every column except discovered_at is overwritten, so the original
     * discovery timestamp is kept.
     *
     * @param auction the auction to store
     * @throws SQLException if the statement fails
     */
    synchronized void upsertAuction(AuctionInfo auction) throws SQLException {
        var sql = "INSERT INTO auctions (auction_id, title, location, city, country, url, type, lot_count, closing_time, discovered_at)"
                + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
                + " ON CONFLICT(auction_id) DO UPDATE SET "
                + "title = excluded.title, location = excluded.location, city = excluded.city, "
                + "country = excluded.country, url = excluded.url, type = excluded.type, "
                + "lot_count = excluded.lot_count, closing_time = excluded.closing_time";
        try (var conn = DriverManager.getConnection(url); var ps = conn.prepareStatement(sql)) {
            ps.setInt(1, auction.auctionId);
            ps.setString(2, auction.title);
            ps.setString(3, auction.location);
            ps.setString(4, auction.city);
            ps.setString(5, auction.country);
            ps.setString(6, auction.url);
            ps.setString(7, auction.type);
            ps.setInt(8, auction.lotCount);
            ps.setString(9, formatTimestamp(auction.closingTime));
            ps.setLong(10, Instant.now().getEpochSecond());
            ps.executeUpdate();
        }
    }

    /**
     * Retrieves all auctions from the database.
     *
     * @return all stored auctions; empty if there are none
     * @throws SQLException if the query fails
     */
    synchronized List<AuctionInfo> getAllAuctions() throws SQLException {
        List<AuctionInfo> auctions = new ArrayList<>();
        var sql = "SELECT " + AUCTION_COLUMNS + " FROM auctions";
        try (var conn = DriverManager.getConnection(url);
             var stmt = conn.createStatement();
             var rs = stmt.executeQuery(sql)) {
            while (rs.next()) {
                auctions.add(readAuction(rs));
            }
        }
        return auctions;
    }

    /**
     * Retrieves auctions by country code.
     *
     * @param countryCode value matched exactly against the country column
     * @return matching auctions; empty if there are none
     * @throws SQLException if the query fails
     */
    synchronized List<AuctionInfo> getAuctionsByCountry(String countryCode) throws SQLException {
        List<AuctionInfo> auctions = new ArrayList<>();
        var sql = "SELECT " + AUCTION_COLUMNS + " FROM auctions WHERE country = ?";
        try (var conn = DriverManager.getConnection(url); var ps = conn.prepareStatement(sql)) {
            ps.setString(1, countryCode);
            try (var rs = ps.executeQuery()) {
                while (rs.next()) {
                    auctions.add(readAuction(rs));
                }
            }
        }
        return auctions;
    }

    /**
     * Inserts or updates a lot record using an upsert
     * (INSERT ... ON CONFLICT(lot_id) DO UPDATE). On conflict every column
     * except closing_notified is overwritten, so an existing notification
     * flag is not changed by this method.
     *
     * @param lot the lot to store
     * @throws SQLException if the statement fails
     */
    synchronized void upsertLot(Lot lot) throws SQLException {
        var sql = "INSERT INTO lots (lot_id, sale_id, title, description, manufacturer, type, year, category, current_bid, currency, url, closing_time, closing_notified)"
                + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
                + " ON CONFLICT(lot_id) DO UPDATE SET "
                + "sale_id = excluded.sale_id, title = excluded.title, description = excluded.description, "
                + "manufacturer = excluded.manufacturer, type = excluded.type, year = excluded.year, category = excluded.category, "
                + "current_bid = excluded.current_bid, currency = excluded.currency, url = excluded.url, closing_time = excluded.closing_time";
        try (var conn = DriverManager.getConnection(url); var ps = conn.prepareStatement(sql)) {
            ps.setInt(1, lot.lotId);
            ps.setInt(2, lot.saleId);
            ps.setString(3, lot.title);
            ps.setString(4, lot.description);
            ps.setString(5, lot.manufacturer);
            ps.setString(6, lot.type);
            ps.setInt(7, lot.year);
            ps.setString(8, lot.category);
            ps.setDouble(9, lot.currentBid);
            ps.setString(10, lot.currency);
            ps.setString(11, lot.url);
            ps.setString(12, formatTimestamp(lot.closingTime));
            ps.setInt(13, lot.closingNotified ? 1 : 0);
            ps.executeUpdate();
        }
    }

    /**
     * Inserts a new image record. Each image is associated with a lot and
     * stores both the original URL and the local file path. Detected
     * labels are stored as a comma separated string.
     *
     * @param lotId    id of the lot the image belongs to
     * @param url      original image URL
     * @param filePath local file path of the downloaded image
     * @param labels   detected labels; null is treated as "no labels"
     * @throws SQLException if the statement fails
     */
    synchronized void insertImage(int lotId, String url, String filePath, List<String> labels) throws SQLException {
        var sql = "INSERT INTO images (lot_id, url, file_path, labels) VALUES (?, ?, ?, ?)";
        // A missing label list is stored like an empty one instead of failing with an NPE.
        var joinedLabels = labels == null ? "" : String.join(",", labels);
        try (var conn = DriverManager.getConnection(this.url); var ps = conn.prepareStatement(sql)) {
            ps.setInt(1, lotId);
            ps.setString(2, url);
            ps.setString(3, filePath);
            ps.setString(4, joinedLabels);
            ps.executeUpdate();
        }
    }

    /**
     * Retrieves the lots to be monitored, with only the monitoring-related
     * fields populated (ids, current bid, currency, closing time and
     * notification flag).
     *
     * <p>NOTE: the query applies no filter on closing_time, so every row of
     * the lots table is returned, including lots whose closing time is in
     * the past. Callers that only want still-open lots have to check
     * {@code closingTime} themselves.
     *
     * @return one entry per stored lot; empty if there are none
     * @throws SQLException if the query fails
     */
    synchronized List<Lot> getActiveLots() throws SQLException {
        List<Lot> list = new ArrayList<>();
        var sql = "SELECT lot_id, sale_id, current_bid, currency, closing_time, closing_notified FROM lots";
        try (var conn = DriverManager.getConnection(url);
             var stmt = conn.createStatement();
             var rs = stmt.executeQuery(sql)) {
            while (rs.next()) {
                var lot = new Lot();
                lot.lotId = rs.getInt("lot_id");
                lot.saleId = rs.getInt("sale_id");
                lot.currentBid = rs.getDouble("current_bid");
                lot.currency = rs.getString("currency");
                lot.closingNotified = rs.getInt("closing_notified") != 0;
                lot.closingTime = parseTimestamp(rs.getString("closing_time"));
                list.add(lot);
            }
        }
        return list;
    }

    /**
     * Retrieves all lots from the database with a reduced field set
     * (ids, title, current bid and currency).
     *
     * @return one entry per stored lot; empty if there are none
     * @throws SQLException if the query fails
     */
    synchronized List<Lot> getAllLots() throws SQLException {
        List<Lot> list = new ArrayList<>();
        var sql = "SELECT lot_id, sale_id, title, current_bid, currency FROM lots";
        try (var conn = DriverManager.getConnection(url);
             var stmt = conn.createStatement();
             var rs = stmt.executeQuery(sql)) {
            while (rs.next()) {
                var lot = new Lot();
                lot.lotId = rs.getInt("lot_id");
                lot.saleId = rs.getInt("sale_id");
                lot.title = rs.getString("title");
                lot.currentBid = rs.getDouble("current_bid");
                lot.currency = rs.getString("currency");
                list.add(lot);
            }
        }
        return list;
    }

    /**
     * Gets the total number of images in the database.
     *
     * @return number of rows in the images table
     * @throws SQLException if the query fails
     */
    synchronized int getImageCount() throws SQLException {
        var sql = "SELECT COUNT(*) as count FROM images";
        try (var conn = DriverManager.getConnection(url);
             var stmt = conn.createStatement();
             var rs = stmt.executeQuery(sql)) {
            return rs.next() ? rs.getInt("count") : 0;
        }
    }

    /**
     * Updates the current bid of a lot after a bid refresh.
     *
     * @param lot supplies the lot id and the new current bid
     * @throws SQLException if the statement fails
     */
    synchronized void updateLotCurrentBid(Lot lot) throws SQLException {
        try (var conn = DriverManager.getConnection(url); var ps = conn.prepareStatement(
                "UPDATE lots SET current_bid = ? WHERE lot_id = ?")) {
            ps.setDouble(1, lot.currentBid);
            ps.setInt(2, lot.lotId);
            ps.executeUpdate();
        }
    }

    /**
     * Updates the closingNotified flag of a lot (set to 1 when we have
     * warned the user about its imminent closure).
     *
     * @param lot supplies the lot id and the new flag value
     * @throws SQLException if the statement fails
     */
    synchronized void updateLotNotificationFlags(Lot lot) throws SQLException {
        try (var conn = DriverManager.getConnection(url); var ps = conn.prepareStatement(
                "UPDATE lots SET closing_notified = ? WHERE lot_id = ?")) {
            ps.setInt(1, lot.closingNotified ? 1 : 0);
            ps.setInt(2, lot.lotId);
            ps.executeUpdate();
        }
    }

    /** Maps the current row of an auction query to an {@link AuctionInfo}. */
    private static AuctionInfo readAuction(ResultSet rs) throws SQLException {
        var auction = new AuctionInfo();
        auction.auctionId = rs.getInt("auction_id");
        auction.title = rs.getString("title");
        auction.location = rs.getString("location");
        auction.city = rs.getString("city");
        auction.country = rs.getString("country");
        auction.url = rs.getString("url");
        auction.type = rs.getString("type");
        auction.lotCount = rs.getInt("lot_count");
        auction.closingTime = parseTimestamp(rs.getString("closing_time"));
        return auction;
    }

    /** Converts a stored closing_time string back to a timestamp; null stays null. */
    private static LocalDateTime parseTimestamp(String stored) {
        return stored != null ? LocalDateTime.parse(stored) : null;
    }

    /** Converts a timestamp to the string stored in closing_time; null stays null. */
    private static String formatTimestamp(LocalDateTime value) {
        return value != null ? value.toString() : null;
    }
}

View File

@@ -0,0 +1,29 @@
package com.auction;
import java.time.LocalDateTime;
/**
 * Mutable data holder for one lot (Dutch: "kavel") of an auction. Besides
 * the descriptive fields it records the sale the lot belongs to, the
 * current bid, the closing time and whether a closing notification has
 * been flagged for it.
 */
final class Lot {
    int saleId;
    int lotId;
    String title;
    String description;
    String manufacturer;
    String type;
    int year;
    String category;
    double currentBid;
    String currency;
    String url;
    /** Closing time of the lot; null if unknown. */
    LocalDateTime closingTime;
    boolean closingNotified;

    /**
     * Computes the whole minutes between now and the closing time.
     *
     * @return minutes until the lot closes (negative once the closing time
     *         has passed), or {@code Long.MAX_VALUE} when the closing time
     *         is unknown
     */
    long minutesUntilClose() {
        return closingTime == null
                ? Long.MAX_VALUE
                : java.time.Duration.between(LocalDateTime.now(), closingTime).toMinutes();
    }
}

View File

@@ -1,23 +1,82 @@
package com.auction;
import org.opencv.core.Core;
import java.util.List;
public class Main {
public static void main(String[] args) {
public static void main2(String[] args) {
// If arguments are passed, this is likely a one-off command via dokku run
// Just exit immediately to allow the command to run
if (args.length > 0) {
System.out.println("Command mode - exiting to allow shell commands");
IO.println("Command mode - exiting to allow shell commands");
return;
}
System.out.println("Starting Troostwijk Auction Scraper...");
System.out.println("Container is running and healthy.");
IO.println("Starting Troostwijk Auction Scraper...");
IO.println("Container is running and healthy.");
// Keep container alive
try {
Thread.sleep(Long.MAX_VALUE);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
System.out.println("Container interrupted, exiting.");
IO.println("Container interrupted, exiting.");
}
}
/**
* Entry point. Configure database location, notification settings, and
* YOLO model paths here before running. Once started the scraper
* discovers Dutch auctions, scrapes lots, and begins monitoring.
*/
public static void main(String[] args) throws Exception {
IO.println("=== Troostwijk Auction Scraper ===\n");
// Configuration parameters (replace with your own values)
String databaseFile = "troostwijk.db";
// Notification configuration - choose one:
// Option 1: Desktop notifications only (free, no setup required)
String notificationConfig = System.getenv().getOrDefault("NOTIFICATION_CONFIG", "desktop");
// Option 2: Desktop + Email via Gmail (free, requires Gmail app password)
// Format: "smtp:username:appPassword:toEmail"
// Example: "smtp:your.email@gmail.com:abcd1234efgh5678:recipient@example.com"
// Get app password: Google Account > Security > 2-Step Verification > App passwords
// YOLO model paths (optional - scraper works without object detection)
String yoloCfg = "models/yolov4.cfg";
String yoloWeights = "models/yolov4.weights";
String yoloClasses = "models/coco.names";
// Load native OpenCV library
System.loadLibrary(Core.NATIVE_LIBRARY_NAME);
IO.println("Initializing scraper...");
TroostwijkScraper scraper = new TroostwijkScraper(databaseFile, notificationConfig, "",
yoloCfg, yoloWeights, yoloClasses);
// Step 1: Discover auctions in NL
IO.println("\n[1/3] Discovering Dutch auctions...");
List<Integer> auctions = scraper.discoverDutchAuctions();
IO.println("✓ Found " + auctions.size() + " auctions: " + auctions);
// Step 2: Fetch lots for each auction
IO.println("\n[2/3] Fetching lot details...");
int totalAuctions = auctions.size();
int currentAuction = 0;
for (int saleId : auctions) {
currentAuction++;
IO.println(" [Page " + currentAuction + "] Fetching auctions...");
IO.println(" [" + currentAuction + "/" + totalAuctions + "] Processing sale " + saleId + "...");
scraper.fetchLotsForSale(saleId);
}
// Show database summary
IO.println("\n📊 Database Summary:");
scraper.printDatabaseStats();
// Step 3: Start monitoring bids and closures
IO.println("\n[3/3] Starting monitoring service...");
scraper.scheduleMonitoring();
IO.println("✓ Monitoring active. Press Ctrl+C to stop.\n");
}
}

View File

@@ -0,0 +1,156 @@
package com.auction;
import javax.mail.Authenticator;
import javax.mail.Message.RecipientType;
import javax.mail.PasswordAuthentication;
import javax.mail.Session;
import javax.mail.Transport;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MimeMessage;
import java.awt.SystemTray;
import java.awt.Toolkit;
import java.awt.TrayIcon;
import java.awt.TrayIcon.MessageType;
import java.util.Date;
import java.util.Properties;
/**
* Service for sending notifications via desktop notifications and/or email.
* Supports free notification methods:
* 1. Desktop notifications (Windows/Linux/macOS system tray)
* 2. Email via Gmail SMTP (free, requires app password)
*
* Configuration:
* - For email: Set notificationEmail to your Gmail address
* - Enable 2FA in Gmail and create an App Password
* - Use format "smtp:username:appPassword:toEmail" for credentials
* - Or use "desktop" for desktop-only notifications
*/
class NotificationService {
private final boolean useDesktop;
private final boolean useEmail;
private final String smtpUsername;
private final String smtpPassword;
private final String toEmail;
/**
* Creates a notification service.
*
* @param config "desktop" for desktop only, or "smtp:username:password:toEmail" for email
* @param unusedParam Kept for compatibility (can pass empty string)
*/
NotificationService(String config, String unusedParam) {
if ("desktop".equalsIgnoreCase(config)) {
this.useDesktop = true;
this.useEmail = false;
this.smtpUsername = null;
this.smtpPassword = null;
this.toEmail = null;
} else if (config.startsWith("smtp:")) {
var parts = config.split(":", 4);
if (parts.length != 4) {
throw new IllegalArgumentException("Email config must be 'smtp:username:password:toEmail'");
}
this.useDesktop = true; // Always include desktop
this.useEmail = true;
this.smtpUsername = parts[1];
this.smtpPassword = parts[2];
this.toEmail = parts[3];
} else {
throw new IllegalArgumentException("Config must be 'desktop' or 'smtp:username:password:toEmail'");
}
}
/**
* Sends notification via configured channels.
*
* @param message The message body
* @param title Message title
* @param priority Priority level (0=normal, 1=high)
*/
void sendNotification(String message, String title, int priority) {
if (useDesktop) {
sendDesktopNotification(title, message, priority);
}
if (useEmail) {
sendEmailNotification(title, message, priority);
}
}
/**
* Sends a desktop notification using system tray.
* Works on Windows, macOS, and Linux with desktop environments.
*/
private void sendDesktopNotification(String title, String message, int priority) {
try {
if (SystemTray.isSupported()) {
var tray = SystemTray.getSystemTray();
var image = Toolkit.getDefaultToolkit()
.createImage(new byte[0]); // Empty image
var trayIcon = new TrayIcon(image, "Troostwijk Scraper");
trayIcon.setImageAutoSize(true);
var messageType = priority > 0
? MessageType.WARNING
: MessageType.INFO;
tray.add(trayIcon);
trayIcon.displayMessage(title, message, messageType);
// Remove icon after 2 seconds to avoid clutter
Thread.sleep(2000);
tray.remove(trayIcon);
IO.println("Desktop notification sent: " + title);
} else {
IO.println("Desktop notifications not supported, logging: " + title + " - " + message);
}
} catch (Exception e) {
System.err.println("Desktop notification failed: " + e.getMessage());
}
}
/**
* Sends email notification via Gmail SMTP (free).
* Uses Gmail's SMTP server with app password authentication.
*/
private void sendEmailNotification(String title, String message, int priority) {
try {
var props = new Properties();
props.put("mail.smtp.auth", "true");
props.put("mail.smtp.starttls.enable", "true");
props.put("mail.smtp.host", "smtp.gmail.com");
props.put("mail.smtp.port", "587");
props.put("mail.smtp.ssl.trust", "smtp.gmail.com");
var session = Session.getInstance(props,
new Authenticator() {
protected PasswordAuthentication getPasswordAuthentication() {
return new PasswordAuthentication(smtpUsername, smtpPassword);
}
});
var msg = new MimeMessage(session);
msg.setFrom(new InternetAddress(smtpUsername));
msg.setRecipients(RecipientType.TO,
InternetAddress.parse(toEmail));
msg.setSubject("[Troostwijk] " + title);
msg.setText(message);
msg.setSentDate(new Date());
if (priority > 0) {
msg.setHeader("X-Priority", "1");
msg.setHeader("Importance", "High");
}
Transport.send(msg);
IO.println("Email notification sent: " + title);
} catch (Exception e) {
System.err.println("Email notification failed: " + e.getMessage());
}
}
}

View File

@@ -0,0 +1,140 @@
package com.auction;
import org.opencv.core.Mat;
import org.opencv.core.Scalar;
import org.opencv.core.Size;
import org.opencv.dnn.Dnn;
import org.opencv.dnn.Net;
import org.opencv.imgcodecs.Imgcodecs;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import static org.opencv.dnn.Dnn.DNN_BACKEND_OPENCV;
import static org.opencv.dnn.Dnn.DNN_TARGET_CPU;
/**
 * Service for performing object detection on images using OpenCV's DNN
 * module. A YOLO model in Darknet format is loaded from a configuration
 * file and a weights file. For each image a forward pass is run and the
 * list of detected class labels is returned.
 *
 * If model files are not found, the service operates in disabled mode
 * and returns empty lists.
 */
class ObjectDetectionService {
    /** Minimum class score a detection needs in order to be reported. */
    private static final float CONFIDENCE_THRESHOLD = 0.5f;
    /** Values at the start of each output row that precede the class scores. */
    private static final int CLASS_SCORE_OFFSET = 5;

    private final Net net;
    private final List<String> classNames;
    private final boolean enabled;

    /**
     * Loads the network and class names, or switches to disabled mode when
     * any of the three files does not exist.
     *
     * @param cfgPath        path of the Darknet configuration file
     * @param weightsPath    path of the Darknet weights file
     * @param classNamesPath path of the class-name file (one name per line)
     * @throws IOException if the files exist but the model or class names cannot be loaded
     */
    ObjectDetectionService(String cfgPath, String weightsPath, String classNamesPath) throws IOException {
        // Check if model files exist
        var cfgFile = Paths.get(cfgPath);
        var weightsFile = Paths.get(weightsPath);
        var classNamesFile = Paths.get(classNamesPath);
        if (!Files.exists(cfgFile) || !Files.exists(weightsFile) || !Files.exists(classNamesFile)) {
            IO.println("⚠️ Object detection disabled: YOLO model files not found");
            IO.println(" Expected files:");
            IO.println(" - " + cfgPath);
            IO.println(" - " + weightsPath);
            IO.println(" - " + classNamesPath);
            IO.println(" Scraper will continue without image analysis.");
            this.enabled = false;
            this.net = null;
            this.classNames = new ArrayList<>();
            return;
        }
        try {
            // Load network
            this.net = Dnn.readNetFromDarknet(cfgPath, weightsPath);
            this.net.setPreferableBackend(DNN_BACKEND_OPENCV);
            this.net.setPreferableTarget(DNN_TARGET_CPU);
            // Load class names (one per line)
            this.classNames = Files.readAllLines(classNamesFile);
            this.enabled = true;
            IO.println("✓ Object detection enabled with YOLO");
        } catch (Exception e) {
            System.err.println("⚠️ Object detection disabled: " + e.getMessage());
            throw new IOException("Failed to initialize object detection", e);
        }
    }

    /**
     * Detects objects in the given image file and returns a list of
     * human-readable labels. Only detections whose best class score is
     * above the confidence threshold are returned, each label at most
     * once. Bounding boxes are not evaluated.
     *
     * @param imagePath path to the image
     * @return list of detected class names (empty if detection is disabled,
     *         no class names are loaded, or the image cannot be read)
     */
    List<String> detectObjects(String imagePath) {
        List<String> labels = new ArrayList<>();
        if (!enabled || classNames.isEmpty()) {
            return labels;
        }
        var image = Imgcodecs.imread(imagePath);
        Mat blob = null;
        List<Mat> outs = new ArrayList<>();
        try {
            if (image.empty()) {
                return labels;
            }
            // Create a 4D blob from the image
            blob = Dnn.blobFromImage(image, 1.0 / 255.0, new Size(416, 416), new Scalar(0, 0, 0), true, false);
            net.setInput(blob);
            net.forward(outs, getOutputLayerNames(net));
            for (var out : outs) {
                collectLabels(out, labels);
            }
            return labels;
        } finally {
            // Mats wrap native memory; release it explicitly instead of waiting for GC.
            image.release();
            if (blob != null) {
                blob.release();
            }
            for (var out : outs) {
                out.release();
            }
        }
    }

    /**
     * Adds the best-scoring class of every row of one output blob to
     * {@code labels} when its score exceeds the confidence threshold.
     * Assumes the blob holds 32-bit floats, one detection per row.
     */
    private void collectLabels(Mat out, List<String> labels) {
        var cols = out.cols();
        // Never read past the row and never index past the known class names.
        var classCount = Math.min(classNames.size(), cols - CLASS_SCORE_OFFSET);
        if (classCount <= 0) {
            return;
        }
        var row = new float[cols];
        for (var i = 0; i < out.rows(); i++) {
            // Bulk-read the whole row; get(i, 0) without a buffer yields only one element.
            out.get(i, 0, row);
            // The first 5 numbers are bounding box, then class scores
            var best = argMax(row, CLASS_SCORE_OFFSET, CLASS_SCORE_OFFSET + classCount);
            if (row[best] > CONFIDENCE_THRESHOLD) {
                var label = classNames.get(best - CLASS_SCORE_OFFSET);
                if (!labels.contains(label)) {
                    labels.add(label);
                }
            }
        }
    }

    /**
     * Returns the names of the network's unconnected output layers, which
     * are the layers whose results are requested from the forward pass.
     */
    private List<String> getOutputLayerNames(Net net) {
        List<String> names = new ArrayList<>();
        var outLayers = net.getUnconnectedOutLayers().toList();
        var layersNames = net.getLayerNames();
        for (var i : outLayers) {
            // Layer ids are 1-based, the name list is 0-based.
            names.add(layersNames.get(i - 1));
        }
        return names;
    }

    /**
     * Returns the index of the maximum value in {@code values[from..to)}.
     * The range must not be empty.
     */
    private static int argMax(float[] values, int from, int to) {
        var best = from;
        for (var i = from + 1; i < to; i++) {
            if (values[i] > values[best]) {
                best = i;
            }
        }
        return best;
    }
}

View File

@@ -1,645 +0,0 @@
package com.auction;
import com.microsoft.playwright.*;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.microsoft.playwright.options.WaitUntilState;
import java.io.IOException;
import java.nio.file.*;
import java.sql.*;
import java.time.Instant;
import java.util.*;
/**
* TroostwijkAuctionExtractor
*
* Extracts auction listings from https://www.troostwijkauctions.com/auctions
* using Playwright for Java (headless browser automation).
*
* Features:
* - Uses Playwright for Java to load JavaScript-rendered content
* - Iterates through all pages of auction listings
* - Rate limiting: 200ms between each page request
* - Caches visited pages in SQLite database with expiration times
* - Extracts auction metadata: ID, title, location, URL
*
* Dependencies (Maven):
* <dependency>
* <groupId>com.microsoft.playwright</groupId>
* <artifactId>playwright</artifactId>
* <version>1.40.0</version>
* </dependency>
* <dependency>
* <groupId>com.fasterxml.jackson.core</groupId>
* <artifactId>jackson-databind</artifactId>
* <version>2.17.0</version>
* </dependency>
* <dependency>
* <groupId>org.xerial</groupId>
* <artifactId>sqlite-jdbc</artifactId>
* <version>3.45.1.0</version>
* </dependency>
*
* After adding dependency, run: mvn exec:java -e -D exec.mainClass=com.microsoft.playwright.CLI -D exec.args="install"
* This downloads the browser binaries needed by Playwright.
*/
public class TroostwijkAuctionExtractor {
// Base URL of the auction listing pages.
private static final String AUCTIONS_BASE_URL = "https://www.troostwijkauctions.com/auctions";
// Delay in milliseconds between page requests (rate limiting).
private static final int RATE_LIMIT_MS = 200;
// Location of the SQLite file used for the page cache.
private static final String CACHE_DB_PATH = "cache/page_cache.db";
private static final long CACHE_EXPIRATION_HOURS = 24; // Cache expires after 24 hours
private final ObjectMapper objectMapper;
// Whether fetched pages are cached in the database.
private final boolean useCache;
// Page cache; null when caching is disabled.
private final CacheDatabase cacheDb;
private final int maxPageVisits; // Maximum number of pages to fetch (0 = unlimited)
private int pageVisitCount; // Counter for actual page fetches (not from cache)
// Created by initialize() and released by close(); null before initialize() is called.
private Playwright playwright;
private Browser browser;
/**
 * Mutable data holder for one auction listing found on a listing page.
 */
public static class Auction {
    /** Auction id. */
    public int id;
    /** Auction title. */
    public String title;
    /** Auction location. */
    public String location;
    /** URL of the auction. */
    public String url;
    /** Auction type, e.g. "A1" or "A7". */
    public String type;

    /**
     * Renders a one-line summary of the listing for logging.
     */
    @Override
    public String toString() {
        var template = "Auction{id=%d, type=%s, title='%s', location='%s', url='%s'}";
        return template.formatted(id, type, title, location, url);
    }
}
/**
 * Creates an extractor and, when caching is enabled, opens and
 * initializes the page cache database.
 *
 * @param useCache Enable database caching of visited pages
 * @param maxPageVisits Maximum number of actual page fetches (0 = unlimited)
 * @throws SQLException if the cache database cannot be initialized
 * @throws IOException if the cache directory cannot be created
 */
public TroostwijkAuctionExtractor(boolean useCache, int maxPageVisits) throws SQLException, IOException {
    this.objectMapper = new ObjectMapper();
    this.useCache = useCache;
    this.maxPageVisits = maxPageVisits;
    this.pageVisitCount = 0;
    if (useCache) {
        this.cacheDb = new CacheDatabase(CACHE_DB_PATH);
        this.cacheDb.initialize();
    } else {
        // No cache requested: there is nothing to open.
        this.cacheDb = null;
    }
}
/**
 * Convenience constructor that places no limit on the number of page
 * visits; delegates to the two-argument constructor.
 *
 * @param useCache Enable database caching of visited pages
 * @throws SQLException if the cache database cannot be initialized
 * @throws IOException if the cache directory cannot be created
 */
public TroostwijkAuctionExtractor(boolean useCache) throws SQLException, IOException {
this(useCache, 0); // 0 = unlimited
}
/**
 * Starts Playwright and launches a headless Chromium browser with the
 * sandbox disabled. Must be called before auctions are extracted.
 */
public void initialize() {
    System.out.println("Initializing Playwright browser...");
    this.playwright = Playwright.create();
    var chromium = this.playwright.chromium();
    var launchOptions = new BrowserType.LaunchOptions();
    launchOptions.setHeadless(true);
    launchOptions.setArgs(Arrays.asList("--no-sandbox", "--disable-setuid-sandbox"));
    this.browser = chromium.launch(launchOptions);
    System.out.println("✓ Browser ready");
}
/**
 * Releases the browser, the Playwright instance and the cache database.
 * <p>
 * Each resource is closed in its own {@code finally} step, so a failure while closing
 * one of them no longer prevents the remaining ones from being released. The browser
 * and Playwright references are cleared afterwards, which makes repeated calls safe
 * and lets {@code extractAllAuctions()} report "not initialized" after a close.
 * Any exception thrown by a close call still propagates to the caller.
 */
public void close() {
    try {
        if (browser != null) {
            browser.close();
        }
    } finally {
        browser = null;
        try {
            if (playwright != null) {
                playwright.close();
            }
        } finally {
            playwright = null;
            if (cacheDb != null) {
                cacheDb.close();
            }
        }
    }
    System.out.println("✓ Browser and cache closed");
}
/**
 * Walks the paginated auction listing and collects every auction found.
 * <p>
 * Each page is taken from the cache when available; otherwise it is fetched live,
 * counted against {@code maxPageVisits}, cached (if enabled) and followed by a
 * rate-limit pause. Pagination stops when the visit limit is reached, a fetch fails,
 * a page contains no auctions, or a page contains only auctions that were already
 * collected. The last condition guards against sites that keep serving the same
 * page for out-of-range page numbers, which would otherwise loop forever.
 *
 * @return auctions in discovery order, each auction id at most once
 * @throws IllegalStateException if {@code initialize()} has not been called
 * @throws InterruptedException  if the thread is interrupted during the rate-limit pause
 */
public List<Auction> extractAllAuctions() throws InterruptedException {
    if (browser == null) {
        throw new IllegalStateException("Browser not initialized. Call initialize() first.");
    }
    List<Auction> allAuctions = new ArrayList<>();
    Set<Integer> seenAuctionIds = new HashSet<>();
    int pageNumber = 1;
    System.out.println("Starting auction extraction from " + AUCTIONS_BASE_URL);
    while (true) {
        System.out.println("\n[Page " + pageNumber + "] Fetching auctions...");
        String html = loadFromCache(pageNumber);
        if (html != null) {
            System.out.println(" ✓ Loaded from cache");
        } else {
            // Only live fetches count against the visit limit.
            if (maxPageVisits > 0 && pageVisitCount >= maxPageVisits) {
                System.out.println(" ⚠️ Reached maximum page visit limit (" + maxPageVisits + "), stopping");
                break;
            }
            html = fetchPageWithPlaywright(pageNumber);
            pageVisitCount++; // a failed attempt still counts as a visit
            if (html == null || html.isEmpty()) {
                System.out.println(" ⚠️ Failed to fetch page, stopping pagination");
                break;
            }
            System.out.println(" ✓ Fetched from website (visit " + pageVisitCount +
                (maxPageVisits > 0 ? "/" + maxPageVisits : "") + ")");
            if (useCache) {
                saveToCache(pageNumber, html);
            }
            // Rate limiting: pause RATE_LIMIT_MS before the next live request.
            Thread.sleep(RATE_LIMIT_MS);
        }
        List<Auction> pageAuctions = parseAuctionsFromHtml(html);
        if (pageAuctions.isEmpty()) {
            System.out.println(" ⚠️ No auctions found on page, stopping pagination");
            break;
        }
        int newOnPage = 0;
        for (Auction auction : pageAuctions) {
            if (seenAuctionIds.add(auction.id)) {
                allAuctions.add(auction);
                newOnPage++;
            }
        }
        if (newOnPage == 0) {
            // Same content as an earlier page: the listing has wrapped around or ended.
            System.out.println(" ⚠️ No new auctions on page, stopping pagination");
            break;
        }
        System.out.println(" ✓ Found " + pageAuctions.size() + " auctions");
        pageNumber++;
    }
    System.out.println("\n✓ Total auctions extracted: " + allAuctions.size());
    return allAuctions;
}
/**
 * Fetches one listing page with Playwright and returns its rendered HTML.
 * <p>
 * The page is always closed in a {@code finally} block, so a navigation timeout or
 * any other failure no longer leaves an open page behind in the browser.
 *
 * @param pageNumber page number (1-indexed); page 1 is the bare listing URL
 * @return HTML content of the page, or {@code null} if navigation or reading failed
 */
private String fetchPageWithPlaywright(int pageNumber) {
    String url = pageNumber == 1
        ? AUCTIONS_BASE_URL
        : AUCTIONS_BASE_URL + "?page=" + pageNumber;
    Page page = null;
    try {
        page = browser.newPage();
        // Send a browser-like User-Agent header with every request of this page.
        page.setExtraHTTPHeaders(Map.of(
            "User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        ));
        page.navigate(url, new Page.NavigateOptions()
            .setTimeout(30000)
            .setWaitUntil(WaitUntilState.NETWORKIDLE));
        try {
            page.waitForSelector("a[href^='/a/']", new Page.WaitForSelectorOptions()
                .setTimeout(10000));
        } catch (Exception e) {
            // Best effort: a missing selector is not fatal, the caller parses whatever was rendered.
            System.out.println(" ⚠️ Auction selector not found, attempting to parse anyway");
        }
        return page.content();
    } catch (Exception e) {
        System.err.println(" ⚠️ Playwright error: " + e.getMessage());
        return null;
    } finally {
        if (page != null) {
            try {
                page.close();
            } catch (Exception closeError) {
                // A failed close must not mask the HTML that was already read.
                System.err.println(" ⚠️ Playwright error: " + closeError.getMessage());
            }
        }
    }
}
/**
 * Extracts auctions from listing HTML by scanning for {@code href="/a/...-A1-<id>"} and
 * {@code href="/a/...-A7-<id>"} links.
 * <p>
 * The auction type is taken from the same regex match that yields the id, so an
 * unrelated "A1-" elsewhere in the slug can no longer mislabel an A7 auction.
 * Links whose numeric id does not fit in an {@code int} are skipped instead of
 * aborting the whole page with a {@link NumberFormatException}.
 *
 * @param html HTML content of a listing page
 * @return one {@code Auction} per distinct auction id, in document order
 */
private List<Auction> parseAuctionsFromHtml(String html) {
    List<Auction> auctions = new ArrayList<>();
    // group(1) = full href, group(2) = type token (A1/A7), group(3) = numeric id
    java.util.regex.Pattern linkPattern = java.util.regex.Pattern.compile(
        "href=\"(/a/[^\"]+(A[17])-(\\d+)[^\"]*)\"");
    java.util.regex.Matcher linkMatcher = linkPattern.matcher(html);
    Set<Integer> seenIds = new HashSet<>();
    while (linkMatcher.find()) {
        String href = linkMatcher.group(1);
        int auctionId;
        try {
            auctionId = Integer.parseInt(linkMatcher.group(3));
        } catch (NumberFormatException e) {
            System.out.println(" ⚠️ Skipping auction link with out-of-range id: " + href);
            continue;
        }
        // The same auction is usually linked several times per card; keep the first hit.
        if (!seenIds.add(auctionId)) {
            continue;
        }
        Auction auction = new Auction();
        auction.id = auctionId;
        auction.type = linkMatcher.group(2);
        auction.title = extractTitleFromHref(href);
        auction.location = extractLocationNearLink(html, href);
        auction.url = "https://www.troostwijkauctions.com" + href;
        auctions.add(auction);
    }
    return auctions;
}
// Plain text "City, CC". \p{L} (any Unicode letter) keeps names such as "Loßburg" intact.
private static final java.util.regex.Pattern LOCATION_TEXT_PATTERN = java.util.regex.Pattern.compile(
    "(\\p{L}[\\p{L}\\s\\-']+),\\s*([A-Z]{2})(?!\\p{L})");
// Markup form such as "<span>City,<!-- --> </span>CC".
private static final java.util.regex.Pattern LOCATION_SPAN_PATTERN = java.util.regex.Pattern.compile(
    "<span[^>]*>(\\p{L}[\\p{L}\\s\\-',]+?)(?:,)?\\s*(?:<!--.*?-->)?\\s*</span>\\s*([A-Z]{2})(?!\\p{L})");
// Last resort: a bare two-letter code directly after a closing tag or comment.
private static final java.util.regex.Pattern COUNTRY_ONLY_PATTERN = java.util.regex.Pattern.compile(
    "(?:-->|</span>|</div>)\\s*([A-Z]{2})(?!\\p{L})");

/**
 * Looks for a location such as "Amsterdam, NL" in the HTML surrounding an auction link.
 * <p>
 * The search window runs from 500 characters before the first occurrence of
 * {@code href} to 1000 characters after its start. Three patterns are tried in order:
 * plain "City, CC" text, a {@code <span>}-wrapped city followed by the country code,
 * and finally a bare country code after a closing tag.
 *
 * @param html full page HTML
 * @param href auction href to search around
 * @return "City, CC", "Unknown, CC" when only a country code is found, or "Unknown"
 */
private String extractLocationNearLink(String html, String href) {
    int hrefPos = html.indexOf(href);
    if (hrefPos == -1) return "Unknown";
    int startPos = Math.max(hrefPos - 500, 0);
    int endPos = Math.min(hrefPos + 1000, html.length());
    String context = html.substring(startPos, endPos);

    java.util.regex.Matcher locMatcher = LOCATION_TEXT_PATTERN.matcher(context);
    if (locMatcher.find()) {
        String location = locMatcher.group(1).trim() + ", " + locMatcher.group(2);
        System.out.println(" Found location: " + location + " for auction " + href);
        return location;
    }

    java.util.regex.Matcher htmlMatcher = LOCATION_SPAN_PATTERN.matcher(context);
    if (htmlMatcher.find()) {
        String city = htmlMatcher.group(1).trim().replaceAll(",$", ""); // Remove trailing comma
        String country = htmlMatcher.group(2);
        String location = city + ", " + country;
        System.out.println(" Found location (HTML): " + location + " for auction " + href);
        return location;
    }

    java.util.regex.Matcher countryMatcher = COUNTRY_ONLY_PATTERN.matcher(context);
    if (countryMatcher.find()) {
        String country = countryMatcher.group(1);
        System.out.println(" Found country code: " + country + " for auction " + href);
        return "Unknown, " + country;
    }

    System.out.println(" ⚠️ No location found for auction " + href);
    return "Unknown";
}
/**
 * Derives a human-readable title from an auction URL slug.
 * <p>
 * Takes the text between "/a/" and the first "-A1-" or "-A7-", decodes
 * percent-escapes as UTF-8 (so {@code auto%27s} becomes {@code auto's}), then turns the
 * hyphen-separated words into capitalized, space-separated words, e.g.
 * "/a/some-auction-title-A1-12345" becomes "Some Auction Title". A slug with a
 * malformed escape is used undecoded.
 *
 * @param href auction href
 * @return the derived title, or "Untitled Auction" when the href has no such slug
 */
private String extractTitleFromHref(String href) {
    java.util.regex.Matcher titleMatcher =
        java.util.regex.Pattern.compile("/a/(.+?)-A[17]-").matcher(href);
    if (!titleMatcher.find()) {
        return "Untitled Auction";
    }
    String slug = titleMatcher.group(1);
    try {
        // URLDecoder follows form encoding and would turn '+' into a space; protect it first.
        slug = java.net.URLDecoder.decode(
            slug.replace("+", "%2B"), java.nio.charset.StandardCharsets.UTF_8);
    } catch (IllegalArgumentException e) {
        // Malformed escape sequence (e.g. a literal '%'): keep the raw slug.
    }
    StringBuilder title = new StringBuilder();
    for (String word : slug.split("-")) {
        if (word.isEmpty()) {
            continue;
        }
        title.append(Character.toUpperCase(word.charAt(0)))
            .append(word.substring(1))
            .append(" ");
    }
    return title.toString().trim();
}
/**
 * Looks up the cached HTML of a listing page.
 *
 * @param pageNumber page number; page 1 maps to the bare listing URL
 * @return the value returned by the cache for that page's URL, or {@code null}
 *         when caching is disabled
 */
private String loadFromCache(int pageNumber) {
    boolean cacheAvailable = useCache && cacheDb != null;
    if (!cacheAvailable) {
        return null;
    }
    String url;
    if (pageNumber == 1) {
        url = AUCTIONS_BASE_URL;
    } else {
        url = AUCTIONS_BASE_URL + "?page=" + pageNumber;
    }
    return cacheDb.get(url);
}
/**
 * Stores the HTML of a listing page in the cache under that page's URL, with a
 * lifetime of {@code CACHE_EXPIRATION_HOURS}. Does nothing when caching is disabled.
 *
 * @param pageNumber page number; page 1 maps to the bare listing URL
 * @param html       HTML content to store
 */
private void saveToCache(int pageNumber, String html) {
    boolean cacheAvailable = useCache && cacheDb != null;
    if (!cacheAvailable) {
        return;
    }
    String url;
    if (pageNumber == 1) {
        url = AUCTIONS_BASE_URL;
    } else {
        url = AUCTIONS_BASE_URL + "?page=" + pageNumber;
    }
    cacheDb.put(url, html, CACHE_EXPIRATION_HOURS);
}
/**
 * Keeps the auctions whose location text contains the given filter.
 * <p>
 * Matching is a plain, case-sensitive substring test, so "NL" also matches a
 * location that merely contains those two letters. Auctions whose
 * {@code location} is {@code null} are excluded rather than causing a
 * {@link NullPointerException}.
 *
 * @param auctions       list of auctions
 * @param locationFilter substring to look for (e.g., "NL")
 * @return unmodifiable list of the matching auctions, in their original order
 */
public static List<Auction> filterByLocation(List<Auction> auctions, String locationFilter) {
    return auctions.stream()
        .filter(a -> a.location != null && a.location.contains(locationFilter))
        .toList();
}
/**
 * Command-line entry point: extracts all auctions and prints a summary plus up to
 * ten auctions whose location contains "NL".
 *
 * Arguments:
 * --max-visits <number> : Maximum number of page visits (0 = unlimited, default)
 * --no-cache : Disable caching
 * --help : Show usage and exit
 *
 * A missing or non-numeric value for --max-visits is reported on stderr and the
 * program exits without scraping, instead of crashing or silently running unlimited.
 * Unknown arguments are reported and ignored.
 */
public static void main(String[] args) throws Exception {
    System.out.println("=== Troostwijk Auction Extractor ===\n");
    boolean useCache = true;
    int maxVisits = 0; // 0 = unlimited
    for (int i = 0; i < args.length; i++) {
        switch (args[i]) {
            case "--max-visits": {
                if (i + 1 >= args.length) {
                    System.err.println("Error: --max-visits requires a number");
                    return;
                }
                String rawValue = args[++i];
                try {
                    maxVisits = Integer.parseInt(rawValue);
                } catch (NumberFormatException e) {
                    System.err.println("Error: invalid value for --max-visits: " + rawValue);
                    return;
                }
                System.out.println("Max page visits set to: " + maxVisits);
                break;
            }
            case "--no-cache":
                useCache = false;
                System.out.println("Caching disabled");
                break;
            case "--help":
                System.out.println("Usage: java TroostwijkAuctionExtractor [options]");
                System.out.println("Options:");
                System.out.println(" --max-visits <n> : Limit actual page fetches to n (0 = unlimited)");
                System.out.println(" --no-cache : Disable page caching");
                System.out.println(" --help : Show this help message");
                return;
            default:
                System.err.println("Warning: ignoring unknown argument: " + args[i]);
                break;
        }
    }
    TroostwijkAuctionExtractor extractor = new TroostwijkAuctionExtractor(useCache, maxVisits);
    try {
        extractor.initialize();
        List<Auction> allAuctions = extractor.extractAllAuctions();
        List<Auction> dutchAuctions = filterByLocation(allAuctions, "NL");
        System.out.println("\n=== Results ===");
        System.out.println("Total auctions found: " + allAuctions.size());
        System.out.println("Dutch auctions (NL): " + dutchAuctions.size());
        System.out.println("Actual page visits: " + extractor.pageVisitCount);
        System.out.println("\n=== Sample Dutch Auctions ===");
        dutchAuctions.stream()
            .limit(10)
            .forEach(System.out::println);
    } finally {
        // Release the browser and cache even when extraction fails.
        extractor.close();
    }
}
/**
 * SQLite-based cache for HTML pages with per-entry expiration.
 * <p>
 * Entries live in a single {@code page_cache} table keyed by URL, with timestamps in
 * epoch seconds. {@link #initialize()} must be called before any read or write; the
 * query methods throw {@link IllegalStateException} otherwise. SQL failures during
 * reads and writes are logged to stderr and treated as cache misses / no-ops.
 */
static class CacheDatabase {
    private final String dbPath;
    private Connection connection;

    /**
     * @param dbPath path of the SQLite database file
     */
    public CacheDatabase(String dbPath) {
        this.dbPath = dbPath;
    }

    /**
     * Opens the database, creates the schema if needed and purges expired entries.
     * The connection is closed again if schema creation fails.
     *
     * @throws SQLException if the connection cannot be opened or the schema cannot be created
     * @throws IOException  if the parent directory of the database file cannot be created
     */
    public void initialize() throws SQLException, IOException {
        Path cacheDir = Paths.get(dbPath).getParent();
        if (cacheDir != null) {
            Files.createDirectories(cacheDir);
        }
        connection = DriverManager.getConnection("jdbc:sqlite:" + dbPath);
        String createTable = """
            CREATE TABLE IF NOT EXISTS page_cache (
            url TEXT PRIMARY KEY,
            html TEXT NOT NULL,
            cached_at INTEGER NOT NULL,
            expires_at INTEGER NOT NULL
            )
            """;
        try (Statement stmt = connection.createStatement()) {
            stmt.execute(createTable);
            // Index on expires_at keeps the cleanup DELETE cheap.
            stmt.execute("CREATE INDEX IF NOT EXISTS idx_expires_at ON page_cache(expires_at)");
        } catch (SQLException e) {
            close(); // do not leave a half-initialized connection open
            throw e;
        }
        cleanupExpired();
        System.out.println("✓ Cache database initialized");
    }

    /**
     * Returns the cached HTML for a URL if an unexpired entry exists.
     *
     * @param url the URL to look up
     * @return cached HTML, or {@code null} if not found, expired, or the read failed
     * @throws IllegalStateException if the database has not been initialized
     */
    public synchronized String get(String url) {
        String sql = "SELECT html FROM page_cache WHERE url = ? AND expires_at > ?";
        try (PreparedStatement ps = requireConnection().prepareStatement(sql)) {
            ps.setString(1, url);
            ps.setLong(2, Instant.now().getEpochSecond());
            try (ResultSet rs = ps.executeQuery()) {
                if (rs.next()) {
                    return rs.getString("html");
                }
            }
        } catch (SQLException e) {
            System.err.println("Cache read error: " + e.getMessage());
        }
        return null;
    }

    /**
     * Stores HTML for a URL, replacing any existing entry.
     *
     * @param url             the URL to cache
     * @param html            the HTML content
     * @param expirationHours hours from now until the entry expires
     * @throws IllegalStateException if the database has not been initialized
     */
    public synchronized void put(String url, String html, long expirationHours) {
        String sql = """
            INSERT OR REPLACE INTO page_cache (url, html, cached_at, expires_at)
            VALUES (?, ?, ?, ?)
            """;
        long now = Instant.now().getEpochSecond();
        long expiresAt = now + (expirationHours * 3600);
        try (PreparedStatement ps = requireConnection().prepareStatement(sql)) {
            ps.setString(1, url);
            ps.setString(2, html);
            ps.setLong(3, now);
            ps.setLong(4, expiresAt);
            ps.executeUpdate();
        } catch (SQLException e) {
            System.err.println("Cache write error: " + e.getMessage());
        }
    }

    /**
     * Deletes every entry whose expiration time has passed.
     *
     * @throws IllegalStateException if the database has not been initialized
     */
    public synchronized void cleanupExpired() {
        String sql = "DELETE FROM page_cache WHERE expires_at <= ?";
        try (PreparedStatement ps = requireConnection().prepareStatement(sql)) {
            ps.setLong(1, Instant.now().getEpochSecond());
            int deleted = ps.executeUpdate();
            if (deleted > 0) {
                System.out.println("✓ Cleaned up " + deleted + " expired cache entries");
            }
        } catch (SQLException e) {
            System.err.println("Cache cleanup error: " + e.getMessage());
        }
    }

    /**
     * Prints entry counts (total, valid, expired) and the summed HTML length in KB.
     *
     * @throws IllegalStateException if the database has not been initialized
     */
    public synchronized void printStats() {
        String sql = "SELECT COUNT(*) as total, " +
            "SUM(CASE WHEN expires_at > ? THEN 1 ELSE 0 END) as valid, " +
            "SUM(LENGTH(html)) as total_size " +
            "FROM page_cache";
        try (PreparedStatement ps = requireConnection().prepareStatement(sql)) {
            ps.setLong(1, Instant.now().getEpochSecond());
            try (ResultSet rs = ps.executeQuery()) {
                if (rs.next()) {
                    int total = rs.getInt("total");
                    int valid = rs.getInt("valid");
                    long size = rs.getLong("total_size");
                    System.out.println("\n=== Cache Statistics ===");
                    System.out.println("Total entries: " + total);
                    System.out.println("Valid entries: " + valid);
                    System.out.println("Expired entries: " + (total - valid));
                    System.out.println("Total size: " + (size / 1024) + " KB");
                }
            }
        } catch (SQLException e) {
            System.err.println("Cache stats error: " + e.getMessage());
        }
    }

    /**
     * Closes the database connection, if one was opened. Close failures are logged.
     */
    public void close() {
        if (connection != null) {
            try {
                connection.close();
            } catch (SQLException e) {
                System.err.println("Error closing cache database: " + e.getMessage());
            }
        }
    }

    /** Returns the open connection or fails with a clear message when initialize() was skipped. */
    private Connection requireConnection() {
        if (connection == null) {
            throw new IllegalStateException("Cache database not initialized. Call initialize() first.");
        }
        return connection;
    }
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -41,7 +41,7 @@ public class AuctionParsingTest {
System.out.println("\n=== Auction Parsing Test ===");
System.out.println("Found " + auctionLinks.size() + " auction links");
List<TroostwijkScraper.AuctionInfo> auctions = new ArrayList<>();
List<AuctionInfo> auctions = new ArrayList<>();
int count = 0;
for (Element link : auctionLinks) {
@@ -59,7 +59,7 @@ public class AuctionParsingTest {
int auctionId = Integer.parseInt(matcher.group(2));
// Extract auction info using IMPROVED text-based method
TroostwijkScraper.AuctionInfo auction = extractAuctionInfoFromText(link, href, auctionId, "A" + typeNum);
AuctionInfo auction = extractAuctionInfoFromText(link, href, auctionId, "A" + typeNum);
auctions.add(auction);
// Print the first 10 auctions for verification
@@ -101,7 +101,7 @@ public class AuctionParsingTest {
assertTrue(auctions.size() > 0, "Should find at least one auction");
// Verify all auctions have basic info
for (TroostwijkScraper.AuctionInfo auction : auctions) {
for (AuctionInfo auction : auctions) {
assertNotNull(auction.title, "Title should not be null for auction " + auction.auctionId);
assertTrue(auction.title.length() > 0, "Title should not be empty for auction " + auction.auctionId);
assertNotNull(auction.url, "URL should not be null for auction " + auction.auctionId);
@@ -119,8 +119,8 @@ public class AuctionParsingTest {
* Expected format: "[day] om [time] [lot_count] [title] [city], [CC]"
* Example: "woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE"
*/
private TroostwijkScraper.AuctionInfo extractAuctionInfoFromText(Element link, String href, int auctionId, String type) {
TroostwijkScraper.AuctionInfo auction = new TroostwijkScraper.AuctionInfo();
private AuctionInfo extractAuctionInfoFromText(Element link, String href, int auctionId, String type) {
AuctionInfo auction = new AuctionInfo();
auction.auctionId = auctionId;
auction.type = type;
auction.url = "https://www.troostwijkauctions.com" + href;

View File

@@ -68,71 +68,18 @@ public class TroostwijkScraperTest {
}
@Test
public void testFetchAndPersistAuctionData() throws SQLException {
// First, discover auctions
List<Integer> auctions = scraper.discoverDutchAuctions();
assertFalse(auctions.isEmpty(), "Need at least one auction to test");
// Take the first auction and fetch its lots
Integer firstSaleId = auctions.getFirst();
System.out.println("Testing with sale ID: " + firstSaleId);
scraper.fetchLotsForSale(firstSaleId);
// Verify data was persisted to database
List<TroostwijkScraper.Lot> lotsInDb = scraper.db.getAllLots();
assertNotNull(lotsInDb, "Lots list should not be null");
assertFalse(lotsInDb.isEmpty(), "Should have persisted at least one lot");
// Verify lot properties
for (TroostwijkScraper.Lot lot : lotsInDb) {
assertEquals(firstSaleId.intValue(), lot.saleId, "Lot should belong to the correct sale");
assertTrue(lot.lotId > 0, "Lot ID should be positive");
assertNotNull(lot.title, "Lot title should not be null");
assertFalse(lot.title.isEmpty(), "Lot title should not be empty");
assertNotNull(lot.url, "Lot URL should not be null");
assertTrue(lot.url.startsWith("https://"), "Lot URL should be valid");
assertTrue(lot.currentBid >= 0, "Current bid should be non-negative");
}
System.out.println("✓ Successfully persisted " + lotsInDb.size() + " lots to database");
System.out.println("✓ All lot properties are valid");
}
@Test
public void testDatabaseSchema() throws SQLException {
// Verify that the database schema was created correctly
List<TroostwijkScraper.Lot> lots = scraper.db.getAllLots();
List<Lot> lots = scraper.db.getAllLots();
assertNotNull(lots, "Should be able to query lots table");
int imageCount = scraper.db.getImageCount();
assertTrue(imageCount >= 0, "Image count should be non-negative");
List<TroostwijkScraper.Lot> activeLots = scraper.db.getActiveLots();
List<Lot> activeLots = scraper.db.getActiveLots();
assertNotNull(activeLots, "Should be able to query active lots");
System.out.println("✓ Database schema is valid and queryable");
}
@Test
public void testAuctionProperties() {
List<Integer> auctions = scraper.discoverDutchAuctions();
assertFalse(auctions.isEmpty(), "Should find auctions");
// Test that we can fetch data for multiple auctions
int auctionsToTest = Math.min(2, auctions.size());
for (int i = 0; i < auctionsToTest; i++) {
Integer saleId = auctions.get(i);
System.out.println("Testing auction " + (i + 1) + ": " + saleId);
// This should not throw an exception
assertDoesNotThrow(() -> scraper.fetchLotsForSale(saleId),
"Should be able to fetch lots for sale " + saleId);
}
System.out.println("✓ Successfully tested " + auctionsToTest + " auctions");
}
}

View File

@@ -1,456 +1,61 @@
## Woensdag 3 dec 25
* [
woensdag om 16:00
![Industry & machinery](https://media.tbauctions.com/image-media/37f8e30d-7f4e-4af4-bb8a-029975b089cf/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/da3276a5-eb99-4a5d-ac3b-cc546b0a5f39/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/4d273787-cc80-4ac5-b89d-20525871426a/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/0b0b946d-26a6-486f-8c73-e9d0394e4e70/file?imageSize=1024x768 1024w)
145
Industrie & machines
Meerdere locaties (45)
](/a/industrie-machines-A3-37358)
* [
woensdag om 16:00
![D | Racing car transporters, crane polyp grabs and containers from inventory adjustment](https://media.tbauctions.com/image-media/b020da96-bb92-4e22-8a7b-2dd205dd5f7f/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/d3153444-7d59-45ac-8160-3cdaa7e1074e/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/be9f0e03-f585-47d0-abf0-3e802e21c8ad/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/658c88d7-0430-4454-b7fb-f48eeadeb401/file?imageSize=1024x768 1024w)
38
D | Raceautotransporters, kraan-polypengrepen en containers uit voorraadaanpassing
Nieheim, DE
](/a/d-%7C-raceautotransporters-kraan-polypengrepen-en-containers-uit-voorraadaanpassing-A1-39772)
* [
woensdag om 16:00
![Food Processing Equipment and Packaging Machinery](https://media.tbauctions.com/image-media/dd6b698d-79f5-40a4-ab74-2beca7a5341e/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/198b179c-0921-420f-9b14-ab430a957fb0/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/842f0a7d-cd05-4040-bb00-8691def7e9bc/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/b7855c86-f9e8-4346-9d0b-565d7cd126cc/file?imageSize=1024x768 1024w)
61
Voedselverwerkende apparatuur en verpakkingsmachines
CHOMERAC, FR
](/a/voedselverwerkende-apparatuur-en-verpakkingsmachines-A1-39319)
* [
woensdag om 16:00
![Agricultural & earthmoving machines](https://media.tbauctions.com/image-media/a11f32a9-5197-4486-8074-803b4da25227/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/dcf35ea2-0e1b-4d30-8cd7-1b534ecf7b0e/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/4857e24d-25b2-46f6-bde4-c0a5383508dd/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/7210f5e7-e124-40ab-b2db-c54e02af416a/file?imageSize=1024x768 1024w)
117
Landbouw- & grondverzetmachines
Meerdere locaties (49)
](/a/landbouw-grondverzetmachines-A3-37375)
* [
woensdag om 17:00
![Tools & equipment](https://media.tbauctions.com/image-media/24f212ce-e1f0-45a3-b095-c3944fd35340/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/1cc0cb65-68f6-4108-b51a-11cb1652f366/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/92d4629a-d259-43ad-806d-de99da15ea32/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/2653c057-5e28-46dc-aacc-08920f0500b4/file?imageSize=1024x768 1024w)
261
Gereedschappen & uitrusting
Meerdere locaties (36), BE
](/a/gereedschappen-uitrusting-A3-37367)
* [
woensdag om 18:00
![](https://media.tbauctions.com/image-media/13b74047-5372-493f-81dc-2d075c3bada1/file?imageSize=1024x768 1024w)
1
Vrachtwagens voor bedrijfsvoertuigen
Loßburg, DE
](/a/vrachtwagens-voor-bedrijfsvoertuigen-A7-39531)
* [
woensdag om 19:00
![White goods and accessories](https://media.tbauctions.com/image-media/1abf2d9b-7596-45e3-93b3-b503397eba0e/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/8cb4c2f1-f298-4e20-b221-ec828cca717c/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/d588cf94-3f82-4785-ace4-a8014a1859fd/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/9036f5bf-7fc7-43a3-ae8a-41c1376f60ec/file?imageSize=1024x768 1024w)
61
Witgoed en accessoires
Etten-Leur, NL
](/a/witgoed-en-accessoires-A1-27241)
* [
Opent 28 nov 17:00
![](https://media.tbauctions.com/image-media/4f41caa7-865a-4fe7-9cd5-305bd2e455f6/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/1cdf88f5-67d4-4932-96a4-449ad17ba51d/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/c47fee25-0326-40e1-b1db-3190a19ecb1a/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/a3e7c080-2dcf-45aa-99ab-2072697b4f54/file?imageSize=1024x768 1024w)
54
Collectie Rolex en Cartier horloges
Dordrecht, NL
](/a/collectie-rolex-en-cartier-horloges-A1-39398)
* [
woensdag om 19:00
![Kitchens and sanitary facilities](https://media.tbauctions.com/image-media/6379bf2c-aed2-4fbe-8fce-367c9d200141/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/16e97117-3cc2-4ced-bfe1-4f24e0b8d784/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/0002b9da-1ba5-429c-b377-eaddc3714e37/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/d3186016-0d7b-433a-b3e8-903fd85ec929/file?imageSize=1024x768 1024w)
254
SHOWROOMKEUKENS en INBOUWAPPARATUUR
Tilburg, NL
](/a/showroomkeukens-en-inbouwapparatuur-A1-39480)
* [
woensdag om 19:00
![](https://media.tbauctions.com/image-media/e6b76a75-9994-46b0-a2fa-c9a321f62980/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/ebef8850-5265-446a-ab32-5f2cd0d2bf88/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/f82e2b71-f908-472e-8019-9b2b15b0cb17/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/d92ba732-28b0-462f-bab5-ce82540d1c81/file?imageSize=1024x768 1024w)
499
Machines, retourgoederen en restpartijen
Harlingen, NL
](/a/machines-retourgoederen-en-restpartijen-A1-39642)
* [
woensdag om 19:00
![Lots of tools, office inventory, retail goods, decoration and olive trees](https://media.tbauctions.com/image-media/f949084a-50d5-4182-acfb-5d21ac54e471/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/96ef91fa-3927-49d4-af9e-ef709581ac51/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/2917a150-c99a-439f-8767-68ece5214800/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/328e4999-4a75-4f29-bd95-66b1de4e46e8/file?imageSize=1024x768 1024w)
120
Partijen gereedschap, kantoorinventaris, detailhandelgoederen, decoratie en olijfbomen
Meerdere locaties (3), NL
](/a/partijen-gereedschap-kantoorinventaris-detailhandelgoederen-decoratie-en-olijfbomen-A1-27016)
* [
woensdag om 19:00
![Bankruptcy vehicles](https://media.tbauctions.com/image-media/82880ed5-30f0-4055-99bd-a2cf07fad2ef/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/64312888-1b04-4f75-a0f1-386752232172/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/f4268e70-82fb-4ea1-ab92-c23dff650909/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/0c52cb25-fe99-4848-961b-a8a85c7a66d4/file?imageSize=1024x768 1024w)
16
Faillissementsvoertuigen
Meerdere locaties (3), NL
](/a/faillissementsvoertuigen-A1-38368)
* [
woensdag om 19:00
![](https://media.tbauctions.com/image-media/41066c8c-7806-43ee-beef-918c43e18cc7/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/a2543ed7-409e-449f-aa42-c68721432fdf/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/17dee71a-201f-40c5-9e8d-f71f807b27a9/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/0c8f8bb7-60fc-4c3f-af17-1080f489a8c2/file?imageSize=1024x768 1024w)
78
Personenautos, oldtimers, campers en brommobielen
Buitenpost, NL
](/a/personenauto%E2%80%99s-oldtimers-campers-en-brommobielen-A1-39508)
* [
woensdag om 19:00
![](https://media.tbauctions.com/image-media/278c35b9-3e5a-42eb-a09b-1bfdf48c60b2/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/2ced3007-a441-4752-84fc-eb3e2ceba7a2/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/36ed78ee-32f2-4824-b8eb-fd38f3d87e27/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/caa71786-c172-491f-bd78-787946e35480/file?imageSize=1024x768 1024w)
391
Bezorgveiling Faillissement Dvize B.V. Hyundai Power Products gereedschappen
Meerdere locaties (2)
](/a/bezorgveiling-faillissement-dvize-b-v-%E2%80%93-hyundai-power-products-gereedschappen-A1-39409)
* [
woensdag om 19:00
![](https://media.tbauctions.com/image-media/37df514f-357a-43aa-9a5a-8fedefb7068f/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/37df514f-357a-43aa-9a5a-8fedefb7068f/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/37df514f-357a-43aa-9a5a-8fedefb7068f/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/4b8b6f24-4ca1-4141-a3e3-254caba98284/file?imageSize=1024x768 1024w)
208
Kunstplanten en bomen, composiet gevel- en vloerbekleding en akoestische materialen
De Lier, NL
](/a/kunstplanten-en-bomen-composiet-gevel-en-vloerbekleding-en-akoestische-materialen-A1-28707)
* [
woensdag om 19:00
![Metalworking machines, tools and stock in connection with company relocation](https://media.tbauctions.com/image-media/d5232c84-b313-48dc-973d-b82e762f000e/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/997e6803-7667-49ce-8a2c-b62939ee0aa7/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/40cec190-c669-4d80-b9fa-99039e6804ea/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/77371ea9-2869-4df6-97b4-145d76a34615/file?imageSize=1024x768 1024w)
181
Metaalbewerkingsmachines, gereedschap en voorraad in verband met bedrijfsverhuizing
Cuijk, NL
](/a/metaalbewerkingsmachines-gereedschap-en-voorraad-in-verband-met-bedrijfsverhuizing-A1-39360)
* [
woensdag om 19:00
![](https://media.tbauctions.com/image-media/2faa2b06-09c0-49ec-ba31-b8d24e15263a/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/485930ea-fbd4-4fc0-adeb-53f9e4f9b89e/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/fdb0a385-b27e-4aa2-b0c9-aca59d7c5f96/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/a268ff31-f1f1-421c-80e9-97cfd0fcfa7b/file?imageSize=1024x768 1024w)
238
Overstock en magazijnopruiming
Heesch, NL
](/a/overstock-en-magazijnopruiming-A1-39538)
* [
woensdag om 19:00
![Collector's Auction Scooters & Motorcycles](https://media.tbauctions.com/image-media/8464e9fc-b60a-4081-b898-d98328c8d1dd/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/a953420a-4d1d-4e80-9c53-be7bb85106c5/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/7f0198da-b213-46c0-8a0b-6a382ca1b029/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/7b11eee5-efe7-4fbd-8529-b1d50bd4db2e/file?imageSize=1024x768 1024w)
47
Verzamelveiling Scooters en Motoren
Meerdere locaties (2), NL
](/a/verzamelveiling-scooters-en-motoren-A1-28428)
* [
woensdag om 19:00
![Cars & transport](https://media.tbauctions.com/image-media/28be6ce7-6987-48ed-8758-622ab308ca2a/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/5be09c82-6f9a-41b8-b0dd-2d5a43327cb4/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/8fe5b954-b16c-4e72-a4b9-be6b345d5a82/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/183e08ae-4769-4a9d-a7db-2d07ab487781/file?imageSize=1024x768 1024w)
338
Auto's & transport
Meerdere locaties (109)
](/a/auto%27s-transport-A3-37349)
* [
woensdag om 19:30
![](https://media.tbauctions.com/image-media/f1401ff5-4e5d-41e5-b4b2-6771fd7aad83/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/d788438c-5a47-4eeb-aced-4777c5bb4701/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/f888c72a-8756-4b83-994d-a4bb6a08eb05/file?imageSize=1024x768 1024w)
![](https://media.tbauctions.com/image-media/9566453d-2c19-431c-b5c6-521cfdc01594/file?imageSize=1024x768 1024w)
74
Gouden juwelen en diamanten
Meerdere locaties (28)
](/a/gouden-juwelen-en-diamanten-A1-29562)
Configure your devices to use the Pi-hole as their DNS server │
│ using: │
│ │
│ IPv4: 192.168.1.159 │
│ IPv6: fdc5:59a6:9ac1:f11f:2c86:25d3:6282:37ef │
│ If you have not done so already, the above IP should be set to │
│ static. │
│ View the web interface at http://pi.hole:80/admin or │
http://192.168.1.159:80/admin │
│ │
│ Your Admin Webpage login password is gYj7Enh- │
│ │
│ To allow your user to use all CLI functions without │
│ authentication, │
│ refer to https://docs.pi-hole.net/main/post-install/ │
├─────────────────────────────────────────────────────────────
127.0.0.1
192.168.1.159
::1
fdc5:59a6:9ac1:f11f:2c86:25d3:6282:37ef
fdc5:59a6:9ac1:f11f:bd8c:6e87:65f0:243c
fe80::a05b:bbc6:d47f:3002%enp9s0
2IXD-XJN9-C337-1K4Y-BBEO-HV1F-3BVI
https://ollama.lan:9443/#!/wizard - heel-goed-wachtwoord
[
{
"domain": "ollama.lan",
"answer": "192.168.1.159",
"enabled": true
},
{
"domain": "hephaestus.lan",
"answer": "192.168.1.159",
"enabled": true
},
{
"domain": "hermes.lan",
"answer": "192.168.137.239",
"enabled": true
},
{
"domain": "atlas.lan",
"answer": "192.168.1.100",
"enabled": true
},
{
"domain": "hub.lan",
"answer": "192.168.1.1",
"enabled": true
},
{
"domain": "ha.lan",
"answer": "192.168.1.193",
"enabled": true
}
]

View File

@@ -0,0 +1,326 @@
# Troostwijk Scraper - Architecture & Data Flow
## System Overview
The scraper follows a **3-phase hierarchical crawling pattern** to extract auction and lot data from the Troostwijk Auctions website.
## Architecture Diagram
```
┌─────────────────────────────────────────────────────────────────┐
│ TROOSTWIJK SCRAPER │
└─────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────┐
│ PHASE 1: COLLECT AUCTION URLs │
│ ┌──────────────┐ ┌──────────────┐ │
│ │ Listing Page │────────▶│ Extract /a/ │ │
│ │ /auctions? │ │ auction URLs │ │
│ │ page=1..N │ └──────────────┘ │
│ └──────────────┘ │ │
│ ▼ │
│ [ List of Auction URLs ] │
└─────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────┐
│ PHASE 2: EXTRACT LOT URLs FROM AUCTIONS │
│ ┌──────────────┐ ┌──────────────┐ │
│ │ Auction Page │────────▶│ Parse │ │
│ │ /a/... │ │ __NEXT_DATA__│ │
│ └──────────────┘ │ JSON │ │
│ │ └──────────────┘ │
│ │ │ │
│ ▼ ▼ │
│ ┌──────────────┐ ┌──────────────┐ │
│ │ Save Auction │ │ Extract /l/ │ │
│ │ Metadata │ │ lot URLs │ │
│ │ to DB │ └──────────────┘ │
│ └──────────────┘ │ │
│ ▼ │
│ [ List of Lot URLs ] │
└─────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────┐
│ PHASE 3: SCRAPE LOT DETAILS │
│ ┌──────────────┐ ┌──────────────┐ │
│ │ Lot Page │────────▶│ Parse │ │
│ │ /l/... │ │ __NEXT_DATA__│ │
│ └──────────────┘ │ JSON │ │
│ └──────────────┘ │
│ │ │
│ ┌─────────────────────────┴─────────────────┐ │
│ ▼ ▼ │
│ ┌──────────────┐ ┌──────────────┐ │
│ │ Save Lot │ │ Save Images │ │
│ │ Details │ │ URLs to DB │ │
│ │ to DB │ └──────────────┘ │
│ └──────────────┘ │ │
│ ▼ │
│ [Optional Download] │
└─────────────────────────────────────────────────────────────────┘
```
## Database Schema
```sql
CACHE TABLE (HTML Storage with Compression)
cache
url (TEXT, PRIMARY KEY)
content (BLOB) -- Compressed HTML (zlib)
timestamp (REAL)
status_code (INTEGER)
compressed (INTEGER) -- 1=compressed, 0=plain
AUCTIONS TABLE
auctions
auction_id (TEXT, PRIMARY KEY) -- e.g. "A7-39813"
url (TEXT, UNIQUE)
title (TEXT)
location (TEXT) -- e.g. "Cluj-Napoca, RO"
lots_count (INTEGER)
first_lot_closing_time (TEXT)
scraped_at (TEXT)
LOTS TABLE
lots
lot_id (TEXT, PRIMARY KEY) -- e.g. "A1-28505-5"
auction_id (TEXT) -- FK to auctions
url (TEXT, UNIQUE)
title (TEXT)
current_bid (TEXT) -- "€123.45" or "No bids"
bid_count (INTEGER)
closing_time (TEXT)
viewing_time (TEXT)
pickup_date (TEXT)
location (TEXT) -- e.g. "Dongen, NL"
description (TEXT)
category (TEXT)
scraped_at (TEXT)
FOREIGN KEY (auction_id) REFERENCES auctions(auction_id)
IMAGES TABLE (Image URLs & Download Status)
images -- THIS TABLE HOLDS IMAGE LINKS
id (INTEGER, PRIMARY KEY AUTOINCREMENT)
lot_id (TEXT) -- FK to lots
url (TEXT) -- Image URL
local_path (TEXT) -- Path after download
downloaded (INTEGER) -- 0=pending, 1=downloaded
FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
```
## Sequence Diagram
```
User Scraper Playwright Cache DB Data Tables
│ │ │ │ │
│ Run │ │ │ │
├──────────────▶│ │ │ │
│ │ │ │ │
│ │ Phase 1: Listing Pages │ │
│ ├───────────────▶│ │ │
│ │ goto() │ │ │
│ │◀───────────────┤ │ │
│ │ HTML │ │ │
│ ├───────────────────────────────▶│ │
│ │ compress & cache │ │
│ │ │ │ │
│ │ Phase 2: Auction Pages │ │
│ ├───────────────▶│ │ │
│ │◀───────────────┤ │ │
│ │ HTML │ │ │
│ │ │ │ │
│ │ Parse __NEXT_DATA__ JSON │ │
│ │────────────────────────────────────────────────▶│
│ │ │ │ INSERT auctions
│ │ │ │ │
│ │ Phase 3: Lot Pages │ │
│ ├───────────────▶│ │ │
│ │◀───────────────┤ │ │
│ │ HTML │ │ │
│ │ │ │ │
│ │ Parse __NEXT_DATA__ JSON │ │
│ │────────────────────────────────────────────────▶│
│ │ │ │ INSERT lots │
│ │────────────────────────────────────────────────▶│
│ │ │ │ INSERT images│
│ │ │ │ │
│ │ Export to CSV/JSON │ │
│ │◀────────────────────────────────────────────────┤
│ │ Query all data │ │
│◀──────────────┤ │ │ │
│ Results │ │ │ │
```
## Data Flow Details
### 1. **Page Retrieval & Caching**
```
Request URL
├──▶ Check cache DB (with timestamp validation)
│ │
│ ├─[HIT]──▶ Decompress (if compressed=1)
│ │ └──▶ Return HTML
│ │
│ └─[MISS]─▶ Fetch via Playwright
│ │
│ ├──▶ Compress HTML (zlib level 9)
│ │ ~70-90% size reduction
│ │
│ └──▶ Store in cache DB (compressed=1)
└──▶ Return HTML for parsing
```
### 2. **JSON Parsing Strategy**
```
HTML Content
└──▶ Extract <script id="__NEXT_DATA__">
├──▶ Parse JSON
│ │
│ ├─[has pageProps.lot]──▶ Individual LOT
│ │ └──▶ Extract: title, bid, location, images, etc.
│ │
│ └─[has pageProps.auction]──▶ AUCTION
│ │
│ ├─[has lots[] array]──▶ Auction with lots
│ │ └──▶ Extract: title, location, lots_count
│ │
│ └─[no lots[] array]──▶ Old format lot
│ └──▶ Parse as lot
└──▶ Fallback to HTML regex parsing (if JSON fails)
```
### 3. **Image Handling**
```
Lot Page Parsed
├──▶ Extract images[] from JSON
│ │
│ └──▶ INSERT INTO images (lot_id, url, downloaded=0)
└──▶ [If DOWNLOAD_IMAGES=True]
├──▶ Download each image
│ │
│ ├──▶ Save to: /images/{lot_id}/001.jpg
│ │
│ └──▶ UPDATE images SET local_path=?, downloaded=1
└──▶ Rate limit between downloads (0.5s)
```
## Key Configuration
| Setting | Value | Purpose |
|---------|-------|---------|
| `CACHE_DB` | `/mnt/okcomputer/output/cache.db` | SQLite database path |
| `IMAGES_DIR` | `/mnt/okcomputer/output/images` | Downloaded images storage |
| `RATE_LIMIT_SECONDS` | `0.5` | Delay between requests |
| `DOWNLOAD_IMAGES` | `False` | Toggle image downloading |
| `MAX_PAGES` | `50` | Number of listing pages to crawl |
## Output Files
```
/mnt/okcomputer/output/
├── cache.db # SQLite database (compressed HTML + data)
├── auctions_{timestamp}.json # Exported auctions
├── auctions_{timestamp}.csv # Exported auctions
├── lots_{timestamp}.json # Exported lots
├── lots_{timestamp}.csv # Exported lots
└── images/ # Downloaded images (if enabled)
├── A1-28505-5/
│ ├── 001.jpg
│ └── 002.jpg
└── A1-28505-6/
└── 001.jpg
```
## Extension Points for Integration
### 1. **Downstream Processing Pipeline**
```sql
-- Query images that have not been downloaded yet
SELECT lot_id, url FROM images WHERE downloaded = 0;
-- Process images: OCR, classification, etc.
-- Update status when complete
UPDATE images SET downloaded = 1, local_path = ? WHERE id = ?;
```
### 2. **Real-time Monitoring**
```sql
-- Count lots scraped in the last hour (run every N minutes to detect new lots)
SELECT COUNT(*) FROM lots WHERE scraped_at > datetime('now', '-1 hour');
-- Monitor bid changes
SELECT lot_id, current_bid, bid_count FROM lots WHERE bid_count > 0;
```
### 3. **Analytics & Reporting**
```sql
-- Top locations
SELECT location, COUNT(*) as lot_count FROM lots GROUP BY location;
-- Auction statistics
SELECT
a.auction_id,
a.title,
COUNT(l.lot_id) as actual_lots,
SUM(CASE WHEN l.bid_count > 0 THEN 1 ELSE 0 END) as lots_with_bids
FROM auctions a
LEFT JOIN lots l ON a.auction_id = l.auction_id
GROUP BY a.auction_id;
```
### 4. **Image Processing Integration**
```sql
-- Get all images for a lot
SELECT url, local_path FROM images WHERE lot_id = 'A1-28505-5';
-- Batch process unprocessed images
SELECT i.id, i.lot_id, i.local_path, l.title, l.category
FROM images i
JOIN lots l ON i.lot_id = l.lot_id
WHERE i.downloaded = 1 AND i.local_path IS NOT NULL;
```
## Performance Characteristics
- **Compression**: ~70-90% HTML size reduction (1GB → ~100-300MB)
- **Rate Limiting**: Exactly 0.5s between requests (respectful scraping)
- **Caching**: 24-hour default cache validity (configurable)
- **Throughput**: up to ~7,200 pages/hour (upper bound implied by the 0.5s rate limit: 3600s / 0.5s; page load time lowers the actual rate)
- **Scalability**: SQLite handles millions of rows efficiently
## Error Handling
- **Network failures**: Cached as status_code=500 and retried after cache expiry
- **Parse failures**: Falls back to HTML regex patterns
- **Compression errors**: Auto-detects and handles uncompressed legacy data
- **Missing fields**: Defaults to "No bids", empty string, or 0
## Rate Limiting & Ethics
- **REQUIRED**: 0.5 second delay between ALL requests
- **Respects cache**: Avoids unnecessary re-fetching
- **User-Agent**: Identifies as standard browser
- **No parallelization**: Single-threaded sequential crawling

View File

@@ -183,7 +183,7 @@ The scraper works fine despite these warnings.
## Full Documentation
See [README.md](README.md) for complete documentation including:
See [README.md](../README.md) for complete documentation including:
- Email setup details
- YOLO installation guide
- Configuration options