This commit is contained in:

pom.xml (8 changed lines)
@@ -14,8 +14,8 @@

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-       <maven.compiler.source>21</maven.compiler.source>
-       <maven.compiler.target>21</maven.compiler.target>
+       <maven.compiler.source>25</maven.compiler.source>
+       <maven.compiler.target>25</maven.compiler.target>
        <jackson.version>2.17.0</jackson.version>
        <opencv.version>4.9.0-0</opencv.version>
    </properties>
@@ -93,8 +93,8 @@
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.11.0</version>
                <configuration>
-                   <source>21</source>
-                   <target>21</target>
+                   <source>25</source>
+                   <target>25</target>
                </configuration>
            </plugin>
src/main/java/com/auction/AuctionInfo.java (new file, 24 lines)
@@ -0,0 +1,24 @@
package com.auction;

import java.time.LocalDateTime;
/**
 * Represents auction metadata ("veiling informatie", Dutch for auction information)
 */
public final class AuctionInfo {

    public int auctionId;             // Unique auction ID (from URL)
    public String title;              // Auction title
    public String location;           // Location (e.g., "Amsterdam, NL")
    public String city;               // City name
    public String country;            // Country code (e.g., "NL")
    public String url;                // Full auction URL
    public String type;               // Auction type (A1 or A7)
    public int lotCount;              // Number of lots/kavels
    public LocalDateTime closingTime; // Closing time if available

    @Override
    public String toString() {
        return String.format("Auction{id=%d, type=%s, title='%s', location='%s', lots=%d, url='%s'}",
                auctionId, type, title, location, lotCount, url);
    }
}
src/main/java/com/auction/CacheDatabase.java (new file, 165 lines)
@@ -0,0 +1,165 @@
|
||||
package com.auction;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.PreparedStatement;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
import java.time.Instant;
|
||||
/**
|
||||
* SQLite-based caching system for HTML pages with expiration support
|
||||
*/
|
||||
class CacheDatabase {
|
||||
|
||||
private final String dbPath;
|
||||
private Connection connection;
|
||||
|
||||
public CacheDatabase(String dbPath) {
|
||||
this.dbPath = dbPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize database and create schema
|
||||
*/
|
||||
public void initialize() throws SQLException, IOException {
|
||||
// Create cache directory if it doesn't exist
|
||||
var cacheDir = Paths.get(dbPath).getParent();
|
||||
if (cacheDir != null) {
|
||||
Files.createDirectories(cacheDir);
|
||||
}
|
||||
|
||||
connection = DriverManager.getConnection("jdbc:sqlite:" + dbPath);
|
||||
|
||||
// Create cache table with URL as primary key
|
||||
var createTable = "CREATE TABLE IF NOT EXISTS page_cache (\n" +
|
||||
" url TEXT PRIMARY KEY,\n" +
|
||||
" html TEXT NOT NULL,\n" +
|
||||
" cached_at INTEGER NOT NULL,\n" +
|
||||
" expires_at INTEGER NOT NULL\n" +
|
||||
")\n";
|
||||
|
||||
try (var stmt = connection.createStatement()) {
|
||||
stmt.execute(createTable);
|
||||
// Create index on expires_at for efficient cleanup
|
||||
stmt.execute("CREATE INDEX IF NOT EXISTS idx_expires_at ON page_cache(expires_at)");
|
||||
}
|
||||
|
||||
// Clean up expired entries on initialization
|
||||
cleanupExpired();
|
||||
|
||||
System.out.println("✓ Cache database initialized");
|
||||
}
|
||||
|
||||
/**
|
||||
* Get cached HTML for a URL if it exists and hasn't expired
|
||||
*
|
||||
* @param url The URL to look up
|
||||
* @return Cached HTML or null if not found/expired
|
||||
*/
|
||||
public synchronized String get(String url) {
|
||||
var sql = "SELECT html FROM page_cache WHERE url = ? AND expires_at > ?";
|
||||
|
||||
try (var ps = connection.prepareStatement(sql)) {
|
||||
ps.setString(1, url);
|
||||
ps.setLong(2, Instant.now().getEpochSecond());
|
||||
|
||||
var rs = ps.executeQuery();
|
||||
if (rs.next()) {
|
||||
return rs.getString("html");
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
System.err.println("Cache read error: " + e.getMessage());
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Store HTML in cache with expiration time
|
||||
*
|
||||
* @param url The URL to cache
|
||||
* @param html The HTML content
|
||||
* @param expirationHours Hours until cache expires
|
||||
*/
|
||||
public synchronized void put(String url, String html, long expirationHours) {
|
||||
var sql = "INSERT OR REPLACE INTO page_cache (url, html, cached_at, expires_at)\n" +
|
||||
"VALUES (?, ?, ?, ?)\n";
|
||||
|
||||
var now = Instant.now().getEpochSecond();
|
||||
var expiresAt = now + (expirationHours * 3600);
|
||||
|
||||
try (var ps = connection.prepareStatement(sql)) {
|
||||
ps.setString(1, url);
|
||||
ps.setString(2, html);
|
||||
ps.setLong(3, now);
|
||||
ps.setLong(4, expiresAt);
|
||||
ps.executeUpdate();
|
||||
} catch (SQLException e) {
|
||||
System.err.println("Cache write error: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove expired cache entries
|
||||
*/
|
||||
public synchronized void cleanupExpired() {
|
||||
var sql = "DELETE FROM page_cache WHERE expires_at <= ?";
|
||||
|
||||
try (var ps = connection.prepareStatement(sql)) {
|
||||
ps.setLong(1, Instant.now().getEpochSecond());
|
||||
var deleted = ps.executeUpdate();
|
||||
if (deleted > 0) {
|
||||
System.out.println("✓ Cleaned up " + deleted + " expired cache entries");
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
System.err.println("Cache cleanup error: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get cache statistics
|
||||
*/
|
||||
public synchronized void printStats() {
|
||||
var sql = "SELECT COUNT(*) as total, " +
|
||||
"SUM(CASE WHEN expires_at > ? THEN 1 ELSE 0 END) as valid, " +
|
||||
"SUM(LENGTH(html)) as total_size " +
|
||||
"FROM page_cache";
|
||||
|
||||
try (var ps = connection.prepareStatement(sql)) {
|
||||
ps.setLong(1, Instant.now().getEpochSecond());
|
||||
var rs = ps.executeQuery();
|
||||
|
||||
if (rs.next()) {
|
||||
var total = rs.getInt("total");
|
||||
var valid = rs.getInt("valid");
|
||||
var size = rs.getLong("total_size");
|
||||
|
||||
System.out.println("\n=== Cache Statistics ===");
|
||||
System.out.println("Total entries: " + total);
|
||||
System.out.println("Valid entries: " + valid);
|
||||
System.out.println("Expired entries: " + (total - valid));
|
||||
System.out.println("Total size: " + (size / 1024) + " KB");
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
System.err.println("Cache stats error: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Close database connection
|
||||
*/
|
||||
public void close() {
|
||||
if (connection != null) {
|
||||
try {
|
||||
connection.close();
|
||||
} catch (SQLException e) {
|
||||
System.err.println("Error closing cache database: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
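For context, here is a minimal sketch (not part of the commit) of how CacheDatabase might be driven by a caller, using only the methods shown above. The example class name, cache path, and URL are illustrative placeholders.

package com.auction;

// Hypothetical caller for CacheDatabase; the path and URL below are placeholders.
public class CacheDatabaseExample {
    public static void main(String[] args) throws Exception {
        var cache = new CacheDatabase("cache/page_cache.db");
        cache.initialize();                 // creates table + index, purges expired rows

        var url = "https://www.troostwijkauctions.com/auctions?page=1";
        var html = cache.get(url);          // null if missing or expired
        if (html == null) {
            html = "<html>...</html>";      // would normally come from a real page fetch
            cache.put(url, html, 24);       // keep for 24 hours
        }

        cache.printStats();
        cache.close();
    }
}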
src/main/java/com/auction/DatabaseService.java (new file, 303 lines)
@@ -0,0 +1,303 @@
|
||||
package com.auction;
|
||||
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.PreparedStatement;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
import java.time.Instant;
|
||||
import java.time.LocalDateTime;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
/**
|
||||
* Service for persisting auctions, lots, images, and object labels into
|
||||
* a SQLite database. Uses the Xerial JDBC driver which connects to
|
||||
 * SQLite via a URL of the form "jdbc:sqlite:path_to_file".
|
||||
*/
|
||||
public class DatabaseService {
|
||||
|
||||
private final String url;
|
||||
DatabaseService(String dbPath) {
|
||||
this.url = "jdbc:sqlite:" + dbPath;
|
||||
}
|
||||
/**
|
||||
* Creates tables if they do not already exist. The schema includes
|
||||
* tables for auctions, lots, images, and object labels. This method is
|
||||
* idempotent; it can be called multiple times.
|
||||
*/
|
||||
void ensureSchema() throws SQLException {
|
||||
try (var conn = DriverManager.getConnection(url); var stmt = conn.createStatement()) {
|
||||
// Auctions table (veilingen)
|
||||
stmt.execute("CREATE TABLE IF NOT EXISTS auctions ("
|
||||
+ "auction_id INTEGER PRIMARY KEY,"
|
||||
+ "title TEXT NOT NULL,"
|
||||
+ "location TEXT,"
|
||||
+ "city TEXT,"
|
||||
+ "country TEXT,"
|
||||
+ "url TEXT NOT NULL,"
|
||||
+ "type TEXT,"
|
||||
+ "lot_count INTEGER DEFAULT 0,"
|
||||
+ "closing_time TEXT,"
|
||||
+ "discovered_at INTEGER" // Unix timestamp
|
||||
+ ")");
|
||||
|
||||
// Sales table (legacy - keep for compatibility)
|
||||
stmt.execute("CREATE TABLE IF NOT EXISTS sales ("
|
||||
+ "sale_id INTEGER PRIMARY KEY,"
|
||||
+ "title TEXT,"
|
||||
+ "location TEXT,"
|
||||
+ "closing_time TEXT"
|
||||
+ ")");
|
||||
|
||||
// Lots table
|
||||
stmt.execute("CREATE TABLE IF NOT EXISTS lots ("
|
||||
+ "lot_id INTEGER PRIMARY KEY,"
|
||||
+ "sale_id INTEGER,"
|
||||
+ "title TEXT,"
|
||||
+ "description TEXT,"
|
||||
+ "manufacturer TEXT,"
|
||||
+ "type TEXT,"
|
||||
+ "year INTEGER,"
|
||||
+ "category TEXT,"
|
||||
+ "current_bid REAL,"
|
||||
+ "currency TEXT,"
|
||||
+ "url TEXT,"
|
||||
+ "closing_time TEXT,"
|
||||
+ "closing_notified INTEGER DEFAULT 0,"
|
||||
+ "FOREIGN KEY (sale_id) REFERENCES auctions(auction_id)"
|
||||
+ ")");
|
||||
|
||||
// Images table
|
||||
stmt.execute("CREATE TABLE IF NOT EXISTS images ("
|
||||
+ "id INTEGER PRIMARY KEY AUTOINCREMENT,"
|
||||
+ "lot_id INTEGER,"
|
||||
+ "url TEXT,"
|
||||
+ "file_path TEXT,"
|
||||
+ "labels TEXT,"
|
||||
+ "FOREIGN KEY (lot_id) REFERENCES lots(lot_id)"
|
||||
+ ")");
|
||||
|
||||
// Create indexes for better query performance
|
||||
stmt.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country)");
|
||||
stmt.execute("CREATE INDEX IF NOT EXISTS idx_lots_sale_id ON lots(sale_id)");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Inserts or updates an auction record
|
||||
*/
|
||||
synchronized void upsertAuction(AuctionInfo auction) throws SQLException {
|
||||
var sql = "INSERT INTO auctions (auction_id, title, location, city, country, url, type, lot_count, closing_time, discovered_at)"
|
||||
+ " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
|
||||
+ " ON CONFLICT(auction_id) DO UPDATE SET "
|
||||
+ "title = excluded.title, location = excluded.location, city = excluded.city, "
|
||||
+ "country = excluded.country, url = excluded.url, type = excluded.type, "
|
||||
+ "lot_count = excluded.lot_count, closing_time = excluded.closing_time";
|
||||
|
||||
try (var conn = DriverManager.getConnection(url); var ps = conn.prepareStatement(sql)) {
|
||||
ps.setInt(1, auction.auctionId);
|
||||
ps.setString(2, auction.title);
|
||||
ps.setString(3, auction.location);
|
||||
ps.setString(4, auction.city);
|
||||
ps.setString(5, auction.country);
|
||||
ps.setString(6, auction.url);
|
||||
ps.setString(7, auction.type);
|
||||
ps.setInt(8, auction.lotCount);
|
||||
ps.setString(9, auction.closingTime != null ? auction.closingTime.toString() : null);
|
||||
ps.setLong(10, Instant.now().getEpochSecond());
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves all auctions from the database
|
||||
*/
|
||||
synchronized List<AuctionInfo> getAllAuctions() throws SQLException {
|
||||
List<AuctionInfo> auctions = new ArrayList<>();
|
||||
var sql = "SELECT auction_id, title, location, city, country, url, type, lot_count, closing_time FROM auctions";
|
||||
|
||||
try (var conn = DriverManager.getConnection(url); var stmt = conn.createStatement()) {
|
||||
var rs = stmt.executeQuery(sql);
|
||||
while (rs.next()) {
|
||||
var auction = new AuctionInfo();
|
||||
auction.auctionId = rs.getInt("auction_id");
|
||||
auction.title = rs.getString("title");
|
||||
auction.location = rs.getString("location");
|
||||
auction.city = rs.getString("city");
|
||||
auction.country = rs.getString("country");
|
||||
auction.url = rs.getString("url");
|
||||
auction.type = rs.getString("type");
|
||||
auction.lotCount = rs.getInt("lot_count");
|
||||
var closing = rs.getString("closing_time");
|
||||
if (closing != null) {
|
||||
auction.closingTime = LocalDateTime.parse(closing);
|
||||
}
|
||||
auctions.add(auction);
|
||||
}
|
||||
}
|
||||
return auctions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves auctions by country code
|
||||
*/
|
||||
synchronized List<AuctionInfo> getAuctionsByCountry(String countryCode) throws SQLException {
|
||||
List<AuctionInfo> auctions = new ArrayList<>();
|
||||
var sql = "SELECT auction_id, title, location, city, country, url, type, lot_count, closing_time "
|
||||
+ "FROM auctions WHERE country = ?";
|
||||
|
||||
try (var conn = DriverManager.getConnection(url); var ps = conn.prepareStatement(sql)) {
|
||||
ps.setString(1, countryCode);
|
||||
var rs = ps.executeQuery();
|
||||
while (rs.next()) {
|
||||
var auction = new AuctionInfo();
|
||||
auction.auctionId = rs.getInt("auction_id");
|
||||
auction.title = rs.getString("title");
|
||||
auction.location = rs.getString("location");
|
||||
auction.city = rs.getString("city");
|
||||
auction.country = rs.getString("country");
|
||||
auction.url = rs.getString("url");
|
||||
auction.type = rs.getString("type");
|
||||
auction.lotCount = rs.getInt("lot_count");
|
||||
var closing = rs.getString("closing_time");
|
||||
if (closing != null) {
|
||||
auction.closingTime = LocalDateTime.parse(closing);
|
||||
}
|
||||
auctions.add(auction);
|
||||
}
|
||||
}
|
||||
return auctions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Inserts or updates a lot record. Uses INSERT OR REPLACE to
|
||||
* implement upsert semantics so that existing rows are replaced.
|
||||
*/
|
||||
synchronized void upsertLot(Lot lot) throws SQLException {
|
||||
var sql = "INSERT INTO lots (lot_id, sale_id, title, description, manufacturer, type, year, category, current_bid, currency, url, closing_time, closing_notified)"
|
||||
+ " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
|
||||
+ " ON CONFLICT(lot_id) DO UPDATE SET "
|
||||
+ "sale_id = excluded.sale_id, title = excluded.title, description = excluded.description, "
|
||||
+ "manufacturer = excluded.manufacturer, type = excluded.type, year = excluded.year, category = excluded.category, "
|
||||
+ "current_bid = excluded.current_bid, currency = excluded.currency, url = excluded.url, closing_time = excluded.closing_time";
|
||||
try (var conn = DriverManager.getConnection(url); var ps = conn.prepareStatement(sql)) {
|
||||
ps.setInt(1, lot.lotId);
|
||||
ps.setInt(2, lot.saleId);
|
||||
ps.setString(3, lot.title);
|
||||
ps.setString(4, lot.description);
|
||||
ps.setString(5, lot.manufacturer);
|
||||
ps.setString(6, lot.type);
|
||||
ps.setInt(7, lot.year);
|
||||
ps.setString(8, lot.category);
|
||||
ps.setDouble(9, lot.currentBid);
|
||||
ps.setString(10, lot.currency);
|
||||
ps.setString(11, lot.url);
|
||||
ps.setString(12, lot.closingTime != null ? lot.closingTime.toString() : null);
|
||||
ps.setInt(13, lot.closingNotified ? 1 : 0);
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Inserts a new image record. Each image is associated with a lot and
|
||||
* stores both the original URL and the local file path. Detected
|
||||
* labels are stored as a comma separated string.
|
||||
*/
|
||||
synchronized void insertImage(int lotId, String url, String filePath, List<String> labels) throws SQLException {
|
||||
var sql = "INSERT INTO images (lot_id, url, file_path, labels) VALUES (?, ?, ?, ?)";
|
||||
try (var conn = DriverManager.getConnection(this.url); var ps = conn.prepareStatement(sql)) {
|
||||
ps.setInt(1, lotId);
|
||||
ps.setString(2, url);
|
||||
ps.setString(3, filePath);
|
||||
ps.setString(4, String.join(",", labels));
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves all lots that are still active (i.e., have a closing time
|
||||
* in the future or unknown). Only these lots need to be monitored.
|
||||
*/
|
||||
synchronized List<Lot> getActiveLots() throws SQLException {
|
||||
List<Lot> list = new ArrayList<>();
|
||||
var sql = "SELECT lot_id, sale_id, current_bid, currency, closing_time, closing_notified FROM lots";
|
||||
try (var conn = DriverManager.getConnection(url); var stmt = conn.createStatement()) {
|
||||
var rs = stmt.executeQuery(sql);
|
||||
while (rs.next()) {
|
||||
var lot = new Lot();
|
||||
lot.lotId = rs.getInt("lot_id");
|
||||
lot.saleId = rs.getInt("sale_id");
|
||||
lot.currentBid = rs.getDouble("current_bid");
|
||||
lot.currency = rs.getString("currency");
|
||||
var closing = rs.getString("closing_time");
|
||||
lot.closingNotified = rs.getInt("closing_notified") != 0;
|
||||
if (closing != null) {
|
||||
lot.closingTime = LocalDateTime.parse(closing);
|
||||
}
|
||||
list.add(lot);
|
||||
}
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves all lots from the database.
|
||||
*/
|
||||
synchronized List<Lot> getAllLots() throws SQLException {
|
||||
List<Lot> list = new ArrayList<>();
|
||||
var sql = "SELECT lot_id, sale_id, title, current_bid, currency FROM lots";
|
||||
try (var conn = DriverManager.getConnection(url); var stmt = conn.createStatement()) {
|
||||
var rs = stmt.executeQuery(sql);
|
||||
while (rs.next()) {
|
||||
var lot = new Lot();
|
||||
lot.lotId = rs.getInt("lot_id");
|
||||
lot.saleId = rs.getInt("sale_id");
|
||||
lot.title = rs.getString("title");
|
||||
lot.currentBid = rs.getDouble("current_bid");
|
||||
lot.currency = rs.getString("currency");
|
||||
list.add(lot);
|
||||
}
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the total number of images in the database.
|
||||
*/
|
||||
synchronized int getImageCount() throws SQLException {
|
||||
var sql = "SELECT COUNT(*) as count FROM images";
|
||||
try (var conn = DriverManager.getConnection(url); var stmt = conn.createStatement()) {
|
||||
var rs = stmt.executeQuery(sql);
|
||||
if (rs.next()) {
|
||||
return rs.getInt("count");
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates the current bid of a lot after a bid refresh.
|
||||
*/
|
||||
synchronized void updateLotCurrentBid(Lot lot) throws SQLException {
|
||||
try (var conn = DriverManager.getConnection(url); var ps = conn.prepareStatement(
|
||||
"UPDATE lots SET current_bid = ? WHERE lot_id = ?")) {
|
||||
ps.setDouble(1, lot.currentBid);
|
||||
ps.setInt(2, lot.lotId);
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates the closingNotified flag of a lot (set to 1 when we have
|
||||
* warned the user about its imminent closure).
|
||||
*/
|
||||
synchronized void updateLotNotificationFlags(Lot lot) throws SQLException {
|
||||
try (var conn = DriverManager.getConnection(url); var ps = conn.prepareStatement(
|
||||
"UPDATE lots SET closing_notified = ? WHERE lot_id = ?")) {
|
||||
ps.setInt(1, lot.closingNotified ? 1 : 0);
|
||||
ps.setInt(2, lot.lotId);
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
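For reference, a short sketch (not part of the commit) of the intended call sequence for DatabaseService, assuming only the schema and methods shown above. All field values and the example class name are invented.

package com.auction;

import java.time.LocalDateTime;

// Hypothetical walkthrough of DatabaseService; all values below are invented.
public class DatabaseServiceExample {
    public static void main(String[] args) throws Exception {
        var db = new DatabaseService("troostwijk.db");
        db.ensureSchema();                       // idempotent: safe to call on every start

        var auction = new AuctionInfo();
        auction.auctionId = 12345;
        auction.title = "Example Auction";
        auction.city = "Amsterdam";
        auction.country = "NL";
        auction.location = "Amsterdam, NL";
        auction.url = "https://www.troostwijkauctions.com/a/example-A1-12345";
        auction.type = "A1";
        auction.lotCount = 10;
        auction.closingTime = LocalDateTime.now().plusDays(3);
        db.upsertAuction(auction);               // insert or update keyed on auction_id

        for (var a : db.getAuctionsByCountry("NL")) {
            System.out.println(a);
        }
        System.out.println("Images stored: " + db.getImageCount());
    }
}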
src/main/java/com/auction/Lot.java (new file, 29 lines)
@@ -0,0 +1,29 @@
package com.auction;

import java.time.LocalDateTime;
/**
 * Simple POJO representing a lot (kavel) in an auction. It keeps track
 * of the sale it belongs to, current bid and closing time. The method
 * minutesUntilClose computes how many minutes remain until the lot closes.
 */
final class Lot {

    int saleId;
    int lotId;
    String title;
    String description;
    String manufacturer;
    String type;
    int year;
    String category;
    double currentBid;
    String currency;
    String url;
    LocalDateTime closingTime; // null if unknown
    boolean closingNotified;

    long minutesUntilClose() {
        if (closingTime == null) return Long.MAX_VALUE;
        return java.time.Duration.between(LocalDateTime.now(), closingTime).toMinutes();
    }
}
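A tiny sketch (not part of the commit) of the minutesUntilClose contract, based only on the class above; the example class name and values are hypothetical.

package com.auction;

import java.time.LocalDateTime;

// Hypothetical illustration of Lot.minutesUntilClose(); values are invented.
public class LotExample {
    public static void main(String[] args) {
        var lot = new Lot();
        lot.lotId = 99;
        lot.closingTime = LocalDateTime.now().plusMinutes(45);
        System.out.println("Closes in ~" + lot.minutesUntilClose() + " minutes");

        lot.closingTime = null;                        // unknown closing time
        System.out.println(lot.minutesUntilClose());   // Long.MAX_VALUE sentinel
    }
}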
src/main/java/com/auction/Main.java
@@ -1,23 +1,82 @@
 package com.auction;

+import org.opencv.core.Core;
+import java.util.List;
 public class Main {
-    public static void main(String[] args) {
+    public static void main2(String[] args) {
        // If arguments are passed, this is likely a one-off command via dokku run
        // Just exit immediately to allow the command to run
        if (args.length > 0) {
-            System.out.println("Command mode - exiting to allow shell commands");
+            IO.println("Command mode - exiting to allow shell commands");
            return;
        }

-        System.out.println("Starting Troostwijk Auction Scraper...");
-        System.out.println("Container is running and healthy.");
+        IO.println("Starting Troostwijk Auction Scraper...");
+        IO.println("Container is running and healthy.");

        // Keep container alive
        try {
            Thread.sleep(Long.MAX_VALUE);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
-            System.out.println("Container interrupted, exiting.");
+            IO.println("Container interrupted, exiting.");
        }
    }
+    /**
+     * Entry point. Configure database location, notification settings, and
+     * YOLO model paths here before running. Once started the scraper
+     * discovers Dutch auctions, scrapes lots, and begins monitoring.
+     */
+    public static void main(String[] args) throws Exception {
+        IO.println("=== Troostwijk Auction Scraper ===\n");
+
+        // Configuration parameters (replace with your own values)
+        String databaseFile = "troostwijk.db";
+
+        // Notification configuration - choose one:
+        // Option 1: Desktop notifications only (free, no setup required)
+        String notificationConfig = System.getenv().getOrDefault("NOTIFICATION_CONFIG", "desktop");
+
+        // Option 2: Desktop + Email via Gmail (free, requires Gmail app password)
+        // Format: "smtp:username:appPassword:toEmail"
+        // Example: "smtp:your.email@gmail.com:abcd1234efgh5678:recipient@example.com"
+        // Get app password: Google Account > Security > 2-Step Verification > App passwords
+
+        // YOLO model paths (optional - scraper works without object detection)
+        String yoloCfg = "models/yolov4.cfg";
+        String yoloWeights = "models/yolov4.weights";
+        String yoloClasses = "models/coco.names";
+
+        // Load native OpenCV library
+        System.loadLibrary(Core.NATIVE_LIBRARY_NAME);
+
+        IO.println("Initializing scraper...");
+        TroostwijkScraper scraper = new TroostwijkScraper(databaseFile, notificationConfig, "",
+                yoloCfg, yoloWeights, yoloClasses);
+
+        // Step 1: Discover auctions in NL
+        IO.println("\n[1/3] Discovering Dutch auctions...");
+        List<Integer> auctions = scraper.discoverDutchAuctions();
+        IO.println("✓ Found " + auctions.size() + " auctions: " + auctions);
+
+        // Step 2: Fetch lots for each auction
+        IO.println("\n[2/3] Fetching lot details...");
+        int totalAuctions = auctions.size();
+        int currentAuction = 0;
+        for (int saleId : auctions) {
+            currentAuction++;
+            IO.println("  [Page " + currentAuction + "] Fetching auctions...");
+            IO.println("  [" + currentAuction + "/" + totalAuctions + "] Processing sale " + saleId + "...");
+            scraper.fetchLotsForSale(saleId);
+        }
+
+        // Show database summary
+        IO.println("\n📊 Database Summary:");
+        scraper.printDatabaseStats();
+
+        // Step 3: Start monitoring bids and closures
+        IO.println("\n[3/3] Starting monitoring service...");
+        scraper.scheduleMonitoring();
+        IO.println("✓ Monitoring active. Press Ctrl+C to stop.\n");
+    }
 }
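A note on the new entry point: the old container keep-alive main was renamed to main2, and a full scraper pipeline became main. The IO.println calls presumably rely on the java.lang.IO console helper available when targeting Java 25, which would also explain the compiler bump in pom.xml. Below is a small sketch (hypothetical class name, placeholder SMTP string) of the configuration lookup the new main performs:

package com.auction;

// Hypothetical sketch of the notification configuration lookup used by Main.
// The SMTP string mentioned below is a placeholder, not a working credential.
public class NotificationConfigExample {
    public static void main(String[] args) {
        // Same lookup as Main: env var NOTIFICATION_CONFIG, defaulting to "desktop".
        String config = System.getenv().getOrDefault("NOTIFICATION_CONFIG", "desktop");

        // Accepted shapes (see NotificationService below):
        //   "desktop"                                   -> system-tray notifications only
        //   "smtp:user@gmail.com:appPassword:to@x.com"  -> desktop + Gmail SMTP
        System.out.println("Using notification config: " + config);
    }
}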
src/main/java/com/auction/NotificationService.java (new file, 156 lines)
@@ -0,0 +1,156 @@
|
||||
package com.auction;
|
||||
|
||||
import javax.mail.Authenticator;
|
||||
import javax.mail.Message.RecipientType;
|
||||
import javax.mail.PasswordAuthentication;
|
||||
import javax.mail.Session;
|
||||
import javax.mail.Transport;
|
||||
import javax.mail.internet.InternetAddress;
|
||||
import javax.mail.internet.MimeMessage;
|
||||
import java.awt.SystemTray;
|
||||
import java.awt.Toolkit;
|
||||
import java.awt.TrayIcon;
|
||||
import java.awt.TrayIcon.MessageType;
|
||||
import java.util.Date;
|
||||
import java.util.Properties;
|
||||
/**
|
||||
* Service for sending notifications via desktop notifications and/or email.
|
||||
* Supports free notification methods:
|
||||
* 1. Desktop notifications (Windows/Linux/macOS system tray)
|
||||
* 2. Email via Gmail SMTP (free, requires app password)
|
||||
*
|
||||
* Configuration:
|
||||
* - For email: Set notificationEmail to your Gmail address
|
||||
* - Enable 2FA in Gmail and create an App Password
|
||||
* - Use format "smtp:username:appPassword:toEmail" for credentials
|
||||
* - Or use "desktop" for desktop-only notifications
|
||||
*/
|
||||
class NotificationService {
|
||||
|
||||
private final boolean useDesktop;
|
||||
private final boolean useEmail;
|
||||
private final String smtpUsername;
|
||||
private final String smtpPassword;
|
||||
private final String toEmail;
|
||||
|
||||
/**
|
||||
* Creates a notification service.
|
||||
*
|
||||
* @param config "desktop" for desktop only, or "smtp:username:password:toEmail" for email
|
||||
* @param unusedParam Kept for compatibility (can pass empty string)
|
||||
*/
|
||||
NotificationService(String config, String unusedParam) {
|
||||
|
||||
if ("desktop".equalsIgnoreCase(config)) {
|
||||
this.useDesktop = true;
|
||||
this.useEmail = false;
|
||||
this.smtpUsername = null;
|
||||
this.smtpPassword = null;
|
||||
this.toEmail = null;
|
||||
} else if (config.startsWith("smtp:")) {
|
||||
var parts = config.split(":", 4);
|
||||
if (parts.length != 4) {
|
||||
throw new IllegalArgumentException("Email config must be 'smtp:username:password:toEmail'");
|
||||
}
|
||||
this.useDesktop = true; // Always include desktop
|
||||
this.useEmail = true;
|
||||
this.smtpUsername = parts[1];
|
||||
this.smtpPassword = parts[2];
|
||||
this.toEmail = parts[3];
|
||||
} else {
|
||||
throw new IllegalArgumentException("Config must be 'desktop' or 'smtp:username:password:toEmail'");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends notification via configured channels.
|
||||
*
|
||||
* @param message The message body
|
||||
* @param title Message title
|
||||
* @param priority Priority level (0=normal, 1=high)
|
||||
*/
|
||||
void sendNotification(String message, String title, int priority) {
|
||||
if (useDesktop) {
|
||||
sendDesktopNotification(title, message, priority);
|
||||
}
|
||||
if (useEmail) {
|
||||
sendEmailNotification(title, message, priority);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends a desktop notification using system tray.
|
||||
* Works on Windows, macOS, and Linux with desktop environments.
|
||||
*/
|
||||
private void sendDesktopNotification(String title, String message, int priority) {
|
||||
try {
|
||||
if (SystemTray.isSupported()) {
|
||||
var tray = SystemTray.getSystemTray();
|
||||
var image = Toolkit.getDefaultToolkit()
|
||||
.createImage(new byte[0]); // Empty image
|
||||
|
||||
var trayIcon = new TrayIcon(image, "Troostwijk Scraper");
|
||||
trayIcon.setImageAutoSize(true);
|
||||
|
||||
var messageType = priority > 0
|
||||
? MessageType.WARNING
|
||||
: MessageType.INFO;
|
||||
|
||||
tray.add(trayIcon);
|
||||
trayIcon.displayMessage(title, message, messageType);
|
||||
|
||||
// Remove icon after 2 seconds to avoid clutter
|
||||
Thread.sleep(2000);
|
||||
tray.remove(trayIcon);
|
||||
|
||||
IO.println("Desktop notification sent: " + title);
|
||||
} else {
|
||||
IO.println("Desktop notifications not supported, logging: " + title + " - " + message);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
System.err.println("Desktop notification failed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends email notification via Gmail SMTP (free).
|
||||
* Uses Gmail's SMTP server with app password authentication.
|
||||
*/
|
||||
private void sendEmailNotification(String title, String message, int priority) {
|
||||
try {
|
||||
var props = new Properties();
|
||||
props.put("mail.smtp.auth", "true");
|
||||
props.put("mail.smtp.starttls.enable", "true");
|
||||
props.put("mail.smtp.host", "smtp.gmail.com");
|
||||
props.put("mail.smtp.port", "587");
|
||||
props.put("mail.smtp.ssl.trust", "smtp.gmail.com");
|
||||
|
||||
var session = Session.getInstance(props,
|
||||
new Authenticator() {
|
||||
|
||||
protected PasswordAuthentication getPasswordAuthentication() {
|
||||
return new PasswordAuthentication(smtpUsername, smtpPassword);
|
||||
}
|
||||
});
|
||||
|
||||
var msg = new MimeMessage(session);
|
||||
msg.setFrom(new InternetAddress(smtpUsername));
|
||||
msg.setRecipients(RecipientType.TO,
|
||||
InternetAddress.parse(toEmail));
|
||||
msg.setSubject("[Troostwijk] " + title);
|
||||
msg.setText(message);
|
||||
msg.setSentDate(new Date());
|
||||
|
||||
if (priority > 0) {
|
||||
msg.setHeader("X-Priority", "1");
|
||||
msg.setHeader("Importance", "High");
|
||||
}
|
||||
|
||||
Transport.send(msg);
|
||||
IO.println("Email notification sent: " + title);
|
||||
|
||||
} catch (Exception e) {
|
||||
System.err.println("Email notification failed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
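For context, a minimal sketch (not part of the commit) of how NotificationService might be used, based only on the constructor and sendNotification signature shown above; the credentials are placeholders copied from the format documented in the class.

package com.auction;

// Hypothetical usage of NotificationService; credentials below are placeholders.
public class NotificationExample {
    public static void main(String[] args) {
        // Desktop-only notifications (no credentials needed).
        var desktopOnly = new NotificationService("desktop", "");
        desktopOnly.sendNotification("Lot 12345 closes in 10 minutes", "Closing soon", 1);

        // Desktop + Gmail SMTP, using the "smtp:username:appPassword:toEmail" format.
        var withEmail = new NotificationService(
                "smtp:your.email@gmail.com:abcd1234efgh5678:recipient@example.com", "");
        withEmail.sendNotification("Found 7 new Dutch auctions", "Scrape finished", 0);
    }
}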
src/main/java/com/auction/ObjectDetectionService.java (new file, 140 lines)
@@ -0,0 +1,140 @@
|
||||
package com.auction;
|
||||
|
||||
import org.opencv.core.Mat;
|
||||
import org.opencv.core.Scalar;
|
||||
import org.opencv.core.Size;
|
||||
import org.opencv.dnn.Dnn;
|
||||
import org.opencv.dnn.Net;
|
||||
import org.opencv.imgcodecs.Imgcodecs;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import static org.opencv.dnn.Dnn.DNN_BACKEND_OPENCV;
|
||||
import static org.opencv.dnn.Dnn.DNN_TARGET_CPU;
|
||||
/**
|
||||
* Service for performing object detection on images using OpenCV's DNN
|
||||
* module. The DNN module can load pre‑trained models from several
|
||||
 * frameworks (Darknet, TensorFlow, ONNX, etc.). Here
|
||||
* we load a YOLO model (Darknet) by specifying the configuration and
|
||||
* weights files. For each image we run a forward pass and return a
|
||||
* list of detected class labels.
|
||||
*
|
||||
* If model files are not found, the service operates in disabled mode
|
||||
* and returns empty lists.
|
||||
*/
|
||||
class ObjectDetectionService {
|
||||
|
||||
private final Net net;
|
||||
private final List<String> classNames;
|
||||
private final boolean enabled;
|
||||
|
||||
ObjectDetectionService(String cfgPath, String weightsPath, String classNamesPath) throws IOException {
|
||||
// Check if model files exist
|
||||
var cfgFile = Paths.get(cfgPath);
|
||||
var weightsFile = Paths.get(weightsPath);
|
||||
var classNamesFile = Paths.get(classNamesPath);
|
||||
|
||||
if (!Files.exists(cfgFile) || !Files.exists(weightsFile) || !Files.exists(classNamesFile)) {
|
||||
IO.println("⚠️ Object detection disabled: YOLO model files not found");
|
||||
IO.println(" Expected files:");
|
||||
IO.println(" - " + cfgPath);
|
||||
IO.println(" - " + weightsPath);
|
||||
IO.println(" - " + classNamesPath);
|
||||
IO.println(" Scraper will continue without image analysis.");
|
||||
this.enabled = false;
|
||||
this.net = null;
|
||||
this.classNames = new ArrayList<>();
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
// Load network
|
||||
this.net = Dnn.readNetFromDarknet(cfgPath, weightsPath);
|
||||
this.net.setPreferableBackend(DNN_BACKEND_OPENCV);
|
||||
this.net.setPreferableTarget(DNN_TARGET_CPU);
|
||||
// Load class names (one per line)
|
||||
this.classNames = Files.readAllLines(classNamesFile);
|
||||
this.enabled = true;
|
||||
IO.println("✓ Object detection enabled with YOLO");
|
||||
} catch (Exception e) {
|
||||
System.err.println("⚠️ Object detection disabled: " + e.getMessage());
|
||||
throw new IOException("Failed to initialize object detection", e);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Detects objects in the given image file and returns a list of
|
||||
* human‑readable labels. Only detections above a confidence
|
||||
* threshold are returned. For brevity this method omits drawing
|
||||
* bounding boxes. See the OpenCV DNN documentation for details on
|
||||
 * post‑processing.
|
||||
*
|
||||
* @param imagePath absolute path to the image
|
||||
* @return list of detected class names (empty if detection disabled)
|
||||
*/
|
||||
List<String> detectObjects(String imagePath) {
|
||||
if (!enabled) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
List<String> labels = new ArrayList<>();
|
||||
var image = Imgcodecs.imread(imagePath);
|
||||
if (image.empty()) return labels;
|
||||
// Create a 4D blob from the image
|
||||
var blob = Dnn.blobFromImage(image, 1.0 / 255.0, new Size(416, 416), new Scalar(0, 0, 0), true, false);
|
||||
net.setInput(blob);
|
||||
List<Mat> outs = new ArrayList<>();
|
||||
var outNames = getOutputLayerNames(net);
|
||||
net.forward(outs, outNames);
|
||||
// Post‑process: for each detection compute score and choose class
|
||||
var confThreshold = 0.5f;
|
||||
for (var out : outs) {
|
||||
for (var i = 0; i < out.rows(); i++) {
|
||||
var data = out.get(i, 0);
|
||||
if (data == null) continue;
|
||||
// The first 5 numbers are bounding box, then class scores
|
||||
var scores = new double[classNames.size()];
|
||||
System.arraycopy(data, 5, scores, 0, scores.length);
|
||||
var classId = argMax(scores);
|
||||
var confidence = scores[classId];
|
||||
if (confidence > confThreshold) {
|
||||
var label = classNames.get(classId);
|
||||
if (!labels.contains(label)) {
|
||||
labels.add(label);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return labels;
|
||||
}
|
||||
/**
|
||||
* Returns the indexes of the output layers in the network. YOLO
|
||||
* automatically discovers its output layers; other models may require
|
||||
 * manually specifying them.
|
||||
*/
|
||||
private List<String> getOutputLayerNames(Net net) {
|
||||
List<String> names = new ArrayList<>();
|
||||
var outLayers = net.getUnconnectedOutLayers().toList();
|
||||
var layersNames = net.getLayerNames();
|
||||
for (var i : outLayers) {
|
||||
names.add(layersNames.get(i - 1));
|
||||
}
|
||||
return names;
|
||||
}
|
||||
/**
|
||||
* Returns the index of the maximum value in the array.
|
||||
*/
|
||||
private int argMax(double[] array) {
|
||||
var best = 0;
|
||||
var max = array[0];
|
||||
for (var i = 1; i < array.length; i++) {
|
||||
if (array[i] > max) {
|
||||
max = array[i];
|
||||
best = i;
|
||||
}
|
||||
}
|
||||
return best;
|
||||
}
|
||||
}
|
||||
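A minimal sketch (not part of the commit) of how ObjectDetectionService might be wired up, using the model paths Main points at; the image path and example class name are hypothetical.

package com.auction;

// Hypothetical usage of ObjectDetectionService; the image path is a placeholder.
public class DetectionExample {
    public static void main(String[] args) throws Exception {
        // The OpenCV native library must be loaded once before any Mat/Dnn call.
        System.loadLibrary(org.opencv.core.Core.NATIVE_LIBRARY_NAME);

        var detector = new ObjectDetectionService(
                "models/yolov4.cfg", "models/yolov4.weights", "models/coco.names");

        // Returns an empty list when the model files are missing (disabled mode).
        var labels = detector.detectObjects("images/lot-12345-1.jpg");
        System.out.println("Detected: " + labels);
    }
}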
src/main/java/com/auction/TroostwijkAuctionExtractor.java (file deleted; 645 lines removed)
@@ -1,645 +0,0 @@
|
||||
package com.auction;
|
||||
|
||||
import com.microsoft.playwright.*;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.microsoft.playwright.options.WaitUntilState;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.*;
|
||||
import java.sql.*;
|
||||
import java.time.Instant;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* TroostwijkAuctionExtractor
|
||||
*
|
||||
* Extracts auction listings from https://www.troostwijkauctions.com/auctions
|
||||
* using Playwright for Java (headless browser automation).
|
||||
*
|
||||
* Features:
|
||||
* - Uses Playwright for Java to load JavaScript-rendered content
|
||||
* - Iterates through all pages of auction listings
|
||||
* - Rate limiting: 200ms between each page request
|
||||
* - Caches visited pages in SQLite database with expiration times
|
||||
* - Extracts auction metadata: ID, title, location, URL
|
||||
*
|
||||
* Dependencies (Maven):
|
||||
* <dependency>
|
||||
* <groupId>com.microsoft.playwright</groupId>
|
||||
* <artifactId>playwright</artifactId>
|
||||
* <version>1.40.0</version>
|
||||
* </dependency>
|
||||
* <dependency>
|
||||
* <groupId>com.fasterxml.jackson.core</groupId>
|
||||
* <artifactId>jackson-databind</artifactId>
|
||||
* <version>2.17.0</version>
|
||||
* </dependency>
|
||||
* <dependency>
|
||||
* <groupId>org.xerial</groupId>
|
||||
* <artifactId>sqlite-jdbc</artifactId>
|
||||
* <version>3.45.1.0</version>
|
||||
* </dependency>
|
||||
*
|
||||
* After adding dependency, run: mvn exec:java -e -D exec.mainClass=com.microsoft.playwright.CLI -D exec.args="install"
|
||||
* This downloads the browser binaries needed by Playwright.
|
||||
*/
|
||||
public class TroostwijkAuctionExtractor {
|
||||
|
||||
private static final String AUCTIONS_BASE_URL = "https://www.troostwijkauctions.com/auctions";
|
||||
private static final int RATE_LIMIT_MS = 200;
|
||||
private static final String CACHE_DB_PATH = "cache/page_cache.db";
|
||||
private static final long CACHE_EXPIRATION_HOURS = 24; // Cache expires after 24 hours
|
||||
|
||||
private final ObjectMapper objectMapper;
|
||||
private final boolean useCache;
|
||||
private final CacheDatabase cacheDb;
|
||||
private final int maxPageVisits; // Maximum number of pages to fetch (0 = unlimited)
|
||||
private int pageVisitCount; // Counter for actual page fetches (not from cache)
|
||||
private Playwright playwright;
|
||||
private Browser browser;
|
||||
|
||||
/**
|
||||
* Represents an auction listing
|
||||
*/
|
||||
public static class Auction {
|
||||
public int id;
|
||||
public String title;
|
||||
public String location;
|
||||
public String url;
|
||||
public String type; // e.g. "A1" or "A7"
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("Auction{id=%d, type=%s, title='%s', location='%s', url='%s'}",
|
||||
id, type, title, location, url);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*
|
||||
* @param useCache Enable database caching of visited pages
|
||||
* @param maxPageVisits Maximum number of actual page fetches (0 = unlimited)
|
||||
*/
|
||||
public TroostwijkAuctionExtractor(boolean useCache, int maxPageVisits) throws SQLException, IOException {
|
||||
this.objectMapper = new ObjectMapper();
|
||||
this.useCache = useCache;
|
||||
this.maxPageVisits = maxPageVisits;
|
||||
this.pageVisitCount = 0;
|
||||
this.cacheDb = useCache ? new CacheDatabase(CACHE_DB_PATH) : null;
|
||||
|
||||
if (useCache) {
|
||||
cacheDb.initialize();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor with default unlimited page visits
|
||||
*
|
||||
* @param useCache Enable database caching of visited pages
|
||||
*/
|
||||
public TroostwijkAuctionExtractor(boolean useCache) throws SQLException, IOException {
|
||||
this(useCache, 0); // 0 = unlimited
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes Playwright and browser instance
|
||||
* Call this before extracting auctions
|
||||
*/
|
||||
public void initialize() {
|
||||
System.out.println("Initializing Playwright browser...");
|
||||
this.playwright = Playwright.create();
|
||||
this.browser = playwright.chromium().launch(new BrowserType.LaunchOptions()
|
||||
.setHeadless(true)
|
||||
.setArgs(Arrays.asList("--no-sandbox", "--disable-setuid-sandbox")));
|
||||
System.out.println("✓ Browser ready");
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes browser and Playwright instance
|
||||
* Call this when done extracting
|
||||
*/
|
||||
public void close() {
|
||||
if (browser != null) {
|
||||
browser.close();
|
||||
}
|
||||
if (playwright != null) {
|
||||
playwright.close();
|
||||
}
|
||||
if (cacheDb != null) {
|
||||
cacheDb.close();
|
||||
}
|
||||
System.out.println("✓ Browser and cache closed");
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts all auctions from all pages
|
||||
*
|
||||
* @return List of all discovered auctions
|
||||
*/
|
||||
public List<Auction> extractAllAuctions() throws InterruptedException {
|
||||
if (browser == null) {
|
||||
throw new IllegalStateException("Browser not initialized. Call initialize() first.");
|
||||
}
|
||||
|
||||
List<Auction> allAuctions = new ArrayList<>();
|
||||
int pageNumber = 1;
|
||||
boolean hasMorePages = true;
|
||||
|
||||
System.out.println("Starting auction extraction from " + AUCTIONS_BASE_URL);
|
||||
|
||||
while (hasMorePages) {
|
||||
System.out.println("\n[Page " + pageNumber + "] Fetching auctions...");
|
||||
|
||||
// Check cache first
|
||||
String cachedHtml = loadFromCache(pageNumber);
|
||||
String html;
|
||||
|
||||
if (cachedHtml != null) {
|
||||
System.out.println(" ✓ Loaded from cache");
|
||||
html = cachedHtml;
|
||||
} else {
|
||||
// Check if we've reached the maximum page visit limit
|
||||
if (maxPageVisits > 0 && pageVisitCount >= maxPageVisits) {
|
||||
System.out.println(" ⚠️ Reached maximum page visit limit (" + maxPageVisits + "), stopping");
|
||||
break;
|
||||
}
|
||||
|
||||
// Fetch with Playwright
|
||||
html = fetchPageWithPlaywright(pageNumber);
|
||||
pageVisitCount++; // Increment actual page fetch counter
|
||||
|
||||
if (html == null || html.isEmpty()) {
|
||||
System.out.println(" ⚠️ Failed to fetch page, stopping pagination");
|
||||
break;
|
||||
}
|
||||
|
||||
System.out.println(" ✓ Fetched from website (visit " + pageVisitCount +
|
||||
(maxPageVisits > 0 ? "/" + maxPageVisits : "") + ")");
|
||||
|
||||
// Save to cache
|
||||
if (useCache) {
|
||||
saveToCache(pageNumber, html);
|
||||
}
|
||||
|
||||
// Rate limiting - wait 200ms before next request
|
||||
Thread.sleep(RATE_LIMIT_MS);
|
||||
}
|
||||
|
||||
// Parse auctions from HTML
|
||||
List<Auction> pageAuctions = parseAuctionsFromHtml(html);
|
||||
|
||||
if (pageAuctions.isEmpty()) {
|
||||
System.out.println(" ⚠️ No auctions found on page, stopping pagination");
|
||||
hasMorePages = false;
|
||||
} else {
|
||||
System.out.println(" ✓ Found " + pageAuctions.size() + " auctions");
|
||||
allAuctions.addAll(pageAuctions);
|
||||
pageNumber++;
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("\n✓ Total auctions extracted: " + allAuctions.size());
|
||||
return allAuctions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches a single page using Playwright
|
||||
*
|
||||
* @param pageNumber Page number (1-indexed)
|
||||
* @return HTML content of the page
|
||||
*/
|
||||
private String fetchPageWithPlaywright(int pageNumber) {
|
||||
String url = pageNumber == 1
|
||||
? AUCTIONS_BASE_URL
|
||||
: AUCTIONS_BASE_URL + "?page=" + pageNumber;
|
||||
|
||||
try {
|
||||
Page page = browser.newPage();
|
||||
|
||||
// Set user agent
|
||||
page.setExtraHTTPHeaders(Map.of(
|
||||
"User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||
));
|
||||
|
||||
// Navigate to page
|
||||
page.navigate(url, new Page.NavigateOptions()
|
||||
.setTimeout(30000)
|
||||
.setWaitUntil(WaitUntilState.NETWORKIDLE));
|
||||
|
||||
// Wait for auction listings to appear
|
||||
try {
|
||||
page.waitForSelector("a[href^='/a/']", new Page.WaitForSelectorOptions()
|
||||
.setTimeout(10000));
|
||||
} catch (Exception e) {
|
||||
// Continue even if selector not found
|
||||
System.out.println(" ⚠️ Auction selector not found, attempting to parse anyway");
|
||||
}
|
||||
|
||||
// Get HTML content
|
||||
String html = page.content();
|
||||
page.close();
|
||||
|
||||
return html;
|
||||
|
||||
} catch (Exception e) {
|
||||
System.err.println(" ⚠️ Playwright error: " + e.getMessage());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses auction data from HTML content
|
||||
*
|
||||
* @param html HTML content
|
||||
* @return List of parsed auctions
|
||||
*/
|
||||
private List<Auction> parseAuctionsFromHtml(String html) {
|
||||
List<Auction> auctions = new ArrayList<>();
|
||||
|
||||
// Simple regex-based parsing for auction links
|
||||
// Format: <a href="/a/title-A1-12345" or "/a/title-A7-12345"
|
||||
java.util.regex.Pattern linkPattern = java.util.regex.Pattern.compile(
|
||||
"href=\"(/a/[^\"]+A[17]-(\\d+)[^\"]*)\"");
|
||||
java.util.regex.Matcher linkMatcher = linkPattern.matcher(html);
|
||||
|
||||
Set<Integer> seenIds = new HashSet<>();
|
||||
|
||||
while (linkMatcher.find()) {
|
||||
String href = linkMatcher.group(1);
|
||||
int auctionId = Integer.parseInt(linkMatcher.group(2));
|
||||
|
||||
// Avoid duplicates
|
||||
if (seenIds.contains(auctionId)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Extract auction type (A1 or A7)
|
||||
String type = href.contains("A1-") ? "A1" : "A7";
|
||||
|
||||
// Try to find location and title near this link
|
||||
String location = extractLocationNearLink(html, href);
|
||||
String title = extractTitleFromHref(href);
|
||||
|
||||
Auction auction = new Auction();
|
||||
auction.id = auctionId;
|
||||
auction.type = type;
|
||||
auction.title = title;
|
||||
auction.location = location;
|
||||
auction.url = "https://www.troostwijkauctions.com" + href;
|
||||
|
||||
auctions.add(auction);
|
||||
seenIds.add(auctionId);
|
||||
}
|
||||
|
||||
return auctions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts location text near an auction link
|
||||
* Looks for ", NL" or other country codes
|
||||
*/
|
||||
private String extractLocationNearLink(String html, String href) {
|
||||
int hrefPos = html.indexOf(href);
|
||||
if (hrefPos == -1) return "Unknown";
|
||||
|
||||
// Look at 1000 characters before AND after the href for location info
|
||||
int startPos = Math.max(hrefPos - 500, 0);
|
||||
int endPos = Math.min(hrefPos + 1000, html.length());
|
||||
String context = html.substring(startPos, endPos);
|
||||
|
||||
// Pattern 1: Classic format "City, NL"
|
||||
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
|
||||
"([A-Za-z][A-Za-z\\s\\-']+),\\s*([A-Z]{2})(?![A-Za-z])");
|
||||
java.util.regex.Matcher locMatcher = locPattern.matcher(context);
|
||||
|
||||
if (locMatcher.find()) {
|
||||
String location = locMatcher.group(1).trim() + ", " + locMatcher.group(2);
|
||||
System.out.println(" Found location: " + location + " for auction " + href);
|
||||
return location;
|
||||
}
|
||||
|
||||
// Pattern 2: HTML format like "<span>City,<!-- --> </span>NL"
|
||||
// Extract city and country code separately
|
||||
java.util.regex.Pattern htmlPattern = java.util.regex.Pattern.compile(
|
||||
"<span[^>]*>([A-Za-z][A-Za-z\\s\\-',]+?)(?:,)?\\s*(?:<!--.*?-->)?\\s*</span>\\s*([A-Z]{2})(?![A-Za-z])");
|
||||
java.util.regex.Matcher htmlMatcher = htmlPattern.matcher(context);
|
||||
|
||||
if (htmlMatcher.find()) {
|
||||
String city = htmlMatcher.group(1).trim().replaceAll(",$", ""); // Remove trailing comma
|
||||
String country = htmlMatcher.group(2);
|
||||
String location = city + ", " + country;
|
||||
System.out.println(" Found location (HTML): " + location + " for auction " + href);
|
||||
return location;
|
||||
}
|
||||
|
||||
// Pattern 3: Fallback - just find country code after HTML tags
|
||||
java.util.regex.Pattern countryPattern = java.util.regex.Pattern.compile(
|
||||
"(?:-->|</span>|</div>)\\s*([A-Z]{2})(?![A-Za-z])");
|
||||
java.util.regex.Matcher countryMatcher = countryPattern.matcher(context);
|
||||
|
||||
if (countryMatcher.find()) {
|
||||
String country = countryMatcher.group(1);
|
||||
System.out.println(" Found country code: " + country + " for auction " + href);
|
||||
return "Unknown, " + country;
|
||||
}
|
||||
|
||||
System.out.println(" ⚠️ No location found for auction " + href);
|
||||
return "Unknown";
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts human-readable title from URL slug
|
||||
* Converts "some-auction-title-A1-12345" to "Some Auction Title"
|
||||
*/
|
||||
private String extractTitleFromHref(String href) {
|
||||
// Extract everything between "/a/" and "-A1-" or "-A7-"
|
||||
java.util.regex.Pattern titlePattern = java.util.regex.Pattern.compile(
|
||||
"/a/(.+?)-A[17]-");
|
||||
java.util.regex.Matcher titleMatcher = titlePattern.matcher(href);
|
||||
|
||||
if (titleMatcher.find()) {
|
||||
String slug = titleMatcher.group(1);
|
||||
// Convert kebab-case to Title Case
|
||||
String[] words = slug.split("-");
|
||||
StringBuilder title = new StringBuilder();
|
||||
for (String word : words) {
|
||||
if (!word.isEmpty()) {
|
||||
title.append(Character.toUpperCase(word.charAt(0)))
|
||||
.append(word.substring(1))
|
||||
.append(" ");
|
||||
}
|
||||
}
|
||||
return title.toString().trim();
|
||||
}
|
||||
|
||||
return "Untitled Auction";
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads cached HTML for a page from SQLite database
|
||||
* Returns null if not cached or cache has expired
|
||||
*
|
||||
* @param pageNumber Page number
|
||||
* @return Cached HTML or null if not found/expired
|
||||
*/
|
||||
private String loadFromCache(int pageNumber) {
|
||||
if (!useCache || cacheDb == null) return null;
|
||||
|
||||
String url = pageNumber == 1
|
||||
? AUCTIONS_BASE_URL
|
||||
: AUCTIONS_BASE_URL + "?page=" + pageNumber;
|
||||
|
||||
return cacheDb.get(url);
|
||||
}
|
||||
|
||||
/**
|
||||
* Saves HTML to SQLite cache database with expiration time
|
||||
*
|
||||
* @param pageNumber Page number
|
||||
* @param html HTML content
|
||||
*/
|
||||
private void saveToCache(int pageNumber, String html) {
|
||||
if (!useCache || cacheDb == null) return;
|
||||
|
||||
String url = pageNumber == 1
|
||||
? AUCTIONS_BASE_URL
|
||||
: AUCTIONS_BASE_URL + "?page=" + pageNumber;
|
||||
|
||||
cacheDb.put(url, html, CACHE_EXPIRATION_HOURS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Filters auctions by location
|
||||
*
|
||||
* @param auctions List of auctions
|
||||
* @param locationFilter Location string to match (e.g., "NL")
|
||||
* @return Filtered list
|
||||
*/
|
||||
public static List<Auction> filterByLocation(List<Auction> auctions, String locationFilter) {
|
||||
return auctions.stream()
|
||||
.filter(a -> a.location.contains(locationFilter))
|
||||
.toList();
|
||||
}
|
||||
|
||||
/**
|
||||
* Entry point for testing
|
||||
*
|
||||
* Arguments:
|
||||
* --max-visits <number> : Maximum number of page visits (0 = unlimited, default)
|
||||
* --no-cache : Disable caching
|
||||
*/
|
||||
public static void main(String[] args) throws Exception {
|
||||
System.out.println("=== Troostwijk Auction Extractor ===\n");
|
||||
|
||||
// Parse command line arguments
|
||||
boolean useCache = true;
|
||||
int maxVisits = 0; // 0 = unlimited
|
||||
|
||||
for (int i = 0; i < args.length; i++) {
|
||||
switch (args[i]) {
|
||||
case "--max-visits":
|
||||
if (i + 1 < args.length) {
|
||||
maxVisits = Integer.parseInt(args[++i]);
|
||||
System.out.println("Max page visits set to: " + maxVisits);
|
||||
}
|
||||
break;
|
||||
case "--no-cache":
|
||||
useCache = false;
|
||||
System.out.println("Caching disabled");
|
||||
break;
|
||||
case "--help":
|
||||
System.out.println("Usage: java TroostwijkAuctionExtractor [options]");
|
||||
System.out.println("Options:");
|
||||
System.out.println(" --max-visits <n> : Limit actual page fetches to n (0 = unlimited)");
|
||||
System.out.println(" --no-cache : Disable page caching");
|
||||
System.out.println(" --help : Show this help message");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
TroostwijkAuctionExtractor extractor = new TroostwijkAuctionExtractor(useCache, maxVisits);
|
||||
|
||||
try {
|
||||
// Initialize browser
|
||||
extractor.initialize();
|
||||
|
||||
// Extract all auctions
|
||||
List<Auction> allAuctions = extractor.extractAllAuctions();
|
||||
|
||||
// Filter for Dutch auctions only
|
||||
List<Auction> dutchAuctions = filterByLocation(allAuctions, "NL");
|
||||
|
||||
System.out.println("\n=== Results ===");
|
||||
System.out.println("Total auctions found: " + allAuctions.size());
|
||||
System.out.println("Dutch auctions (NL): " + dutchAuctions.size());
|
||||
System.out.println("Actual page visits: " + extractor.pageVisitCount);
|
||||
|
||||
// Display first 10 Dutch auctions
|
||||
System.out.println("\n=== Sample Dutch Auctions ===");
|
||||
dutchAuctions.stream()
|
||||
.limit(10)
|
||||
.forEach(System.out::println);
|
||||
|
||||
} finally {
|
||||
// Always close browser
|
||||
extractor.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* SQLite-based caching system for HTML pages with expiration support
|
||||
*/
|
||||
static class CacheDatabase {
|
||||
private final String dbPath;
|
||||
private Connection connection;
|
||||
|
||||
public CacheDatabase(String dbPath) {
|
||||
        this.dbPath = dbPath;
    }

    /**
     * Initialize database and create schema
     */
    public void initialize() throws SQLException, IOException {
        // Create cache directory if it doesn't exist
        Path cacheDir = Paths.get(dbPath).getParent();
        if (cacheDir != null) {
            Files.createDirectories(cacheDir);
        }

        connection = DriverManager.getConnection("jdbc:sqlite:" + dbPath);

        // Create cache table with URL as primary key
        String createTable = """
                CREATE TABLE IF NOT EXISTS page_cache (
                    url TEXT PRIMARY KEY,
                    html TEXT NOT NULL,
                    cached_at INTEGER NOT NULL,
                    expires_at INTEGER NOT NULL
                )
                """;

        try (Statement stmt = connection.createStatement()) {
            stmt.execute(createTable);
            // Create index on expires_at for efficient cleanup
            stmt.execute("CREATE INDEX IF NOT EXISTS idx_expires_at ON page_cache(expires_at)");
        }

        // Clean up expired entries on initialization
        cleanupExpired();

        System.out.println("✓ Cache database initialized");
    }

    /**
     * Get cached HTML for a URL if it exists and hasn't expired
     *
     * @param url The URL to look up
     * @return Cached HTML or null if not found/expired
     */
    public synchronized String get(String url) {
        String sql = "SELECT html FROM page_cache WHERE url = ? AND expires_at > ?";

        try (PreparedStatement ps = connection.prepareStatement(sql)) {
            ps.setString(1, url);
            ps.setLong(2, Instant.now().getEpochSecond());

            ResultSet rs = ps.executeQuery();
            if (rs.next()) {
                return rs.getString("html");
            }
        } catch (SQLException e) {
            System.err.println("Cache read error: " + e.getMessage());
        }

        return null;
    }

    /**
     * Store HTML in cache with expiration time
     *
     * @param url             The URL to cache
     * @param html            The HTML content
     * @param expirationHours Hours until cache expires
     */
    public synchronized void put(String url, String html, long expirationHours) {
        String sql = """
                INSERT OR REPLACE INTO page_cache (url, html, cached_at, expires_at)
                VALUES (?, ?, ?, ?)
                """;

        long now = Instant.now().getEpochSecond();
        long expiresAt = now + (expirationHours * 3600);

        try (PreparedStatement ps = connection.prepareStatement(sql)) {
            ps.setString(1, url);
            ps.setString(2, html);
            ps.setLong(3, now);
            ps.setLong(4, expiresAt);
            ps.executeUpdate();
        } catch (SQLException e) {
            System.err.println("Cache write error: " + e.getMessage());
        }
    }

    /**
     * Remove expired cache entries
     */
    public synchronized void cleanupExpired() {
        String sql = "DELETE FROM page_cache WHERE expires_at <= ?";

        try (PreparedStatement ps = connection.prepareStatement(sql)) {
            ps.setLong(1, Instant.now().getEpochSecond());
            int deleted = ps.executeUpdate();
            if (deleted > 0) {
                System.out.println("✓ Cleaned up " + deleted + " expired cache entries");
            }
        } catch (SQLException e) {
            System.err.println("Cache cleanup error: " + e.getMessage());
        }
    }

    /**
     * Print cache statistics
     */
    public synchronized void printStats() {
        String sql = "SELECT COUNT(*) as total, " +
                "SUM(CASE WHEN expires_at > ? THEN 1 ELSE 0 END) as valid, " +
                "SUM(LENGTH(html)) as total_size " +
                "FROM page_cache";

        try (PreparedStatement ps = connection.prepareStatement(sql)) {
            ps.setLong(1, Instant.now().getEpochSecond());
            ResultSet rs = ps.executeQuery();

            if (rs.next()) {
                int total = rs.getInt("total");
                int valid = rs.getInt("valid");
                long size = rs.getLong("total_size");

                System.out.println("\n=== Cache Statistics ===");
                System.out.println("Total entries: " + total);
                System.out.println("Valid entries: " + valid);
                System.out.println("Expired entries: " + (total - valid));
                System.out.println("Total size: " + (size / 1024) + " KB");
            }
        } catch (SQLException e) {
            System.err.println("Cache stats error: " + e.getMessage());
        }
    }

    /**
     * Close database connection
     */
    public void close() {
        if (connection != null) {
            try {
                connection.close();
            } catch (SQLException e) {
                System.err.println("Error closing cache database: " + e.getMessage());
            }
        }
    }
}
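For reference, a minimal usage sketch of the CacheDatabase class above. It assumes the caller sits in the same `com.auction` package and that the SQLite JDBC driver is on the classpath; the database path, example URL, placeholder HTML, and the 24-hour expiry are illustrative values, not project defaults.

```java
package com.auction;

// Illustrative use of CacheDatabase; values below are examples only.
public class CacheExample {
    public static void main(String[] args) throws Exception {
        CacheDatabase cache = new CacheDatabase("cache/pages.db"); // hypothetical path
        cache.initialize();

        String url = "https://www.troostwijkauctions.com/auctions?page=1";
        String html = cache.get(url);        // returns null on miss or expiry
        if (html == null) {
            html = "<html>...</html>";       // placeholder for a real fetch
            cache.put(url, html, 24);        // cache for 24 hours
        }

        cache.printStats();
        cache.close();
    }
}
```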
File diff suppressed because it is too large
@@ -41,8 +41,8 @@ public class AuctionParsingTest {
        System.out.println("\n=== Auction Parsing Test ===");
        System.out.println("Found " + auctionLinks.size() + " auction links");

        List<TroostwijkScraper.AuctionInfo> auctions = new ArrayList<>();
        int count = 0;
        List<AuctionInfo> auctions = new ArrayList<>();
        int count = 0;

        for (Element link : auctionLinks) {
            String href = link.attr("href");
@@ -59,7 +59,7 @@ public class AuctionParsingTest {
                int auctionId = Integer.parseInt(matcher.group(2));

                // Extract auction info using IMPROVED text-based method
                TroostwijkScraper.AuctionInfo auction = extractAuctionInfoFromText(link, href, auctionId, "A" + typeNum);
                AuctionInfo auction = extractAuctionInfoFromText(link, href, auctionId, "A" + typeNum);
                auctions.add(auction);

                // Print the first 10 auctions for verification
@@ -101,7 +101,7 @@ public class AuctionParsingTest {
        assertTrue(auctions.size() > 0, "Should find at least one auction");

        // Verify all auctions have basic info
        for (TroostwijkScraper.AuctionInfo auction : auctions) {
        for (AuctionInfo auction : auctions) {
            assertNotNull(auction.title, "Title should not be null for auction " + auction.auctionId);
            assertTrue(auction.title.length() > 0, "Title should not be empty for auction " + auction.auctionId);
            assertNotNull(auction.url, "URL should not be null for auction " + auction.auctionId);
@@ -119,8 +119,8 @@ public class AuctionParsingTest {
     * Expected format: "[day] om [time] [lot_count] [title] [city], [CC]"
     * Example: "woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE"
     */
    private TroostwijkScraper.AuctionInfo extractAuctionInfoFromText(Element link, String href, int auctionId, String type) {
        TroostwijkScraper.AuctionInfo auction = new TroostwijkScraper.AuctionInfo();
    private AuctionInfo extractAuctionInfoFromText(Element link, String href, int auctionId, String type) {
        AuctionInfo auction = new AuctionInfo();
        auction.auctionId = auctionId;
        auction.type = type;
        auction.url = "https://www.troostwijkauctions.com" + href;

@@ -68,71 +68,18 @@ public class TroostwijkScraperTest {
    }

    @Test
    public void testFetchAndPersistAuctionData() throws SQLException {
        // First, discover auctions
        List<Integer> auctions = scraper.discoverDutchAuctions();
        assertFalse(auctions.isEmpty(), "Need at least one auction to test");

        // Take the first auction and fetch its lots
        Integer firstSaleId = auctions.getFirst();
        System.out.println("Testing with sale ID: " + firstSaleId);

        scraper.fetchLotsForSale(firstSaleId);

        // Verify data was persisted to database
        List<TroostwijkScraper.Lot> lotsInDb = scraper.db.getAllLots();

        assertNotNull(lotsInDb, "Lots list should not be null");
        assertFalse(lotsInDb.isEmpty(), "Should have persisted at least one lot");

        // Verify lot properties
        for (TroostwijkScraper.Lot lot : lotsInDb) {
            assertEquals(firstSaleId.intValue(), lot.saleId, "Lot should belong to the correct sale");
            assertTrue(lot.lotId > 0, "Lot ID should be positive");
            assertNotNull(lot.title, "Lot title should not be null");
            assertFalse(lot.title.isEmpty(), "Lot title should not be empty");
            assertNotNull(lot.url, "Lot URL should not be null");
            assertTrue(lot.url.startsWith("https://"), "Lot URL should be valid");
            assertTrue(lot.currentBid >= 0, "Current bid should be non-negative");
        }

        System.out.println("✓ Successfully persisted " + lotsInDb.size() + " lots to database");
        System.out.println("✓ All lot properties are valid");
    }

    @Test
    public void testDatabaseSchema() throws SQLException {
        // Verify that the database schema was created correctly
        List<TroostwijkScraper.Lot> lots = scraper.db.getAllLots();
        List<Lot> lots = scraper.db.getAllLots();
        assertNotNull(lots, "Should be able to query lots table");

        int imageCount = scraper.db.getImageCount();
        assertTrue(imageCount >= 0, "Image count should be non-negative");

        List<TroostwijkScraper.Lot> activeLots = scraper.db.getActiveLots();
        List<Lot> activeLots = scraper.db.getActiveLots();
        assertNotNull(activeLots, "Should be able to query active lots");

        System.out.println("✓ Database schema is valid and queryable");
    }

    @Test
    public void testAuctionProperties() {
        List<Integer> auctions = scraper.discoverDutchAuctions();
        assertFalse(auctions.isEmpty(), "Should find auctions");

        // Test that we can fetch data for multiple auctions
        int auctionsToTest = Math.min(2, auctions.size());

        for (int i = 0; i < auctionsToTest; i++) {
            Integer saleId = auctions.get(i);
            System.out.println("Testing auction " + (i + 1) + ": " + saleId);

            // This should not throw an exception
            assertDoesNotThrow(() -> scraper.fetchLotsForSale(saleId),
                    "Should be able to fetch lots for sale " + saleId);
        }

        System.out.println("✓ Successfully tested " + auctionsToTest + " auctions");
    }
}

@@ -1,456 +1,61 @@
## Woensdag 3 dec 25

* [woensdag om 16:00 · 145 · Industrie & machines · Meerdere locaties (45)](/a/industrie-machines-A3-37358)
* [woensdag om 16:00 · 38 · D | Raceautotransporters, kraan-polypengrepen en containers uit voorraadaanpassing · Nieheim, DE](/a/d-%7C-raceautotransporters-kraan-polypengrepen-en-containers-uit-voorraadaanpassing-A1-39772)
* [woensdag om 16:00 · 61 · Voedselverwerkende apparatuur en verpakkingsmachines · CHOMERAC, FR](/a/voedselverwerkende-apparatuur-en-verpakkingsmachines-A1-39319)
* [woensdag om 16:00 · 117 · Landbouw- & grondverzetmachines · Meerdere locaties (49)](/a/landbouw-grondverzetmachines-A3-37375)
* [woensdag om 17:00 · 261 · Gereedschappen & uitrusting · Meerdere locaties (36), BE](/a/gereedschappen-uitrusting-A3-37367)
* [woensdag om 18:00 · 1 · Vrachtwagens voor bedrijfsvoertuigen · Loßburg, DE](/a/vrachtwagens-voor-bedrijfsvoertuigen-A7-39531)
* [woensdag om 19:00 · 61 · Witgoed en accessoires · Etten-Leur, NL](/a/witgoed-en-accessoires-A1-27241)
* [Opent 28 nov 17:00 · 54 · Collectie Rolex en Cartier horloges · Dordrecht, NL](/a/collectie-rolex-en-cartier-horloges-A1-39398)
* [woensdag om 19:00 · 254 · SHOWROOMKEUKENS en INBOUWAPPARATUUR · Tilburg, NL](/a/showroomkeukens-en-inbouwapparatuur-A1-39480)
* [woensdag om 19:00 · 499 · Machines, retourgoederen en restpartijen · Harlingen, NL](/a/machines-retourgoederen-en-restpartijen-A1-39642)
* [woensdag om 19:00 · 120 · Partijen gereedschap, kantoorinventaris, detailhandelgoederen, decoratie en olijfbomen · Meerdere locaties (3), NL](/a/partijen-gereedschap-kantoorinventaris-detailhandelgoederen-decoratie-en-olijfbomen-A1-27016)
* [woensdag om 19:00 · 16 · Faillissementsvoertuigen · Meerdere locaties (3), NL](/a/faillissementsvoertuigen-A1-38368)
* [woensdag om 19:00 · 78 · Personenauto’s, oldtimers, campers en brommobielen · Buitenpost, NL](/a/personenauto%E2%80%99s-oldtimers-campers-en-brommobielen-A1-39508)
* [woensdag om 19:00 · 391 · Bezorgveiling Faillissement Dvize B.V. – Hyundai Power Products gereedschappen · Meerdere locaties (2)](/a/bezorgveiling-faillissement-dvize-b-v-%E2%80%93-hyundai-power-products-gereedschappen-A1-39409)
* [woensdag om 19:00 · 208 · Kunstplanten en bomen, composiet gevel- en vloerbekleding en akoestische materialen · De Lier, NL](/a/kunstplanten-en-bomen-composiet-gevel-en-vloerbekleding-en-akoestische-materialen-A1-28707)
* [woensdag om 19:00 · 181 · Metaalbewerkingsmachines, gereedschap en voorraad in verband met bedrijfsverhuizing · Cuijk, NL](/a/metaalbewerkingsmachines-gereedschap-en-voorraad-in-verband-met-bedrijfsverhuizing-A1-39360)
* [woensdag om 19:00 · 238 · Overstock en magazijnopruiming · Heesch, NL](/a/overstock-en-magazijnopruiming-A1-39538)
* [woensdag om 19:00 · 47 · Verzamelveiling Scooters en Motoren · Meerdere locaties (2), NL](/a/verzamelveiling-scooters-en-motoren-A1-28428)
* [woensdag om 19:00 · 338 · Auto's & transport · Meerdere locaties (109)](/a/auto%27s-transport-A3-37349)
* [woensdag om 19:30 · 74 · Gouden juwelen en diamanten · Meerdere locaties (28)](/a/gouden-juwelen-en-diamanten-A1-29562)

Configure your devices to use the Pi-hole as their DNS server using:

    IPv4: 192.168.1.159
    IPv6: fdc5:59a6:9ac1:f11f:2c86:25d3:6282:37ef

If you have not done so already, the above IP should be set to static.
View the web interface at http://pi.hole:80/admin or http://192.168.1.159:80/admin

Your Admin Webpage login password is gYj7Enh-

To allow your user to use all CLI functions without authentication,
refer to https://docs.pi-hole.net/main/post-install/

127.0.0.1
192.168.1.159
::1
fdc5:59a6:9ac1:f11f:2c86:25d3:6282:37ef
fdc5:59a6:9ac1:f11f:bd8c:6e87:65f0:243c
fe80::a05b:bbc6:d47f:3002%enp9s0

2IXD-XJN9-C337-1K4Y-BBEO-HV1F-3BVI

https://ollama.lan:9443/#!/wizard - heel-goed-wachtwoord

[
  { "domain": "ollama.lan",     "answer": "192.168.1.159",   "enabled": true },
  { "domain": "hephaestus.lan", "answer": "192.168.1.159",   "enabled": true },
  { "domain": "hermes.lan",     "answer": "192.168.137.239", "enabled": true },
  { "domain": "atlas.lan",      "answer": "192.168.1.100",   "enabled": true },
  { "domain": "hub.lan",        "answer": "192.168.1.1",     "enabled": true },
  { "domain": "ha.lan",         "answer": "192.168.1.193",   "enabled": true }
]

326
wiki/ARCHITECTURE-TROOSTWIJK-SCRAPER.md
Normal file
326
wiki/ARCHITECTURE-TROOSTWIJK-SCRAPER.md
Normal file
@@ -0,0 +1,326 @@
# Troostwijk Scraper - Architecture & Data Flow

## System Overview

The scraper follows a **3-phase hierarchical crawling pattern** to extract auction and lot data from the Troostwijk Auctions website.

## Architecture Diagram

```
┌───────────────────────────────────────┐
│           TROOSTWIJK SCRAPER          │
└───────────────────────────────────────┘

PHASE 1: COLLECT AUCTION URLs
  Listing Page /auctions?page=1..N ──▶ Extract /a/ auction URLs
                                   ──▶ [ List of Auction URLs ]
        │
        ▼
PHASE 2: EXTRACT LOT URLs FROM AUCTIONS
  Auction Page /a/... ──▶ Parse __NEXT_DATA__ JSON
                          ├──▶ Save Auction Metadata to DB
                          └──▶ Extract /l/ lot URLs ──▶ [ List of Lot URLs ]
        │
        ▼
PHASE 3: SCRAPE LOT DETAILS
  Lot Page /l/... ──▶ Parse __NEXT_DATA__ JSON
                      ├──▶ Save Lot Details to DB
                      └──▶ Save Image URLs to DB ──▶ [Optional Download]
```

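The three phases translate directly into a sequential driver loop. Below is a minimal sketch in Java; the helper methods `collectAuctionUrls`, `extractLotUrls`, and `scrapeLotDetails` are hypothetical stand-ins for the real scraper internals, not names from the codebase.

```java
import java.util.ArrayList;
import java.util.List;

// Sketch of the 3-phase crawl driver; the three helpers are hypothetical stubs.
public class CrawlDriverSketch {
    static final int MAX_PAGES = 50;   // see Key Configuration below

    public static void main(String[] args) {
        List<String> auctionUrls = new ArrayList<>();
        for (int page = 1; page <= MAX_PAGES; page++) {      // Phase 1: listing pages
            auctionUrls.addAll(collectAuctionUrls(page));
        }

        List<String> lotUrls = new ArrayList<>();
        for (String auctionUrl : auctionUrls) {              // Phase 2: auction pages
            lotUrls.addAll(extractLotUrls(auctionUrl));      // also saves auction metadata
        }

        for (String lotUrl : lotUrls) {                      // Phase 3: lot pages
            scrapeLotDetails(lotUrl);                        // saves lot details + image URLs
        }
    }

    // Stubs so the sketch compiles; the real implementations live in the scraper.
    static List<String> collectAuctionUrls(int page) { return List.of(); }
    static List<String> extractLotUrls(String auctionUrl) { return List.of(); }
    static void scrapeLotDetails(String lotUrl) { }
}
```
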
## Database Schema

```sql
-- CACHE TABLE (HTML Storage with Compression)
cache
├── url          (TEXT, PRIMARY KEY)
├── content      (BLOB)      -- Compressed HTML (zlib)
├── timestamp    (REAL)
├── status_code  (INTEGER)
└── compressed   (INTEGER)   -- 1=compressed, 0=plain

-- AUCTIONS TABLE
auctions
├── auction_id              (TEXT, PRIMARY KEY)   -- e.g. "A7-39813"
├── url                     (TEXT, UNIQUE)
├── title                   (TEXT)
├── location                (TEXT)                -- e.g. "Cluj-Napoca, RO"
├── lots_count              (INTEGER)
├── first_lot_closing_time  (TEXT)
└── scraped_at              (TEXT)

-- LOTS TABLE
lots
├── lot_id        (TEXT, PRIMARY KEY)   -- e.g. "A1-28505-5"
├── auction_id    (TEXT)                -- FK to auctions
├── url           (TEXT, UNIQUE)
├── title         (TEXT)
├── current_bid   (TEXT)                -- "€123.45" or "No bids"
├── bid_count     (INTEGER)
├── closing_time  (TEXT)
├── viewing_time  (TEXT)
├── pickup_date   (TEXT)
├── location      (TEXT)                -- e.g. "Dongen, NL"
├── description   (TEXT)
├── category      (TEXT)
└── scraped_at    (TEXT)
    FOREIGN KEY (auction_id) → auctions(auction_id)

-- IMAGES TABLE (Image URLs & Download Status)  ◀── THIS TABLE HOLDS IMAGE LINKS
images
├── id          (INTEGER, PRIMARY KEY AUTOINCREMENT)
├── lot_id      (TEXT)      -- FK to lots
├── url         (TEXT)      -- Image URL
├── local_path  (TEXT)      -- Path after download
└── downloaded  (INTEGER)   -- 0=pending, 1=downloaded
    FOREIGN KEY (lot_id) → lots(lot_id)
```

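As a concrete illustration, the `images` table above can be created over JDBC. The sketch below assumes the SQLite JDBC driver is on the classpath; the database path is illustrative, and the column list follows the diagram.

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

// Sketch: creating the images table as drawn in the schema above.
public class ImagesTableSetup {
    public static void main(String[] args) throws Exception {
        try (Connection conn = DriverManager.getConnection("jdbc:sqlite:output/cache.db"); // illustrative path
             Statement stmt = conn.createStatement()) {
            stmt.execute("""
                CREATE TABLE IF NOT EXISTS images (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    lot_id TEXT,                 -- FK to lots
                    url TEXT,                    -- image URL
                    local_path TEXT,             -- path after download
                    downloaded INTEGER,          -- 0=pending, 1=downloaded
                    FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
                )
                """);
        }
    }
}
```
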
## Sequence Diagram

```
User ──Run──▶ Scraper

Phase 1: Listing Pages
  Scraper ──goto()──▶ Playwright
  Playwright ──HTML──▶ Scraper
  Scraper ──compress & cache──▶ Cache DB

Phase 2: Auction Pages
  Scraper ──goto()──▶ Playwright
  Playwright ──HTML──▶ Scraper
  Scraper: Parse __NEXT_DATA__ JSON
  Scraper ──INSERT auctions──▶ Data Tables

Phase 3: Lot Pages
  Scraper ──goto()──▶ Playwright
  Playwright ──HTML──▶ Scraper
  Scraper: Parse __NEXT_DATA__ JSON
  Scraper ──INSERT lots──▶ Data Tables
  Scraper ──INSERT images──▶ Data Tables

Export to CSV/JSON
  Scraper ◀──Query all data── Data Tables
  Scraper ──Results──▶ User
```

## Data Flow Details

### 1. **Page Retrieval & Caching**
```
Request URL
 │
 ├──▶ Check cache DB (with timestamp validation)
 │     │
 │     ├─[HIT]──▶ Decompress (if compressed=1)
 │     │           └──▶ Return HTML
 │     │
 │     └─[MISS]─▶ Fetch via Playwright
 │                 │
 │                 ├──▶ Compress HTML (zlib level 9)
 │                 │      ~70-90% size reduction
 │                 │
 │                 └──▶ Store in cache DB (compressed=1)
 │
 └──▶ Return HTML for parsing
```

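A minimal Java sketch of the hit/miss decision above, wired to the `CacheDatabase` class added in this commit (which stores plain HTML rather than the zlib-compressed BLOBs described here). The real fetch goes through Playwright so client-side JavaScript runs; `java.net.http.HttpClient` is used below only to keep the sketch self-contained.

```java
package com.auction;

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

// Sketch of cache-or-fetch; the HttpClient call stands in for the Playwright fetch.
public class CachedFetcher {
    private final CacheDatabase cache;
    private final HttpClient http = HttpClient.newHttpClient();

    public CachedFetcher(CacheDatabase cache) {
        this.cache = cache;
    }

    public String getPage(String url) throws Exception {
        String html = cache.get(url);               // hit: HTML, miss/expired: null
        if (html != null) {
            return html;
        }
        HttpRequest req = HttpRequest.newBuilder(URI.create(url)).build();
        html = http.send(req, HttpResponse.BodyHandlers.ofString()).body();
        cache.put(url, html, 24);                   // 24-hour expiry, as in the flow above
        Thread.sleep(500);                          // 0.5 s rate limit between live requests
        return html;
    }
}
```
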
### 2. **JSON Parsing Strategy**
```
HTML Content
 │
 └──▶ Extract <script id="__NEXT_DATA__">
       │
       ├──▶ Parse JSON
       │     │
       │     ├─[has pageProps.lot]──▶ Individual LOT
       │     │    └──▶ Extract: title, bid, location, images, etc.
       │     │
       │     └─[has pageProps.auction]──▶ AUCTION
       │          │
       │          ├─[has lots[] array]──▶ Auction with lots
       │          │    └──▶ Extract: title, location, lots_count
       │          │
       │          └─[no lots[] array]──▶ Old format lot
       │               └──▶ Parse as lot
       │
       └──▶ Fallback to HTML regex parsing (if JSON fails)
```

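A sketch of the same branching in Java, using jsoup (already used by the project's tests) to pull the `__NEXT_DATA__` script and, as an assumption, Jackson for the JSON tree. The returned strings are illustrative labels only, not values used anywhere in the scraper.

```java
package com.auction;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;

// Sketch: locate the __NEXT_DATA__ blob and decide lot vs auction vs fallback.
public class NextDataClassifier {
    private static final ObjectMapper MAPPER = new ObjectMapper();

    static String classify(String html) throws Exception {
        Element script = Jsoup.parse(html).selectFirst("script#__NEXT_DATA__");
        if (script == null) {
            return "fallback-to-regex";                  // no JSON blob: HTML parsing path
        }
        JsonNode pageProps = MAPPER.readTree(script.data()).findPath("pageProps");
        if (pageProps.has("lot")) {
            return "lot";                                // individual lot page
        }
        if (pageProps.has("auction")) {
            JsonNode auction = pageProps.path("auction");
            return auction.has("lots") ? "auction" : "old-format-lot";
        }
        return "fallback-to-regex";
    }
}
```
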
### 3. **Image Handling**
```
Lot Page Parsed
 │
 ├──▶ Extract images[] from JSON
 │     │
 │     └──▶ INSERT INTO images (lot_id, url, downloaded=0)
 │
 └──▶ [If DOWNLOAD_IMAGES=True]
       │
       ├──▶ Download each image
       │     │
       │     ├──▶ Save to: /images/{lot_id}/001.jpg
       │     │
       │     └──▶ UPDATE images SET local_path=?, downloaded=1
       │
       └──▶ Rate limit between downloads (0.5s)
```

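A sketch of the optional download pass against the `images` table above. The file-naming scheme and the use of `java.net.http.HttpClient` are illustrative; the 0.5 s pause mirrors the rate limit noted here.

```java
package com.auction;

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;

// Sketch: download pending rows from images, save under imagesDir/{lot_id}/, mark downloaded=1.
public class ImageDownloader {
    public static void downloadPending(Connection conn, Path imagesDir) throws Exception {
        HttpClient http = HttpClient.newHttpClient();
        try (PreparedStatement select = conn.prepareStatement(
                 "SELECT id, lot_id, url FROM images WHERE downloaded = 0");
             PreparedStatement update = conn.prepareStatement(
                 "UPDATE images SET local_path = ?, downloaded = 1 WHERE id = ?");
             ResultSet rs = select.executeQuery()) {
            int n = 0;
            while (rs.next()) {
                Path dir = imagesDir.resolve(rs.getString("lot_id"));
                Files.createDirectories(dir);
                Path file = dir.resolve(String.format("%03d.jpg", ++n)); // naming is illustrative
                HttpRequest req = HttpRequest.newBuilder(URI.create(rs.getString("url"))).build();
                Files.write(file, http.send(req, HttpResponse.BodyHandlers.ofByteArray()).body());
                update.setString(1, file.toString());
                update.setInt(2, rs.getInt("id"));
                update.executeUpdate();
                Thread.sleep(500);                                       // rate limit between downloads
            }
        }
    }
}
```
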
## Key Configuration

| Setting | Value | Purpose |
|---------|-------|---------|
| `CACHE_DB` | `/mnt/okcomputer/output/cache.db` | SQLite database path |
| `IMAGES_DIR` | `/mnt/okcomputer/output/images` | Downloaded images storage |
| `RATE_LIMIT_SECONDS` | `0.5` | Delay between requests |
| `DOWNLOAD_IMAGES` | `False` | Toggle image downloading |
| `MAX_PAGES` | `50` | Number of listing pages to crawl |

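In a Java port these settings could live in a small constants class; the class and field names below are illustrative, and only the values come from the table above.

```java
// Illustrative constants mirroring the configuration table above.
final class ScraperConfig {
    static final String CACHE_DB = "/mnt/okcomputer/output/cache.db";
    static final String IMAGES_DIR = "/mnt/okcomputer/output/images";
    static final double RATE_LIMIT_SECONDS = 0.5;
    static final boolean DOWNLOAD_IMAGES = false;
    static final int MAX_PAGES = 50;
}
```
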
## Output Files

```
/mnt/okcomputer/output/
├── cache.db                    # SQLite database (compressed HTML + data)
├── auctions_{timestamp}.json   # Exported auctions
├── auctions_{timestamp}.csv    # Exported auctions
├── lots_{timestamp}.json       # Exported lots
├── lots_{timestamp}.csv        # Exported lots
└── images/                     # Downloaded images (if enabled)
    ├── A1-28505-5/
    │   ├── 001.jpg
    │   └── 002.jpg
    └── A1-28505-6/
        └── 001.jpg
```

## Extension Points for Integration

### 1. **Downstream Processing Pipeline**
```python
# Query lots without downloaded images
SELECT lot_id, url FROM images WHERE downloaded = 0

# Process images: OCR, classification, etc.
# Update status when complete
UPDATE images SET downloaded = 1, local_path = ? WHERE id = ?
```

### 2. **Real-time Monitoring**
```python
# Check for new lots every N minutes
SELECT COUNT(*) FROM lots WHERE scraped_at > datetime('now', '-1 hour')

# Monitor bid changes
SELECT lot_id, current_bid, bid_count FROM lots WHERE bid_count > 0
```

### 3. **Analytics & Reporting**
```python
# Top locations
SELECT location, COUNT(*) as lot_count FROM lots GROUP BY location

# Auction statistics
SELECT
    a.auction_id,
    a.title,
    COUNT(l.lot_id) as actual_lots,
    SUM(CASE WHEN l.bid_count > 0 THEN 1 ELSE 0 END) as lots_with_bids
FROM auctions a
LEFT JOIN lots l ON a.auction_id = l.auction_id
GROUP BY a.auction_id
```

### 4. **Image Processing Integration**
```python
# Get all images for a lot
SELECT url, local_path FROM images WHERE lot_id = 'A1-28505-5'

# Batch process unprocessed images
SELECT i.id, i.lot_id, i.local_path, l.title, l.category
FROM images i
JOIN lots l ON i.lot_id = l.lot_id
WHERE i.downloaded = 1 AND i.local_path IS NOT NULL
```

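For example, the monitoring query from extension point 2 can be run from Java over JDBC. Only the SQL and the database path come from this document; the wrapper class is a sketch and assumes the SQLite JDBC driver is available.

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

// Sketch: run the "new lots in the last hour" query from extension point 2.
public class LotMonitor {
    public static void main(String[] args) throws Exception {
        try (Connection conn = DriverManager.getConnection("jdbc:sqlite:/mnt/okcomputer/output/cache.db");
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery(
                 "SELECT COUNT(*) FROM lots WHERE scraped_at > datetime('now', '-1 hour')")) {
            if (rs.next()) {
                System.out.println("Lots scraped in the last hour: " + rs.getInt(1));
            }
        }
    }
}
```
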
## Performance Characteristics

- **Compression**: ~70-90% HTML size reduction (1GB → ~100-300MB)
- **Rate Limiting**: Exactly 0.5s between requests (respectful scraping)
- **Caching**: 24-hour default cache validity (configurable)
- **Throughput**: up to ~7,200 pages/hour (with the 0.5s rate limit)
- **Scalability**: SQLite handles millions of rows efficiently

## Error Handling

- **Network failures**: Cached as status_code=500, retried after cache expiry
- **Parse failures**: Falls back to HTML regex patterns
- **Compression errors**: Auto-detects and handles uncompressed legacy data
- **Missing fields**: Defaults to "No bids", empty string, or 0

## Rate Limiting & Ethics

- **REQUIRED**: 0.5 second delay between ALL requests
- **Respects cache**: Avoids unnecessary re-fetching
- **User-Agent**: Identifies as standard browser
- **No parallelization**: Single-threaded sequential crawling

@@ -183,7 +183,7 @@ The scraper works fine despite these warnings.

## Full Documentation

See [README.md](README.md) for complete documentation including:
See [README.md](../README.md) for complete documentation including:
- Email setup details
- YOLO installation guide
- Configuration options