diff --git a/pom.xml b/pom.xml
index 2911c0e..51e2be6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -14,8 +14,8 @@
UTF-8
- 11
- 11
+ 21
+ 21
2.17.0
4.9.0-0
@@ -55,6 +55,11 @@
opencv
${opencv.version}
+        <dependency>
+            <groupId>com.microsoft.playwright</groupId>
+            <artifactId>playwright</artifactId>
+            <version>1.40.0</version>
+        </dependency>
@@ -65,22 +70,22 @@
maven-compiler-plugin
3.11.0
- 11
- 11
+ 21
+ 21
-
+
org.apache.maven.plugins
diff --git a/src/main/java/com/auction/Main.java b/src/main/java/com/auction/Main.java
index fa2d283..6a0091f 100644
--- a/src/main/java/com/auction/Main.java
+++ b/src/main/java/com/auction/Main.java
@@ -2,6 +2,13 @@ package com.auction;
public class Main {
public static void main(String[] args) {
+ // If arguments are passed, this is likely a one-off command via dokku run
+ // Just exit immediately to allow the command to run
+ if (args.length > 0) {
+ System.out.println("Command mode - exiting to allow shell commands");
+ return;
+ }
+
System.out.println("Starting Troostwijk Auction Scraper...");
System.out.println("Container is running and healthy.");
diff --git a/src/main/java/com/auction/TroostwijkAuctionExtractor.java b/src/main/java/com/auction/TroostwijkAuctionExtractor.java
new file mode 100644
index 0000000..ab65248
--- /dev/null
+++ b/src/main/java/com/auction/TroostwijkAuctionExtractor.java
@@ -0,0 +1,563 @@
+package com.auction;
+
+import com.microsoft.playwright.*;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.microsoft.playwright.options.WaitUntilState;
+
+import java.io.IOException;
+import java.nio.file.*;
+import java.sql.*;
+import java.time.Instant;
+import java.util.*;
+
+/**
+ * TroostwijkAuctionExtractor
+ *
+ * Extracts auction listings from https://www.troostwijkauctions.com/auctions
+ * using Playwright for Java (headless browser automation).
+ *
+ * Features:
+ * - Uses Playwright for Java to load JavaScript-rendered content
+ * - Iterates through all pages of auction listings
+ * - Rate limiting: 200ms between each page request
+ * - Caches visited pages in SQLite database with expiration times
+ * - Extracts auction metadata: ID, title, location, URL
+ *
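+ * Dependencies (Maven):
+ * <pre>{@code
+ * <dependency>
+ *     <groupId>com.microsoft.playwright</groupId>
+ *     <artifactId>playwright</artifactId>
+ *     <version>1.40.0</version>
+ * </dependency>
+ * <dependency>
+ *     <groupId>com.fasterxml.jackson.core</groupId>
+ *     <artifactId>jackson-databind</artifactId>
+ *     <version>2.17.0</version>
+ * </dependency>
+ * <dependency>
+ *     <groupId>org.xerial</groupId>
+ *     <artifactId>sqlite-jdbc</artifactId>
+ *     <version>3.45.1.0</version>
+ * </dependency>
+ * }</pre>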
+ *
+ * After adding dependency, run: mvn exec:java -e -D exec.mainClass=com.microsoft.playwright.CLI -D exec.args="install"
+ * This downloads the browser binaries needed by Playwright.
+ */
+public class TroostwijkAuctionExtractor {
+
+ // Listing URL for page 1; later pages are requested as this URL plus "?page=N".
+ private static final String AUCTIONS_BASE_URL = "https://www.troostwijkauctions.com/auctions";
+ // Pause, in milliseconds, applied after each page that was fetched with the browser.
+ private static final int RATE_LIMIT_MS = 200;
+ // SQLite file handed to CacheDatabase; its parent directory is created on initialization.
+ private static final String CACHE_DB_PATH = "cache/page_cache.db";
+ private static final long CACHE_EXPIRATION_HOURS = 24; // Cache expires after 24 hours
+
+ // Created in the constructor. NOTE(review): not referenced by any method in this file - confirm it is still needed.
+ private final ObjectMapper objectMapper;
+ // Whether page HTML is read from / written to the SQLite cache.
+ private final boolean useCache;
+ // Cache backend; null when useCache is false.
+ private final CacheDatabase cacheDb;
+ // Both are assigned by initialize() and released by close(); null until initialize() runs.
+ private Playwright playwright;
+ private Browser browser;
+
+    /**
+     * Mutable data holder for a single auction listing.
+     *
+     * <p>All fields are public and are filled in by the extractor after parsing.
+     */
+    public static class Auction {
+        public int id;
+        public String title;
+        public String location;
+        public String url;
+        public String type; // e.g. "A1" or "A7"
+
+        /**
+         * Renders every field on one line for console output.
+         */
+        @Override
+        public String toString() {
+            return "Auction{id=%d, type=%s, title='%s', location='%s', url='%s'}"
+                    .formatted(id, type, title, location, url);
+        }
+    }
+
+    /**
+     * Creates an extractor and, when caching is requested, opens the cache database.
+     *
+     * @param useCache {@code true} to cache fetched pages in SQLite; {@code false}
+     *                 to fetch every page with the browser
+     * @throws SQLException if the cache database cannot be opened or prepared
+     * @throws IOException  if the cache directory cannot be created
+     */
+    public TroostwijkAuctionExtractor(boolean useCache) throws SQLException, IOException {
+        this.objectMapper = new ObjectMapper();
+        this.useCache = useCache;
+
+        if (useCache) {
+            this.cacheDb = new CacheDatabase(CACHE_DB_PATH);
+            this.cacheDb.initialize();
+        } else {
+            // No cache backend at all when caching is switched off.
+            this.cacheDb = null;
+        }
+    }
+
+    /**
+     * Starts Playwright and launches a headless Chromium instance.
+     *
+     * <p>Must be called before {@link #extractAllAuctions()}.
+     */
+    public void initialize() {
+        System.out.println("Initializing Playwright browser...");
+
+        playwright = Playwright.create();
+
+        // Sandbox flags are disabled via launch arguments.
+        BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions()
+                .setHeadless(true)
+                .setArgs(Arrays.asList("--no-sandbox", "--disable-setuid-sandbox"));
+        browser = playwright.chromium().launch(launchOptions);
+
+        System.out.println("✓ Browser ready");
+    }
+
+    /**
+     * Releases the browser, the Playwright instance and the cache database.
+     *
+     * <p>Each resource is closed even if closing an earlier one throws, so a
+     * browser shutdown failure can no longer leave Playwright or the SQLite
+     * connection open. The first exception still propagates to the caller, and
+     * in that case the confirmation message is not printed.
+     */
+    public void close() {
+        try {
+            if (browser != null) {
+                browser.close();
+            }
+        } finally {
+            try {
+                if (playwright != null) {
+                    playwright.close();
+                }
+            } finally {
+                if (cacheDb != null) {
+                    cacheDb.close();
+                }
+            }
+        }
+        System.out.println("✓ Browser and cache closed");
+    }
+
+    /**
+     * Walks the paginated auction listing and collects every auction found.
+     *
+     * <p>For each page the cache is consulted first; on a miss the page is
+     * fetched with the browser, followed by a {@code RATE_LIMIT_MS} pause.
+     * Pagination stops at the first page that cannot be fetched or that yields
+     * no auctions.
+     *
+     * <p>A freshly fetched page is written to the cache only after it produced
+     * at least one auction. Caching it before parsing would store a failed or
+     * empty response and make every run stop at that page until the entry
+     * expires.
+     *
+     * @return all auctions discovered, in page order
+     * @throws IllegalStateException if {@link #initialize()} has not been called
+     * @throws InterruptedException  if the thread is interrupted during the rate-limit pause
+     */
+    public List<Auction> extractAllAuctions() throws InterruptedException {
+        if (browser == null) {
+            throw new IllegalStateException("Browser not initialized. Call initialize() first.");
+        }
+
+        List<Auction> allAuctions = new ArrayList<>();
+        int pageNumber = 1;
+
+        System.out.println("Starting auction extraction from " + AUCTIONS_BASE_URL);
+
+        while (true) {
+            System.out.println("\n[Page " + pageNumber + "] Fetching auctions...");
+
+            // Check cache first
+            String html = loadFromCache(pageNumber);
+            boolean freshlyFetched = (html == null);
+
+            if (!freshlyFetched) {
+                System.out.println(" ✓ Loaded from cache");
+            } else {
+                html = fetchPageWithPlaywright(pageNumber);
+
+                if (html == null || html.isEmpty()) {
+                    System.out.println(" ⚠️ Failed to fetch page, stopping pagination");
+                    break;
+                }
+
+                // Rate limiting - only needed when the site was actually hit
+                Thread.sleep(RATE_LIMIT_MS);
+            }
+
+            List<Auction> pageAuctions = parseAuctionsFromHtml(html);
+
+            if (pageAuctions.isEmpty()) {
+                System.out.println(" ⚠️ No auctions found on page, stopping pagination");
+                break;
+            }
+
+            // Only cache pages that parsed successfully (see method Javadoc).
+            if (freshlyFetched && useCache) {
+                saveToCache(pageNumber, html);
+            }
+
+            System.out.println(" ✓ Found " + pageAuctions.size() + " auctions");
+            allAuctions.addAll(pageAuctions);
+            pageNumber++;
+        }
+
+        System.out.println("\n✓ Total auctions extracted: " + allAuctions.size());
+        return allAuctions;
+    }
+
+    /**
+     * Loads one listing page in a new browser tab and returns its rendered HTML.
+     *
+     * <p>The tab is opened in a try-with-resources block so it is closed even
+     * when navigation or content retrieval throws; previously such a failure
+     * left the tab open.
+     *
+     * @param pageNumber Page number (1-indexed); page 1 uses the base URL without a query string
+     * @return HTML content of the page, or {@code null} if any Playwright call failed
+     */
+    private String fetchPageWithPlaywright(int pageNumber) {
+        String url = pageNumber == 1
+                ? AUCTIONS_BASE_URL
+                : AUCTIONS_BASE_URL + "?page=" + pageNumber;
+
+        try (Page page = browser.newPage()) {
+            // Send a browser-like User-Agent header with every request from this tab
+            page.setExtraHTTPHeaders(Map.of(
+                    "User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+            ));
+
+            // Navigate and wait until network activity settles (30 s limit)
+            page.navigate(url, new Page.NavigateOptions()
+                    .setTimeout(30000)
+                    .setWaitUntil(WaitUntilState.NETWORKIDLE));
+
+            // Give auction links up to 10 s to appear
+            try {
+                page.waitForSelector("a[href^='/a/']", new Page.WaitForSelectorOptions()
+                        .setTimeout(10000));
+            } catch (Exception ignored) {
+                // Deliberate best effort: the caller treats a page without links as the end
+                System.out.println(" ⚠️ Auction selector not found, attempting to parse anyway");
+            }
+
+            return page.content();
+
+        } catch (Exception e) {
+            System.err.println(" ⚠️ Playwright error: " + e.getMessage());
+            return null;
+        }
+    }
+
+    /**
+     * Extracts auctions from listing HTML by scanning for auction links.
+     *
+     * <p>Each distinct auction ID is reported once, in order of first appearance.
+     * Links whose numeric ID does not fit in an {@code int} are skipped.
+     *
+     * @param html HTML content
+     * @return List of parsed auctions; empty when no auction link is found
+     */
+    private List<Auction> parseAuctionsFromHtml(String html) {
+        List<Auction> auctions = new ArrayList<>();
+
+        // Simple regex-based parsing for auction links.
+        // NOTE(review): the pattern and matcher declarations were missing from this
+        // method. The pattern below is reconstructed from how the groups are used
+        // further down (group 1 = href, group 2 = numeric ID) and from the
+        // "/a/<slug>-A1-<id>" / "/a/<slug>-A7-<id>" link shape - confirm against the site.
+        java.util.regex.Pattern linkPattern = java.util.regex.Pattern.compile(
+                "href=\"(/a/[^\"]*?-A[17]-(\\d+)[^\"]*)\"");
+        java.util.regex.Matcher linkMatcher = linkPattern.matcher(html);
+        Set<Integer> seenIds = new HashSet<>();
+
+        while (linkMatcher.find()) {
+            String href = linkMatcher.group(1);
+
+            int auctionId;
+            try {
+                auctionId = Integer.parseInt(linkMatcher.group(2));
+            } catch (NumberFormatException e) {
+                // Digit run too long for an int: skip this link instead of aborting the page
+                continue;
+            }
+
+            // Avoid duplicates; add() returns false when the ID was already recorded
+            if (!seenIds.add(auctionId)) {
+                continue;
+            }
+
+            Auction auction = new Auction();
+            auction.id = auctionId;
+            // Extract auction type (A1 or A7)
+            auction.type = href.contains("A1-") ? "A1" : "A7";
+            auction.title = extractTitleFromHref(href);
+            // Location is looked up in the HTML near this link
+            auction.location = extractLocationNearLink(html, href);
+            auction.url = "https://www.troostwijkauctions.com" + href;
+
+            auctions.add(auction);
+        }
+
+        return auctions;
+    }
+
+ /**
+ * Extracts location text near an auction link
+ * Looks for ", NL" or other country codes
+ */
+ private String extractLocationNearLink(String html, String href) {
+ int hrefPos = html.indexOf(href);
+ if (hrefPos == -1) return "Unknown";
+
+ // Look at 500 characters after the href for location info
+ int endPos = Math.min(hrefPos + 500, html.length());
+ String context = html.substring(hrefPos, endPos);
+
+ // Try to find location pattern like "City, NL" or "City, Country"
+ java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
+ "([A-Za-z\\s]+),\\s*([A-Z]{2})");
+ java.util.regex.Matcher locMatcher = locPattern.matcher(context);
+
+ if (locMatcher.find()) {
+ return locMatcher.group(1).trim() + ", " + locMatcher.group(2);
+ }
+
+ return "Unknown";
+ }
+
+    /**
+     * Derives a display title from the slug of an auction link.
+     *
+     * <p>Example: "/a/some-auction-title-A1-12345" becomes "Some Auction Title".
+     *
+     * @param href auction link path
+     * @return the slug words capitalized and space-separated, or "Untitled Auction"
+     *         when the href has no "/a/...-A1-" or "/a/...-A7-" part
+     */
+    private String extractTitleFromHref(String href) {
+        // The slug is everything between "/a/" and the first "-A1-" or "-A7-"
+        java.util.regex.Matcher slugMatcher =
+                java.util.regex.Pattern.compile("/a/(.+?)-A[17]-").matcher(href);
+
+        if (!slugMatcher.find()) {
+            return "Untitled Auction";
+        }
+
+        // kebab-case -> Title Case; empty parts (from "--") are dropped
+        StringJoiner titleWords = new StringJoiner(" ");
+        for (String part : slugMatcher.group(1).split("-")) {
+            if (part.isEmpty()) {
+                continue;
+            }
+            titleWords.add(Character.toUpperCase(part.charAt(0)) + part.substring(1));
+        }
+        return titleWords.toString().trim();
+    }
+
+    /**
+     * Looks up the cached HTML of a listing page.
+     *
+     * @param pageNumber Page number
+     * @return the cached HTML, or {@code null} when caching is off or the cache
+     *         returns nothing for this page's URL
+     */
+    private String loadFromCache(int pageNumber) {
+        if (useCache && cacheDb != null) {
+            // The cache key is the page URL; page 1 has no query string
+            String url = AUCTIONS_BASE_URL;
+            if (pageNumber != 1) {
+                url = AUCTIONS_BASE_URL + "?page=" + pageNumber;
+            }
+            return cacheDb.get(url);
+        }
+        return null;
+    }
+
+    /**
+     * Stores the HTML of a listing page in the cache, keyed by the page URL.
+     *
+     * <p>Does nothing when caching is off. Entries are stored with a lifetime
+     * of {@code CACHE_EXPIRATION_HOURS}.
+     *
+     * @param pageNumber Page number
+     * @param html HTML content
+     */
+    private void saveToCache(int pageNumber, String html) {
+        if (useCache && cacheDb != null) {
+            // Same key scheme as the cache lookup: page 1 has no query string
+            String url = AUCTIONS_BASE_URL;
+            if (pageNumber != 1) {
+                url = AUCTIONS_BASE_URL + "?page=" + pageNumber;
+            }
+            cacheDb.put(url, html, CACHE_EXPIRATION_HOURS);
+        }
+    }
+
+    /**
+     * Filters auctions by location.
+     *
+     * <p>An auction is kept when its location text contains {@code locationFilter}
+     * (case-sensitive substring match). Null entries and auctions without a
+     * location are dropped instead of causing a {@link NullPointerException}.
+     *
+     * @param auctions List of auctions
+     * @param locationFilter Location string to match (e.g., "NL")
+     * @return unmodifiable list of the matching auctions, in their original order
+     */
+    public static List<Auction> filterByLocation(List<Auction> auctions, String locationFilter) {
+        return auctions.stream()
+                .filter(a -> a != null
+                        && a.location != null
+                        && a.location.contains(locationFilter))
+                .toList();
+    }
+
+    /**
+     * Entry point for testing: extracts all auctions with caching enabled and
+     * prints totals plus up to ten auctions whose location contains "NL".
+     *
+     * @param args unused
+     * @throws Exception if the cache cannot be set up or extraction fails
+     */
+    public static void main(String[] args) throws Exception {
+        System.out.println("=== Troostwijk Auction Extractor ===\n");
+
+        // Enable caching by default
+        boolean useCache = true;
+        TroostwijkAuctionExtractor extractor = new TroostwijkAuctionExtractor(useCache);
+
+        try {
+            // Initialize browser
+            extractor.initialize();
+
+            // Extract all auctions
+            List<Auction> allAuctions = extractor.extractAllAuctions();
+
+            // Filter for Dutch auctions only
+            List<Auction> dutchAuctions = filterByLocation(allAuctions, "NL");
+
+            System.out.println("\n=== Results ===");
+            System.out.println("Total auctions found: " + allAuctions.size());
+            System.out.println("Dutch auctions (NL): " + dutchAuctions.size());
+
+            // Display first 10 Dutch auctions
+            System.out.println("\n=== Sample Dutch Auctions ===");
+            dutchAuctions.stream()
+                    .limit(10)
+                    .forEach(System.out::println);
+
+        } finally {
+            // Always release the browser and cache, even when extraction fails
+            extractor.close();
+        }
+    }
+
+ /**
+ * SQLite-based caching system for HTML pages with expiration support
+ */
+ static class CacheDatabase {
+ // Filesystem path of the SQLite file; appended to "jdbc:sqlite:" when connecting.
+ private final String dbPath;
+ // Assigned by initialize(); stays null until then.
+ private Connection connection;
+
+ /**
+ * Stores the database path only; no connection is opened until initialize() is called.
+ *
+ * @param dbPath path of the SQLite database file
+ */
+ public CacheDatabase(String dbPath) {
+ this.dbPath = dbPath;
+ }
+
+        /**
+         * Opens the database, creates the schema if needed and purges expired rows.
+         *
+         * <p>The connection is published to the {@code connection} field only after
+         * the schema statements succeed. If they fail, the new connection is
+         * closed before the exception is rethrown, so a failed initialization does
+         * not leave an open SQLite handle behind.
+         *
+         * @throws SQLException if connecting or creating the schema fails
+         * @throws IOException  if the cache directory cannot be created
+         */
+        public void initialize() throws SQLException, IOException {
+            // Create cache directory if it doesn't exist
+            Path cacheDir = Paths.get(dbPath).getParent();
+            if (cacheDir != null) {
+                Files.createDirectories(cacheDir);
+            }
+
+            // Create cache table with URL as primary key
+            String createTable = """
+                CREATE TABLE IF NOT EXISTS page_cache (
+                    url TEXT PRIMARY KEY,
+                    html TEXT NOT NULL,
+                    cached_at INTEGER NOT NULL,
+                    expires_at INTEGER NOT NULL
+                )
+                """;
+
+            Connection opened = DriverManager.getConnection("jdbc:sqlite:" + dbPath);
+            try (Statement stmt = opened.createStatement()) {
+                stmt.execute(createTable);
+                // Create index on expires_at for efficient cleanup
+                stmt.execute("CREATE INDEX IF NOT EXISTS idx_expires_at ON page_cache(expires_at)");
+            } catch (SQLException e) {
+                try {
+                    opened.close();
+                } catch (SQLException closeError) {
+                    e.addSuppressed(closeError);
+                }
+                throw e;
+            }
+            connection = opened;
+
+            // Clean up expired entries on initialization
+            cleanupExpired();
+
+            System.out.println("✓ Cache database initialized");
+        }
+
+        /**
+         * Get cached HTML for a URL if it exists and hasn't expired.
+         *
+         * <p>Read errors are reported on stderr and treated as a cache miss. The
+         * result set is now closed together with the statement instead of being
+         * left open.
+         *
+         * @param url The URL to look up
+         * @return Cached HTML or null if not found/expired or on a read error
+         */
+        public synchronized String get(String url) {
+            String sql = "SELECT html FROM page_cache WHERE url = ? AND expires_at > ?";
+
+            try (PreparedStatement ps = connection.prepareStatement(sql)) {
+                ps.setString(1, url);
+                // Expiry is compared against the current time in epoch seconds
+                ps.setLong(2, Instant.now().getEpochSecond());
+
+                try (ResultSet rs = ps.executeQuery()) {
+                    if (rs.next()) {
+                        return rs.getString("html");
+                    }
+                }
+            } catch (SQLException e) {
+                System.err.println("Cache read error: " + e.getMessage());
+            }
+
+            return null;
+        }
+
+        /**
+         * Inserts or overwrites the cache row for a URL.
+         *
+         * <p>Write errors are reported on stderr and otherwise ignored.
+         *
+         * @param url The URL to cache
+         * @param html The HTML content
+         * @param expirationHours Hours until cache expires
+         */
+        public synchronized void put(String url, String html, long expirationHours) {
+            // Timestamps are stored as epoch seconds
+            final long cachedAt = Instant.now().getEpochSecond();
+            final long expiry = cachedAt + (expirationHours * 3600);
+
+            final String upsert = """
+                INSERT OR REPLACE INTO page_cache (url, html, cached_at, expires_at)
+                VALUES (?, ?, ?, ?)
+                """;
+
+            try (PreparedStatement statement = connection.prepareStatement(upsert)) {
+                statement.setString(1, url);
+                statement.setString(2, html);
+                statement.setLong(3, cachedAt);
+                statement.setLong(4, expiry);
+                statement.executeUpdate();
+            } catch (SQLException e) {
+                System.err.println("Cache write error: " + e.getMessage());
+            }
+        }
+
+        /**
+         * Deletes every cache row whose expiry time is at or before the current time.
+         *
+         * <p>Prints the number of removed rows when at least one was deleted;
+         * errors are reported on stderr and otherwise ignored.
+         */
+        public synchronized void cleanupExpired() {
+            final String purge = "DELETE FROM page_cache WHERE expires_at <= ?";
+
+            try (PreparedStatement statement = connection.prepareStatement(purge)) {
+                statement.setLong(1, Instant.now().getEpochSecond());
+                final int removed = statement.executeUpdate();
+                if (removed <= 0) {
+                    return;
+                }
+                System.out.println("✓ Cleaned up " + removed + " expired cache entries");
+            } catch (SQLException e) {
+                System.err.println("Cache cleanup error: " + e.getMessage());
+            }
+        }
+
+        /**
+         * Prints cache statistics: total, valid and expired entry counts and the
+         * summed HTML length in KB.
+         *
+         * <p>Errors are reported on stderr and otherwise ignored. The result set is
+         * now closed together with the statement instead of being left open.
+         */
+        public synchronized void printStats() {
+            String sql = "SELECT COUNT(*) as total, " +
+                    "SUM(CASE WHEN expires_at > ? THEN 1 ELSE 0 END) as valid, " +
+                    "SUM(LENGTH(html)) as total_size " +
+                    "FROM page_cache";
+
+            try (PreparedStatement ps = connection.prepareStatement(sql)) {
+                // "valid" counts rows that expire after the current time
+                ps.setLong(1, Instant.now().getEpochSecond());
+
+                try (ResultSet rs = ps.executeQuery()) {
+                    if (rs.next()) {
+                        int total = rs.getInt("total");
+                        int valid = rs.getInt("valid");
+                        long size = rs.getLong("total_size");
+
+                        System.out.println("\n=== Cache Statistics ===");
+                        System.out.println("Total entries: " + total);
+                        System.out.println("Valid entries: " + valid);
+                        System.out.println("Expired entries: " + (total - valid));
+                        System.out.println("Total size: " + (size / 1024) + " KB");
+                    }
+                }
+            } catch (SQLException e) {
+                System.err.println("Cache stats error: " + e.getMessage());
+            }
+        }
+
+        /**
+         * Closes the database connection if one was opened.
+         *
+         * <p>A failure while closing is reported on stderr and not rethrown.
+         */
+        public void close() {
+            if (connection == null) {
+                return;
+            }
+            try {
+                connection.close();
+            } catch (SQLException e) {
+                System.err.println("Error closing cache database: " + e.getMessage());
+            }
+        }
+ }
+}