package com.auction;
import com.microsoft.playwright.*;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.microsoft.playwright.options.WaitUntilState;
import java.io.IOException;
import java.nio.file.*;
import java.sql.*;
import java.time.Instant;
import java.util.*;
/**
* TroostwijkAuctionExtractor
*
* Extracts auction listings from https://www.troostwijkauctions.com/auctions
* using Playwright for Java (headless browser automation).
*
* Features:
* - Uses Playwright for Java to load JavaScript-rendered content
* - Iterates through all pages of auction listings
* - Rate limiting: 200ms between each page request
* - Caches visited pages in SQLite database with expiration times
* - Extracts auction metadata: ID, title, location, URL
*
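* Dependencies (Maven):
* <pre>{@code
* <dependency>
*   <groupId>com.microsoft.playwright</groupId>
*   <artifactId>playwright</artifactId>
*   <version>1.40.0</version>
* </dependency>
* <dependency>
*   <groupId>com.fasterxml.jackson.core</groupId>
*   <artifactId>jackson-databind</artifactId>
*   <version>2.17.0</version>
* </dependency>
* <dependency>
*   <groupId>org.xerial</groupId>
*   <artifactId>sqlite-jdbc</artifactId>
*   <version>3.45.1.0</version>
* </dependency>
* }</pre>
*
* After adding the dependencies, run: mvn exec:java -e -D exec.mainClass=com.microsoft.playwright.CLI -D exec.args="install"
* This downloads the browser binaries needed by Playwright.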
*/
public class TroostwijkAuctionExtractor {
// Listing overview URL; page N (N > 1) is requested as this URL plus "?page=N".
private static final String AUCTIONS_BASE_URL = "https://www.troostwijkauctions.com/auctions";
// Milliseconds slept after each live (non-cached) page fetch.
private static final int RATE_LIMIT_MS = 200;
// SQLite file backing the page cache; a relative path, so it depends on the working directory.
private static final String CACHE_DB_PATH = "cache/page_cache.db";
private static final long CACHE_EXPIRATION_HOURS = 24; // Lifetime of a cached page, in hours
// Jackson mapper created in the constructor; no other code shown in this class reads it.
private final ObjectMapper objectMapper;
// When false, cacheDb is null and every page is fetched live.
private final boolean useCache;
// Page cache handle; null when caching is disabled.
private final CacheDatabase cacheDb;
private final int maxPageVisits; // Cap on live page fetches; only values > 0 are enforced
private int pageVisitCount; // Live fetch attempts so far (cache hits are not counted)
// Playwright runtime and browser; both stay null until initialize() is called.
private Playwright playwright;
private Browser browser;
/**
 * Plain data holder describing one auction listing.
 * All fields are public and mutable.
 */
public static class Auction {
    /** Numeric auction identifier. */
    public int id;
    /** Human-readable auction title. */
    public String title;
    /** Location text, e.g. a city followed by a country code. */
    public String location;
    /** Absolute URL of the auction page. */
    public String url;
    /** Auction type marker, e.g. "A1" or "A7". */
    public String type;

    /**
     * Renders every field in a single line for console output.
     *
     * @return a string of the form {@code Auction{id=..., type=..., title='...', ...}}
     */
    @Override
    public String toString() {
        return "Auction{id=%d, type=%s, title='%s', location='%s', url='%s'}"
                .formatted(id, type, title, location, url);
    }
}
/**
 * Creates an extractor with an explicit cap on live page fetches.
 * When caching is enabled the SQLite cache is opened and initialized here.
 *
 * @param useCache      whether fetched pages are stored in and served from the cache database
 * @param maxPageVisits maximum number of live page fetches; only values above 0 are enforced
 * @throws SQLException if the cache database cannot be initialized
 * @throws IOException  if the cache directory cannot be created
 */
public TroostwijkAuctionExtractor(boolean useCache, int maxPageVisits) throws SQLException, IOException {
    this.useCache = useCache;
    this.maxPageVisits = maxPageVisits;
    this.pageVisitCount = 0;
    this.objectMapper = new ObjectMapper();
    if (useCache) {
        CacheDatabase database = new CacheDatabase(CACHE_DB_PATH);
        database.initialize();
        this.cacheDb = database;
    } else {
        this.cacheDb = null;
    }
}

/**
 * Creates an extractor without a cap on live page fetches.
 *
 * @param useCache whether fetched pages are stored in and served from the cache database
 * @throws SQLException if the cache database cannot be initialized
 * @throws IOException  if the cache directory cannot be created
 */
public TroostwijkAuctionExtractor(boolean useCache) throws SQLException, IOException {
    this(useCache, 0); // 0 disables the fetch cap
}
/**
 * Starts Playwright and launches a headless Chromium browser.
 * Must be called before {@code extractAllAuctions()}.
 *
 * <p>Calling this while a browser is already open is a no-op, so a repeated
 * call no longer leaks the previous browser. If the launch fails, the
 * Playwright instance created here is closed before the exception propagates.
 */
public void initialize() {
    if (browser != null) {
        return; // already initialized; keep the existing browser
    }
    System.out.println("Initializing Playwright browser...");
    Playwright created = Playwright.create();
    try {
        this.browser = created.chromium().launch(new BrowserType.LaunchOptions()
                .setHeadless(true)
                .setArgs(Arrays.asList("--no-sandbox", "--disable-setuid-sandbox")));
    } catch (RuntimeException launchError) {
        // Do not leave the Playwright driver process running after a failed launch.
        try {
            created.close();
        } catch (RuntimeException closeError) {
            launchError.addSuppressed(closeError);
        }
        throw launchError;
    }
    this.playwright = created;
    System.out.println("✓ Browser ready");
}
/**
 * Closes the browser, the Playwright instance and the cache database.
 * Call this when extraction is finished.
 *
 * <p>Each resource is released even if closing an earlier one throws, and the
 * browser/Playwright references are cleared so the extractor reports
 * "not initialized" instead of using a closed browser afterwards.
 */
public void close() {
    try {
        if (browser != null) {
            browser.close();
        }
    } finally {
        browser = null;
        try {
            if (playwright != null) {
                playwright.close();
            }
        } finally {
            playwright = null;
            if (cacheDb != null) {
                cacheDb.close();
            }
        }
    }
    System.out.println("✓ Browser and cache closed");
}
/**
 * Walks the paginated auction overview and collects every auction found.
 *
 * <p>For each page the cache is consulted first; on a miss the page is fetched
 * with Playwright, stored in the cache (when enabled) and followed by a
 * rate-limit pause. Pagination stops when
 * <ul>
 *   <li>the live-fetch cap ({@code maxPageVisits > 0}) has been reached,</li>
 *   <li>a fetch fails or returns empty HTML,</li>
 *   <li>a page contains no auctions, or</li>
 *   <li>a page contains only auctions that were already collected (guards
 *       against an endless loop when the site keeps serving the same page).</li>
 * </ul>
 * Auctions are de-duplicated by id across pages.
 *
 * @return all discovered auctions, in discovery order
 * @throws InterruptedException  if the rate-limit sleep is interrupted
 * @throws IllegalStateException if {@code initialize()} has not been called
 */
public List<Auction> extractAllAuctions() throws InterruptedException {
    if (browser == null) {
        throw new IllegalStateException("Browser not initialized. Call initialize() first.");
    }
    List<Auction> allAuctions = new ArrayList<>();
    Set<Integer> collectedIds = new HashSet<>();
    int pageNumber = 1;
    System.out.println("Starting auction extraction from " + AUCTIONS_BASE_URL);
    while (true) {
        System.out.println("\n[Page " + pageNumber + "] Fetching auctions...");
        String html = loadFromCache(pageNumber);
        if (html != null) {
            System.out.println(" ✓ Loaded from cache");
        } else {
            // Only live fetches count against the visit cap.
            if (maxPageVisits > 0 && pageVisitCount >= maxPageVisits) {
                System.out.println(" ⚠️ Reached maximum page visit limit (" + maxPageVisits + "), stopping");
                break;
            }
            html = fetchPageWithPlaywright(pageNumber);
            pageVisitCount++; // failed attempts count as visits too
            if (html == null || html.isEmpty()) {
                System.out.println(" ⚠️ Failed to fetch page, stopping pagination");
                break;
            }
            System.out.println(" ✓ Fetched from website (visit " + pageVisitCount +
                    (maxPageVisits > 0 ? "/" + maxPageVisits : "") + ")");
            if (useCache) {
                saveToCache(pageNumber, html);
            }
            // Rate limiting: pause before the next request.
            Thread.sleep(RATE_LIMIT_MS);
        }

        List<Auction> pageAuctions = parseAuctionsFromHtml(html);
        if (pageAuctions.isEmpty()) {
            System.out.println(" ⚠️ No auctions found on page, stopping pagination");
            break;
        }
        int newlyAdded = 0;
        for (Auction auction : pageAuctions) {
            if (collectedIds.add(auction.id)) {
                allAuctions.add(auction);
                newlyAdded++;
            }
        }
        if (newlyAdded == 0) {
            System.out.println(" ⚠️ Page contains only already-seen auctions, stopping pagination");
            break;
        }
        System.out.println(" ✓ Found " + pageAuctions.size() + " auctions");
        pageNumber++;
    }
    System.out.println("\n✓ Total auctions extracted: " + allAuctions.size());
    return allAuctions;
}
/**
 * Loads one overview page in a fresh browser tab and returns its rendered HTML.
 *
 * <p>The tab is opened in a try-with-resources block so it is closed even when
 * navigation or content retrieval throws; previously such a failure leaked the tab.
 *
 * @param pageNumber page number (1-indexed); page 1 uses the base URL without a query string
 * @return the page's HTML, or {@code null} if Playwright reported an error
 */
private String fetchPageWithPlaywright(int pageNumber) {
    String url = pageNumber == 1
            ? AUCTIONS_BASE_URL
            : AUCTIONS_BASE_URL + "?page=" + pageNumber;
    try (Page page = browser.newPage()) {
        // Send a browser-like User-Agent header with every request from this tab.
        page.setExtraHTTPHeaders(Map.of(
                "User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        ));
        page.navigate(url, new Page.NavigateOptions()
                .setTimeout(30000)
                .setWaitUntil(WaitUntilState.NETWORKIDLE));
        // Give the auction links a chance to render; a missing selector is not fatal.
        try {
            page.waitForSelector("a[href^='/a/']", new Page.WaitForSelectorOptions()
                    .setTimeout(10000));
        } catch (PlaywrightException selectorError) {
            System.out.println(" ⚠️ Auction selector not found, attempting to parse anyway");
        }
        return page.content();
    } catch (Exception e) {
        System.err.println(" ⚠️ Playwright error: " + e.getMessage());
        return null;
    }
}
/**
 * Matches auction links of the form {@code href="/a/<slug>-A1-<id>..."} (or {@code A7}).
 * Group 1 is the full href, group 2 the type marker, group 3 the numeric id.
 *
 * NOTE(review): the original pattern declaration was missing from this method;
 * this one is reconstructed from the documented URL shape — confirm against live markup.
 */
private static final java.util.regex.Pattern AUCTION_LINK_PATTERN =
        java.util.regex.Pattern.compile("href=\"(/a/[^\"]*?-(A[17])-(\\d+)[^\"]*)\"");

/**
 * Extracts auctions from the HTML of one overview page.
 *
 * <p>Every auction link is turned into an {@link Auction}; links repeating an
 * id already seen on this page are skipped, as are ids too large for an
 * {@code int}. The type comes from the matched marker rather than a substring
 * search, so an "A1-" elsewhere in the slug cannot mislabel an A7 auction.
 *
 * @param html HTML content of the page; {@code null} or empty yields an empty list
 * @return auctions in document order, without duplicates
 */
private List<Auction> parseAuctionsFromHtml(String html) {
    List<Auction> auctions = new ArrayList<>();
    if (html == null || html.isEmpty()) {
        return auctions;
    }
    Set<Integer> seenIds = new HashSet<>();
    java.util.regex.Matcher linkMatcher = AUCTION_LINK_PATTERN.matcher(html);
    while (linkMatcher.find()) {
        String href = linkMatcher.group(1);
        int auctionId;
        try {
            auctionId = Integer.parseInt(linkMatcher.group(3));
        } catch (NumberFormatException tooLarge) {
            // Digits only, so this can only be an overflow; skip the malformed link.
            continue;
        }
        // The same auction is usually linked several times per card (image, title, ...).
        if (!seenIds.add(auctionId)) {
            continue;
        }
        Auction auction = new Auction();
        auction.id = auctionId;
        auction.type = linkMatcher.group(2);
        auction.title = extractTitleFromHref(href);
        auction.location = extractLocationNearLink(html, href);
        auction.url = "https://www.troostwijkauctions.com" + href;
        auctions.add(auction);
    }
    return auctions;
}
/** Number of characters inspected before the link position. */
private static final int LOCATION_CONTEXT_BEFORE = 500;
/** Number of characters inspected from the link position onwards. */
private static final int LOCATION_CONTEXT_AFTER = 1000;

/** Plain text form: {@code City, NL}. Group 1 = city, group 2 = country code. */
private static final java.util.regex.Pattern PLAIN_LOCATION_PATTERN =
        java.util.regex.Pattern.compile(
                "([A-Za-z][A-Za-z\\s\\-']+),\\s*([A-Z]{2})(?![A-Za-z])");

/**
 * Markup form: city text at the start of an element, then at least one tag or
 * HTML comment, then the country code.
 *
 * NOTE(review): the original regex had lost its tag fragments; this is a
 * reconstruction of the apparent intent — verify against real page markup.
 */
private static final java.util.regex.Pattern MARKUP_LOCATION_PATTERN =
        java.util.regex.Pattern.compile(
                ">([A-Za-z][A-Za-z\\s\\-',]+?),?\\s*(?:(?:<!--.*?-->|<[^>]*>)\\s*)+([A-Z]{2})(?![A-Za-z])");

/**
 * Fallback: a two-letter upper-case code directly after a tag or comment.
 * The original alternation had empty branches and therefore matched any two
 * capitals at the end of a word (for example "ML" in "HTML").
 */
private static final java.util.regex.Pattern COUNTRY_AFTER_TAG_PATTERN =
        java.util.regex.Pattern.compile(
                "(?:-->|<[^>]*>)\\s*([A-Z]{2})(?![A-Za-z])");

/**
 * Heuristically finds the location shown near an auction link.
 *
 * <p>Looks at a window of up to 500 characters before and 1000 characters after
 * the first occurrence of {@code href} in {@code html} and tries three
 * patterns in order: plain "City, CC", city and country split by markup, and
 * finally a bare country code following a tag.
 *
 * @param html full page HTML
 * @param href link whose surroundings are inspected
 * @return "City, CC", "Unknown, CC" when only a country code is found, or
 *         "Unknown" when nothing matches or the link is not present
 */
private String extractLocationNearLink(String html, String href) {
    int hrefPos = html.indexOf(href);
    if (hrefPos == -1) {
        return "Unknown";
    }
    int startPos = Math.max(hrefPos - LOCATION_CONTEXT_BEFORE, 0);
    int endPos = Math.min(hrefPos + LOCATION_CONTEXT_AFTER, html.length());
    String context = html.substring(startPos, endPos);

    // Pattern 1: classic text format "City, NL"
    java.util.regex.Matcher plainMatcher = PLAIN_LOCATION_PATTERN.matcher(context);
    if (plainMatcher.find()) {
        String location = plainMatcher.group(1).trim() + ", " + plainMatcher.group(2);
        System.out.println(" Found location: " + location + " for auction " + href);
        return location;
    }

    // Pattern 2: city and country code separated by tags or comment nodes
    java.util.regex.Matcher markupMatcher = MARKUP_LOCATION_PATTERN.matcher(context);
    if (markupMatcher.find()) {
        String city = markupMatcher.group(1).trim().replaceAll(",$", ""); // drop trailing comma
        String location = city + ", " + markupMatcher.group(2);
        System.out.println(" Found location (HTML): " + location + " for auction " + href);
        return location;
    }

    // Pattern 3: fallback, only a country code right after a tag
    java.util.regex.Matcher countryMatcher = COUNTRY_AFTER_TAG_PATTERN.matcher(context);
    if (countryMatcher.find()) {
        String country = countryMatcher.group(1);
        System.out.println(" Found country code: " + country + " for auction " + href);
        return "Unknown, " + country;
    }

    System.out.println(" ⚠️ No location found for auction " + href);
    return "Unknown";
}
/**
 * Derives a display title from the slug part of an auction link.
 * For example {@code /a/some-auction-title-A1-12345} becomes "Some Auction Title".
 *
 * @param href auction link path
 * @return the slug with hyphens replaced by spaces and each word's first
 *         character upper-cased, or "Untitled Auction" when the link does not
 *         contain a slug followed by "-A1-" or "-A7-"
 */
private String extractTitleFromHref(String href) {
    // The slug is everything between "/a/" and the first "-A1-" / "-A7-" marker.
    java.util.regex.Matcher slugMatcher =
            java.util.regex.Pattern.compile("/a/(.+?)-A[17]-").matcher(href);
    if (!slugMatcher.find()) {
        return "Untitled Auction";
    }
    StringJoiner capitalizedWords = new StringJoiner(" ");
    for (String part : slugMatcher.group(1).split("-")) {
        if (part.isEmpty()) {
            continue; // consecutive hyphens produce empty parts
        }
        capitalizedWords.add(Character.toUpperCase(part.charAt(0)) + part.substring(1));
    }
    return capitalizedWords.toString().trim();
}
/**
 * Looks up the cached HTML of an overview page.
 * The cache key is the page URL: the base URL for page 1, otherwise the base
 * URL followed by {@code ?page=N}.
 *
 * @param pageNumber page number
 * @return the cached HTML, or {@code null} when caching is disabled or the
 *         cache lookup returns nothing
 */
private String loadFromCache(int pageNumber) {
    boolean cacheAvailable = useCache && cacheDb != null;
    if (!cacheAvailable) {
        return null;
    }
    String cacheKey = AUCTIONS_BASE_URL;
    if (pageNumber != 1) {
        cacheKey = AUCTIONS_BASE_URL + "?page=" + pageNumber;
    }
    return cacheDb.get(cacheKey);
}
/**
 * Stores the HTML of an overview page in the cache under its page URL,
 * using the configured expiration time. Does nothing when caching is disabled.
 *
 * @param pageNumber page number
 * @param html       HTML content to store
 */
private void saveToCache(int pageNumber, String html) {
    if (useCache && cacheDb != null) {
        StringBuilder cacheKey = new StringBuilder(AUCTIONS_BASE_URL);
        if (pageNumber != 1) {
            cacheKey.append("?page=").append(pageNumber);
        }
        cacheDb.put(cacheKey.toString(), html, CACHE_EXPIRATION_HOURS);
    }
}
/**
 * Returns the auctions whose location text contains the given substring.
 *
 * <p>The element type is declared explicitly: with a raw {@code List} the
 * lambda parameter is an {@code Object} and {@code a.location} does not
 * compile. Auctions without a location are treated as non-matching.
 *
 * @param auctions       auctions to filter
 * @param locationFilter substring to look for (e.g. "NL"); must not be null
 * @return an unmodifiable list of the matching auctions, in input order
 * @throws NullPointerException if {@code locationFilter} is null
 */
public static List<Auction> filterByLocation(List<Auction> auctions, String locationFilter) {
    Objects.requireNonNull(locationFilter, "locationFilter");
    return auctions.stream()
            .filter(a -> a.location != null && a.location.contains(locationFilter))
            .toList();
}
/**
 * Command-line entry point: extracts all auctions and prints a summary plus
 * up to ten auctions whose location contains "NL".
 *
 * <p>Arguments:
 * <ul>
 *   <li>{@code --max-visits <n>}: maximum number of live page fetches (0 = unlimited, default)</li>
 *   <li>{@code --no-cache}: disable caching</li>
 *   <li>{@code --help}: print usage and exit</li>
 * </ul>
 * A missing, non-numeric or negative {@code --max-visits} value is reported on
 * stderr and the program exits without scraping, instead of crashing with a
 * stack trace or silently running without a limit.
 *
 * @param args command-line arguments
 * @throws Exception if cache setup or extraction fails
 */
public static void main(String[] args) throws Exception {
    System.out.println("=== Troostwijk Auction Extractor ===\n");
    boolean useCache = true;
    int maxVisits = 0; // 0 = unlimited
    for (int i = 0; i < args.length; i++) {
        switch (args[i]) {
            case "--max-visits" -> {
                if (i + 1 >= args.length) {
                    System.err.println("Missing value for --max-visits");
                    return;
                }
                String rawValue = args[++i];
                try {
                    maxVisits = Integer.parseInt(rawValue);
                } catch (NumberFormatException e) {
                    System.err.println("Invalid value for --max-visits: " + rawValue);
                    return;
                }
                if (maxVisits < 0) {
                    System.err.println("Invalid value for --max-visits: " + rawValue);
                    return;
                }
                System.out.println("Max page visits set to: " + maxVisits);
            }
            case "--no-cache" -> {
                useCache = false;
                System.out.println("Caching disabled");
            }
            case "--help" -> {
                System.out.println("Usage: java TroostwijkAuctionExtractor [options]");
                System.out.println("Options:");
                System.out.println(" --max-visits <n> : Limit actual page fetches to n (0 = unlimited)");
                System.out.println(" --no-cache : Disable page caching");
                System.out.println(" --help : Show this help message");
                return;
            }
            default -> System.err.println("Ignoring unknown option: " + args[i]);
        }
    }

    TroostwijkAuctionExtractor extractor = new TroostwijkAuctionExtractor(useCache, maxVisits);
    try {
        extractor.initialize();
        List<Auction> allAuctions = extractor.extractAllAuctions();
        // Keep only auctions whose location mentions the Netherlands.
        List<Auction> dutchAuctions = filterByLocation(allAuctions, "NL");
        System.out.println("\n=== Results ===");
        System.out.println("Total auctions found: " + allAuctions.size());
        System.out.println("Dutch auctions (NL): " + dutchAuctions.size());
        System.out.println("Actual page visits: " + extractor.pageVisitCount);
        System.out.println("\n=== Sample Dutch Auctions ===");
        dutchAuctions.stream()
                .limit(10)
                .forEach(System.out::println);
    } finally {
        // Release the browser and cache even when extraction fails.
        extractor.close();
    }
}
/**
 * SQLite-backed cache of page HTML keyed by URL, with per-entry expiration.
 *
 * <p>Reads and writes are best-effort: SQL errors are logged to stderr and
 * treated as a cache miss or a skipped write. Calling any operation before
 * {@link #initialize()} or after {@link #close()} is handled the same way
 * instead of failing with a {@code NullPointerException}. All methods that
 * touch the connection are {@code synchronized}.
 */
static class CacheDatabase implements AutoCloseable {
    private final String dbPath;
    private Connection connection;

    /**
     * @param dbPath path of the SQLite database file; no I/O happens until {@link #initialize()}
     */
    public CacheDatabase(String dbPath) {
        this.dbPath = dbPath;
    }

    /**
     * Opens the database, creates the schema if needed and purges expired entries.
     * A call while the database is already open does nothing. If schema creation
     * fails, the freshly opened connection is closed before the error propagates.
     *
     * @throws SQLException if the database cannot be opened or the schema cannot be created
     * @throws IOException  if the parent directory of the database file cannot be created
     */
    public synchronized void initialize() throws SQLException, IOException {
        if (connection != null) {
            return;
        }
        Path cacheDir = Paths.get(dbPath).getParent();
        if (cacheDir != null) {
            Files.createDirectories(cacheDir);
        }
        Connection opened = DriverManager.getConnection("jdbc:sqlite:" + dbPath);
        String createTable = """
                CREATE TABLE IF NOT EXISTS page_cache (
                    url TEXT PRIMARY KEY,
                    html TEXT NOT NULL,
                    cached_at INTEGER NOT NULL,
                    expires_at INTEGER NOT NULL
                )
                """;
        try (Statement stmt = opened.createStatement()) {
            stmt.execute(createTable);
            // Index used by the expiry cleanup.
            stmt.execute("CREATE INDEX IF NOT EXISTS idx_expires_at ON page_cache(expires_at)");
        } catch (SQLException schemaError) {
            try {
                opened.close();
            } catch (SQLException closeError) {
                schemaError.addSuppressed(closeError);
            }
            throw schemaError;
        }
        connection = opened;
        cleanupExpired();
        System.out.println("✓ Cache database initialized");
    }

    /**
     * Returns the cached HTML for a URL if an unexpired entry exists.
     *
     * @param url the URL to look up
     * @return cached HTML, or {@code null} if missing, expired, unreadable or the database is not open
     */
    public synchronized String get(String url) {
        if (!isOpen("read")) {
            return null;
        }
        String sql = "SELECT html FROM page_cache WHERE url = ? AND expires_at > ?";
        try (PreparedStatement ps = connection.prepareStatement(sql)) {
            ps.setString(1, url);
            ps.setLong(2, Instant.now().getEpochSecond());
            try (ResultSet rs = ps.executeQuery()) {
                return rs.next() ? rs.getString("html") : null;
            }
        } catch (SQLException e) {
            System.err.println("Cache read error: " + e.getMessage());
            return null;
        }
    }

    /**
     * Stores HTML for a URL, replacing any existing entry.
     *
     * @param url             the URL to cache
     * @param html            the HTML content
     * @param expirationHours hours from now until the entry expires
     */
    public synchronized void put(String url, String html, long expirationHours) {
        if (!isOpen("write")) {
            return;
        }
        String sql = """
                INSERT OR REPLACE INTO page_cache (url, html, cached_at, expires_at)
                VALUES (?, ?, ?, ?)
                """;
        long now = Instant.now().getEpochSecond();
        long expiresAt = now + (expirationHours * 3600);
        try (PreparedStatement ps = connection.prepareStatement(sql)) {
            ps.setString(1, url);
            ps.setString(2, html);
            ps.setLong(3, now);
            ps.setLong(4, expiresAt);
            ps.executeUpdate();
        } catch (SQLException e) {
            System.err.println("Cache write error: " + e.getMessage());
        }
    }

    /**
     * Deletes all entries whose expiration time has passed.
     */
    public synchronized void cleanupExpired() {
        if (!isOpen("cleanup")) {
            return;
        }
        String sql = "DELETE FROM page_cache WHERE expires_at <= ?";
        try (PreparedStatement ps = connection.prepareStatement(sql)) {
            ps.setLong(1, Instant.now().getEpochSecond());
            int deleted = ps.executeUpdate();
            if (deleted > 0) {
                System.out.println("✓ Cleaned up " + deleted + " expired cache entries");
            }
        } catch (SQLException e) {
            System.err.println("Cache cleanup error: " + e.getMessage());
        }
    }

    /**
     * Prints the total, valid and expired entry counts and the summed
     * {@code LENGTH(html)} of all entries (reported in KB) to stdout.
     */
    public synchronized void printStats() {
        if (!isOpen("stats")) {
            return;
        }
        String sql = "SELECT COUNT(*) as total, " +
                "SUM(CASE WHEN expires_at > ? THEN 1 ELSE 0 END) as valid, " +
                "SUM(LENGTH(html)) as total_size " +
                "FROM page_cache";
        try (PreparedStatement ps = connection.prepareStatement(sql)) {
            ps.setLong(1, Instant.now().getEpochSecond());
            try (ResultSet rs = ps.executeQuery()) {
                if (rs.next()) {
                    int total = rs.getInt("total");
                    int valid = rs.getInt("valid");
                    long size = rs.getLong("total_size");
                    System.out.println("\n=== Cache Statistics ===");
                    System.out.println("Total entries: " + total);
                    System.out.println("Valid entries: " + valid);
                    System.out.println("Expired entries: " + (total - valid));
                    System.out.println("Total size: " + (size / 1024) + " KB");
                }
            }
        } catch (SQLException e) {
            System.err.println("Cache stats error: " + e.getMessage());
        }
    }

    /**
     * Closes the database connection. Safe to call repeatedly or before
     * {@link #initialize()}; afterwards the cache behaves as "not open".
     */
    @Override
    public synchronized void close() {
        if (connection == null) {
            return;
        }
        try {
            connection.close();
        } catch (SQLException e) {
            System.err.println("Error closing cache database: " + e.getMessage());
        } finally {
            connection = null;
        }
    }

    /** Reports on stderr and returns false when no connection is open for the given operation. */
    private boolean isOpen(String operation) {
        if (connection != null) {
            return true;
        }
        System.err.println("Cache " + operation + " error: database is not open");
        return false;
    }
}
}