package com.auction; import com.microsoft.playwright.*; import com.fasterxml.jackson.databind.ObjectMapper; import com.microsoft.playwright.options.WaitUntilState; import java.io.IOException; import java.nio.file.*; import java.sql.*; import java.time.Instant; import java.util.*; /** * TroostwijkAuctionExtractor * * Extracts auction listings from https://www.troostwijkauctions.com/auctions * using Playwright for Java (headless browser automation). * * Features: * - Uses Playwright for Java to load JavaScript-rendered content * - Iterates through all pages of auction listings * - Rate limiting: 200ms between each page request * - Caches visited pages in SQLite database with expiration times * - Extracts auction metadata: ID, title, location, URL * * Dependencies (Maven): * * com.microsoft.playwright * playwright * 1.40.0 * * * com.fasterxml.jackson.core * jackson-databind * 2.17.0 * * * org.xerial * sqlite-jdbc * 3.45.1.0 * * * After adding dependency, run: mvn exec:java -e -D exec.mainClass=com.microsoft.playwright.CLI -D exec.args="install" * This downloads the browser binaries needed by Playwright. */ public class TroostwijkAuctionExtractor { private static final String AUCTIONS_BASE_URL = "https://www.troostwijkauctions.com/auctions"; private static final int RATE_LIMIT_MS = 200; private static final String CACHE_DB_PATH = "cache/page_cache.db"; private static final long CACHE_EXPIRATION_HOURS = 24; // Cache expires after 24 hours private final ObjectMapper objectMapper; private final boolean useCache; private final CacheDatabase cacheDb; private final int maxPageVisits; // Maximum number of pages to fetch (0 = unlimited) private int pageVisitCount; // Counter for actual page fetches (not from cache) private Playwright playwright; private Browser browser; /** * Represents an auction listing */ public static class Auction { public int id; public String title; public String location; public String url; public String type; // e.g. "A1" or "A7" @Override public String toString() { return String.format("Auction{id=%d, type=%s, title='%s', location='%s', url='%s'}", id, type, title, location, url); } } /** * Constructor * * @param useCache Enable database caching of visited pages * @param maxPageVisits Maximum number of actual page fetches (0 = unlimited) */ public TroostwijkAuctionExtractor(boolean useCache, int maxPageVisits) throws SQLException, IOException { this.objectMapper = new ObjectMapper(); this.useCache = useCache; this.maxPageVisits = maxPageVisits; this.pageVisitCount = 0; this.cacheDb = useCache ? new CacheDatabase(CACHE_DB_PATH) : null; if (useCache) { cacheDb.initialize(); } } /** * Constructor with default unlimited page visits * * @param useCache Enable database caching of visited pages */ public TroostwijkAuctionExtractor(boolean useCache) throws SQLException, IOException { this(useCache, 0); // 0 = unlimited } /** * Initializes Playwright and browser instance * Call this before extracting auctions */ public void initialize() { System.out.println("Initializing Playwright browser..."); this.playwright = Playwright.create(); this.browser = playwright.chromium().launch(new BrowserType.LaunchOptions() .setHeadless(true) .setArgs(Arrays.asList("--no-sandbox", "--disable-setuid-sandbox"))); System.out.println("✓ Browser ready"); } /** * Closes browser and Playwright instance * Call this when done extracting */ public void close() { if (browser != null) { browser.close(); } if (playwright != null) { playwright.close(); } if (cacheDb != null) { cacheDb.close(); } System.out.println("✓ Browser and cache closed"); } /** * Extracts all auctions from all pages * * @return List of all discovered auctions */ public List extractAllAuctions() throws InterruptedException { if (browser == null) { throw new IllegalStateException("Browser not initialized. Call initialize() first."); } List allAuctions = new ArrayList<>(); int pageNumber = 1; boolean hasMorePages = true; System.out.println("Starting auction extraction from " + AUCTIONS_BASE_URL); while (hasMorePages) { System.out.println("\n[Page " + pageNumber + "] Fetching auctions..."); // Check cache first String cachedHtml = loadFromCache(pageNumber); String html; if (cachedHtml != null) { System.out.println(" ✓ Loaded from cache"); html = cachedHtml; } else { // Check if we've reached the maximum page visit limit if (maxPageVisits > 0 && pageVisitCount >= maxPageVisits) { System.out.println(" ⚠️ Reached maximum page visit limit (" + maxPageVisits + "), stopping"); break; } // Fetch with Playwright html = fetchPageWithPlaywright(pageNumber); pageVisitCount++; // Increment actual page fetch counter if (html == null || html.isEmpty()) { System.out.println(" ⚠️ Failed to fetch page, stopping pagination"); break; } System.out.println(" ✓ Fetched from website (visit " + pageVisitCount + (maxPageVisits > 0 ? "/" + maxPageVisits : "") + ")"); // Save to cache if (useCache) { saveToCache(pageNumber, html); } // Rate limiting - wait 200ms before next request Thread.sleep(RATE_LIMIT_MS); } // Parse auctions from HTML List pageAuctions = parseAuctionsFromHtml(html); if (pageAuctions.isEmpty()) { System.out.println(" ⚠️ No auctions found on page, stopping pagination"); hasMorePages = false; } else { System.out.println(" ✓ Found " + pageAuctions.size() + " auctions"); allAuctions.addAll(pageAuctions); pageNumber++; } } System.out.println("\n✓ Total auctions extracted: " + allAuctions.size()); return allAuctions; } /** * Fetches a single page using Playwright * * @param pageNumber Page number (1-indexed) * @return HTML content of the page */ private String fetchPageWithPlaywright(int pageNumber) { String url = pageNumber == 1 ? AUCTIONS_BASE_URL : AUCTIONS_BASE_URL + "?page=" + pageNumber; try { Page page = browser.newPage(); // Set user agent page.setExtraHTTPHeaders(Map.of( "User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" )); // Navigate to page page.navigate(url, new Page.NavigateOptions() .setTimeout(30000) .setWaitUntil(WaitUntilState.NETWORKIDLE)); // Wait for auction listings to appear try { page.waitForSelector("a[href^='/a/']", new Page.WaitForSelectorOptions() .setTimeout(10000)); } catch (Exception e) { // Continue even if selector not found System.out.println(" ⚠️ Auction selector not found, attempting to parse anyway"); } // Get HTML content String html = page.content(); page.close(); return html; } catch (Exception e) { System.err.println(" ⚠️ Playwright error: " + e.getMessage()); return null; } } /** * Parses auction data from HTML content * * @param html HTML content * @return List of parsed auctions */ private List parseAuctionsFromHtml(String html) { List auctions = new ArrayList<>(); // Simple regex-based parsing for auction links // Format: seenIds = new HashSet<>(); while (linkMatcher.find()) { String href = linkMatcher.group(1); int auctionId = Integer.parseInt(linkMatcher.group(2)); // Avoid duplicates if (seenIds.contains(auctionId)) { continue; } // Extract auction type (A1 or A7) String type = href.contains("A1-") ? "A1" : "A7"; // Try to find location and title near this link String location = extractLocationNearLink(html, href); String title = extractTitleFromHref(href); Auction auction = new Auction(); auction.id = auctionId; auction.type = type; auction.title = title; auction.location = location; auction.url = "https://www.troostwijkauctions.com" + href; auctions.add(auction); seenIds.add(auctionId); } return auctions; } /** * Extracts location text near an auction link * Looks for ", NL" or other country codes */ private String extractLocationNearLink(String html, String href) { int hrefPos = html.indexOf(href); if (hrefPos == -1) return "Unknown"; // Look at 1000 characters before AND after the href for location info int startPos = Math.max(hrefPos - 500, 0); int endPos = Math.min(hrefPos + 1000, html.length()); String context = html.substring(startPos, endPos); // Pattern 1: Classic format "City, NL" java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile( "([A-Za-z][A-Za-z\\s\\-']+),\\s*([A-Z]{2})(?![A-Za-z])"); java.util.regex.Matcher locMatcher = locPattern.matcher(context); if (locMatcher.find()) { String location = locMatcher.group(1).trim() + ", " + locMatcher.group(2); System.out.println(" Found location: " + location + " for auction " + href); return location; } // Pattern 2: HTML format like "City, NL" // Extract city and country code separately java.util.regex.Pattern htmlPattern = java.util.regex.Pattern.compile( "]*>([A-Za-z][A-Za-z\\s\\-',]+?)(?:,)?\\s*(?:)?\\s*\\s*([A-Z]{2})(?![A-Za-z])"); java.util.regex.Matcher htmlMatcher = htmlPattern.matcher(context); if (htmlMatcher.find()) { String city = htmlMatcher.group(1).trim().replaceAll(",$", ""); // Remove trailing comma String country = htmlMatcher.group(2); String location = city + ", " + country; System.out.println(" Found location (HTML): " + location + " for auction " + href); return location; } // Pattern 3: Fallback - just find country code after HTML tags java.util.regex.Pattern countryPattern = java.util.regex.Pattern.compile( "(?:-->||)\\s*([A-Z]{2})(?![A-Za-z])"); java.util.regex.Matcher countryMatcher = countryPattern.matcher(context); if (countryMatcher.find()) { String country = countryMatcher.group(1); System.out.println(" Found country code: " + country + " for auction " + href); return "Unknown, " + country; } System.out.println(" ⚠️ No location found for auction " + href); return "Unknown"; } /** * Extracts human-readable title from URL slug * Converts "some-auction-title-A1-12345" to "Some Auction Title" */ private String extractTitleFromHref(String href) { // Extract everything between "/a/" and "-A1-" or "-A7-" java.util.regex.Pattern titlePattern = java.util.regex.Pattern.compile( "/a/(.+?)-A[17]-"); java.util.regex.Matcher titleMatcher = titlePattern.matcher(href); if (titleMatcher.find()) { String slug = titleMatcher.group(1); // Convert kebab-case to Title Case String[] words = slug.split("-"); StringBuilder title = new StringBuilder(); for (String word : words) { if (!word.isEmpty()) { title.append(Character.toUpperCase(word.charAt(0))) .append(word.substring(1)) .append(" "); } } return title.toString().trim(); } return "Untitled Auction"; } /** * Loads cached HTML for a page from SQLite database * Returns null if not cached or cache has expired * * @param pageNumber Page number * @return Cached HTML or null if not found/expired */ private String loadFromCache(int pageNumber) { if (!useCache || cacheDb == null) return null; String url = pageNumber == 1 ? AUCTIONS_BASE_URL : AUCTIONS_BASE_URL + "?page=" + pageNumber; return cacheDb.get(url); } /** * Saves HTML to SQLite cache database with expiration time * * @param pageNumber Page number * @param html HTML content */ private void saveToCache(int pageNumber, String html) { if (!useCache || cacheDb == null) return; String url = pageNumber == 1 ? AUCTIONS_BASE_URL : AUCTIONS_BASE_URL + "?page=" + pageNumber; cacheDb.put(url, html, CACHE_EXPIRATION_HOURS); } /** * Filters auctions by location * * @param auctions List of auctions * @param locationFilter Location string to match (e.g., "NL") * @return Filtered list */ public static List filterByLocation(List auctions, String locationFilter) { return auctions.stream() .filter(a -> a.location.contains(locationFilter)) .toList(); } /** * Entry point for testing * * Arguments: * --max-visits : Maximum number of page visits (0 = unlimited, default) * --no-cache : Disable caching */ public static void main(String[] args) throws Exception { System.out.println("=== Troostwijk Auction Extractor ===\n"); // Parse command line arguments boolean useCache = true; int maxVisits = 0; // 0 = unlimited for (int i = 0; i < args.length; i++) { switch (args[i]) { case "--max-visits": if (i + 1 < args.length) { maxVisits = Integer.parseInt(args[++i]); System.out.println("Max page visits set to: " + maxVisits); } break; case "--no-cache": useCache = false; System.out.println("Caching disabled"); break; case "--help": System.out.println("Usage: java TroostwijkAuctionExtractor [options]"); System.out.println("Options:"); System.out.println(" --max-visits : Limit actual page fetches to n (0 = unlimited)"); System.out.println(" --no-cache : Disable page caching"); System.out.println(" --help : Show this help message"); return; } } TroostwijkAuctionExtractor extractor = new TroostwijkAuctionExtractor(useCache, maxVisits); try { // Initialize browser extractor.initialize(); // Extract all auctions List allAuctions = extractor.extractAllAuctions(); // Filter for Dutch auctions only List dutchAuctions = filterByLocation(allAuctions, "NL"); System.out.println("\n=== Results ==="); System.out.println("Total auctions found: " + allAuctions.size()); System.out.println("Dutch auctions (NL): " + dutchAuctions.size()); System.out.println("Actual page visits: " + extractor.pageVisitCount); // Display first 10 Dutch auctions System.out.println("\n=== Sample Dutch Auctions ==="); dutchAuctions.stream() .limit(10) .forEach(System.out::println); } finally { // Always close browser extractor.close(); } } /** * SQLite-based caching system for HTML pages with expiration support */ static class CacheDatabase { private final String dbPath; private Connection connection; public CacheDatabase(String dbPath) { this.dbPath = dbPath; } /** * Initialize database and create schema */ public void initialize() throws SQLException, IOException { // Create cache directory if it doesn't exist Path cacheDir = Paths.get(dbPath).getParent(); if (cacheDir != null) { Files.createDirectories(cacheDir); } connection = DriverManager.getConnection("jdbc:sqlite:" + dbPath); // Create cache table with URL as primary key String createTable = """ CREATE TABLE IF NOT EXISTS page_cache ( url TEXT PRIMARY KEY, html TEXT NOT NULL, cached_at INTEGER NOT NULL, expires_at INTEGER NOT NULL ) """; try (Statement stmt = connection.createStatement()) { stmt.execute(createTable); // Create index on expires_at for efficient cleanup stmt.execute("CREATE INDEX IF NOT EXISTS idx_expires_at ON page_cache(expires_at)"); } // Clean up expired entries on initialization cleanupExpired(); System.out.println("✓ Cache database initialized"); } /** * Get cached HTML for a URL if it exists and hasn't expired * * @param url The URL to look up * @return Cached HTML or null if not found/expired */ public synchronized String get(String url) { String sql = "SELECT html FROM page_cache WHERE url = ? AND expires_at > ?"; try (PreparedStatement ps = connection.prepareStatement(sql)) { ps.setString(1, url); ps.setLong(2, Instant.now().getEpochSecond()); ResultSet rs = ps.executeQuery(); if (rs.next()) { return rs.getString("html"); } } catch (SQLException e) { System.err.println("Cache read error: " + e.getMessage()); } return null; } /** * Store HTML in cache with expiration time * * @param url The URL to cache * @param html The HTML content * @param expirationHours Hours until cache expires */ public synchronized void put(String url, String html, long expirationHours) { String sql = """ INSERT OR REPLACE INTO page_cache (url, html, cached_at, expires_at) VALUES (?, ?, ?, ?) """; long now = Instant.now().getEpochSecond(); long expiresAt = now + (expirationHours * 3600); try (PreparedStatement ps = connection.prepareStatement(sql)) { ps.setString(1, url); ps.setString(2, html); ps.setLong(3, now); ps.setLong(4, expiresAt); ps.executeUpdate(); } catch (SQLException e) { System.err.println("Cache write error: " + e.getMessage()); } } /** * Remove expired cache entries */ public synchronized void cleanupExpired() { String sql = "DELETE FROM page_cache WHERE expires_at <= ?"; try (PreparedStatement ps = connection.prepareStatement(sql)) { ps.setLong(1, Instant.now().getEpochSecond()); int deleted = ps.executeUpdate(); if (deleted > 0) { System.out.println("✓ Cleaned up " + deleted + " expired cache entries"); } } catch (SQLException e) { System.err.println("Cache cleanup error: " + e.getMessage()); } } /** * Get cache statistics */ public synchronized void printStats() { String sql = "SELECT COUNT(*) as total, " + "SUM(CASE WHEN expires_at > ? THEN 1 ELSE 0 END) as valid, " + "SUM(LENGTH(html)) as total_size " + "FROM page_cache"; try (PreparedStatement ps = connection.prepareStatement(sql)) { ps.setLong(1, Instant.now().getEpochSecond()); ResultSet rs = ps.executeQuery(); if (rs.next()) { int total = rs.getInt("total"); int valid = rs.getInt("valid"); long size = rs.getLong("total_size"); System.out.println("\n=== Cache Statistics ==="); System.out.println("Total entries: " + total); System.out.println("Valid entries: " + valid); System.out.println("Expired entries: " + (total - valid)); System.out.println("Total size: " + (size / 1024) + " KB"); } } catch (SQLException e) { System.err.println("Cache stats error: " + e.getMessage()); } } /** * Close database connection */ public void close() { if (connection != null) { try { connection.close(); } catch (SQLException e) { System.err.println("Error closing cache database: " + e.getMessage()); } } } } }