diff --git a/pom.xml b/pom.xml index 2911c0e..51e2be6 100644 --- a/pom.xml +++ b/pom.xml @@ -14,8 +14,8 @@ UTF-8 - 11 - 11 + 21 + 21 2.17.0 4.9.0-0 @@ -55,6 +55,11 @@ opencv ${opencv.version} + + com.microsoft.playwright + playwright + 1.40.0 + @@ -65,22 +70,22 @@ maven-compiler-plugin 3.11.0 - 11 - 11 + 21 + 21 - + org.apache.maven.plugins diff --git a/src/main/java/com/auction/Main.java b/src/main/java/com/auction/Main.java index fa2d283..6a0091f 100644 --- a/src/main/java/com/auction/Main.java +++ b/src/main/java/com/auction/Main.java @@ -2,6 +2,13 @@ package com.auction; public class Main { public static void main(String[] args) { + // If arguments are passed, this is likely a one-off command via dokku run + // Just exit immediately to allow the command to run + if (args.length > 0) { + System.out.println("Command mode - exiting to allow shell commands"); + return; + } + System.out.println("Starting Troostwijk Auction Scraper..."); System.out.println("Container is running and healthy."); diff --git a/src/main/java/com/auction/TroostwijkAuctionExtractor.java b/src/main/java/com/auction/TroostwijkAuctionExtractor.java new file mode 100644 index 0000000..ab65248 --- /dev/null +++ b/src/main/java/com/auction/TroostwijkAuctionExtractor.java @@ -0,0 +1,563 @@ +package com.auction; + +import com.microsoft.playwright.*; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.microsoft.playwright.options.WaitUntilState; + +import java.io.IOException; +import java.nio.file.*; +import java.sql.*; +import java.time.Instant; +import java.util.*; + +/** + * TroostwijkAuctionExtractor + * + * Extracts auction listings from https://www.troostwijkauctions.com/auctions + * using Playwright for Java (headless browser automation). + * + * Features: + * - Uses Playwright for Java to load JavaScript-rendered content + * - Iterates through all pages of auction listings + * - Rate limiting: 200ms between each page request + * - Caches visited pages in SQLite database with expiration times + * - Extracts auction metadata: ID, title, location, URL + * + * Dependencies (Maven): + * + * com.microsoft.playwright + * playwright + * 1.40.0 + * + * + * com.fasterxml.jackson.core + * jackson-databind + * 2.17.0 + * + * + * org.xerial + * sqlite-jdbc + * 3.45.1.0 + * + * + * After adding dependency, run: mvn exec:java -e -D exec.mainClass=com.microsoft.playwright.CLI -D exec.args="install" + * This downloads the browser binaries needed by Playwright. + */ +public class TroostwijkAuctionExtractor { + + private static final String AUCTIONS_BASE_URL = "https://www.troostwijkauctions.com/auctions"; + private static final int RATE_LIMIT_MS = 200; + private static final String CACHE_DB_PATH = "cache/page_cache.db"; + private static final long CACHE_EXPIRATION_HOURS = 24; // Cache expires after 24 hours + + private final ObjectMapper objectMapper; + private final boolean useCache; + private final CacheDatabase cacheDb; + private Playwright playwright; + private Browser browser; + + /** + * Represents an auction listing + */ + public static class Auction { + public int id; + public String title; + public String location; + public String url; + public String type; // e.g. "A1" or "A7" + + @Override + public String toString() { + return String.format("Auction{id=%d, type=%s, title='%s', location='%s', url='%s'}", + id, type, title, location, url); + } + } + + /** + * Constructor + * + * @param useCache Enable database caching of visited pages + */ + public TroostwijkAuctionExtractor(boolean useCache) throws SQLException, IOException { + this.objectMapper = new ObjectMapper(); + this.useCache = useCache; + this.cacheDb = useCache ? new CacheDatabase(CACHE_DB_PATH) : null; + + if (useCache) { + cacheDb.initialize(); + } + } + + /** + * Initializes Playwright and browser instance + * Call this before extracting auctions + */ + public void initialize() { + System.out.println("Initializing Playwright browser..."); + this.playwright = Playwright.create(); + this.browser = playwright.chromium().launch(new BrowserType.LaunchOptions() + .setHeadless(true) + .setArgs(Arrays.asList("--no-sandbox", "--disable-setuid-sandbox"))); + System.out.println("✓ Browser ready"); + } + + /** + * Closes browser and Playwright instance + * Call this when done extracting + */ + public void close() { + if (browser != null) { + browser.close(); + } + if (playwright != null) { + playwright.close(); + } + if (cacheDb != null) { + cacheDb.close(); + } + System.out.println("✓ Browser and cache closed"); + } + + /** + * Extracts all auctions from all pages + * + * @return List of all discovered auctions + */ + public List extractAllAuctions() throws InterruptedException { + if (browser == null) { + throw new IllegalStateException("Browser not initialized. Call initialize() first."); + } + + List allAuctions = new ArrayList<>(); + int pageNumber = 1; + boolean hasMorePages = true; + + System.out.println("Starting auction extraction from " + AUCTIONS_BASE_URL); + + while (hasMorePages) { + System.out.println("\n[Page " + pageNumber + "] Fetching auctions..."); + + // Check cache first + String cachedHtml = loadFromCache(pageNumber); + String html; + + if (cachedHtml != null) { + System.out.println(" ✓ Loaded from cache"); + html = cachedHtml; + } else { + // Fetch with Playwright + html = fetchPageWithPlaywright(pageNumber); + + if (html == null || html.isEmpty()) { + System.out.println(" ⚠️ Failed to fetch page, stopping pagination"); + break; + } + + // Save to cache + if (useCache) { + saveToCache(pageNumber, html); + } + + // Rate limiting - wait 200ms before next request + Thread.sleep(RATE_LIMIT_MS); + } + + // Parse auctions from HTML + List pageAuctions = parseAuctionsFromHtml(html); + + if (pageAuctions.isEmpty()) { + System.out.println(" ⚠️ No auctions found on page, stopping pagination"); + hasMorePages = false; + } else { + System.out.println(" ✓ Found " + pageAuctions.size() + " auctions"); + allAuctions.addAll(pageAuctions); + pageNumber++; + } + } + + System.out.println("\n✓ Total auctions extracted: " + allAuctions.size()); + return allAuctions; + } + + /** + * Fetches a single page using Playwright + * + * @param pageNumber Page number (1-indexed) + * @return HTML content of the page + */ + private String fetchPageWithPlaywright(int pageNumber) { + String url = pageNumber == 1 + ? AUCTIONS_BASE_URL + : AUCTIONS_BASE_URL + "?page=" + pageNumber; + + try { + Page page = browser.newPage(); + + // Set user agent + page.setExtraHTTPHeaders(Map.of( + "User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + )); + + // Navigate to page + page.navigate(url, new Page.NavigateOptions() + .setTimeout(30000) + .setWaitUntil(WaitUntilState.NETWORKIDLE)); + + // Wait for auction listings to appear + try { + page.waitForSelector("a[href^='/a/']", new Page.WaitForSelectorOptions() + .setTimeout(10000)); + } catch (Exception e) { + // Continue even if selector not found + System.out.println(" ⚠️ Auction selector not found, attempting to parse anyway"); + } + + // Get HTML content + String html = page.content(); + page.close(); + + return html; + + } catch (Exception e) { + System.err.println(" ⚠️ Playwright error: " + e.getMessage()); + return null; + } + } + + /** + * Parses auction data from HTML content + * + * @param html HTML content + * @return List of parsed auctions + */ + private List parseAuctionsFromHtml(String html) { + List auctions = new ArrayList<>(); + + // Simple regex-based parsing for auction links + // Format: seenIds = new HashSet<>(); + + while (linkMatcher.find()) { + String href = linkMatcher.group(1); + int auctionId = Integer.parseInt(linkMatcher.group(2)); + + // Avoid duplicates + if (seenIds.contains(auctionId)) { + continue; + } + + // Extract auction type (A1 or A7) + String type = href.contains("A1-") ? "A1" : "A7"; + + // Try to find location and title near this link + String location = extractLocationNearLink(html, href); + String title = extractTitleFromHref(href); + + Auction auction = new Auction(); + auction.id = auctionId; + auction.type = type; + auction.title = title; + auction.location = location; + auction.url = "https://www.troostwijkauctions.com" + href; + + auctions.add(auction); + seenIds.add(auctionId); + } + + return auctions; + } + + /** + * Extracts location text near an auction link + * Looks for ", NL" or other country codes + */ + private String extractLocationNearLink(String html, String href) { + int hrefPos = html.indexOf(href); + if (hrefPos == -1) return "Unknown"; + + // Look at 500 characters after the href for location info + int endPos = Math.min(hrefPos + 500, html.length()); + String context = html.substring(hrefPos, endPos); + + // Try to find location pattern like "City, NL" or "City, Country" + java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile( + "([A-Za-z\\s]+),\\s*([A-Z]{2})"); + java.util.regex.Matcher locMatcher = locPattern.matcher(context); + + if (locMatcher.find()) { + return locMatcher.group(1).trim() + ", " + locMatcher.group(2); + } + + return "Unknown"; + } + + /** + * Extracts human-readable title from URL slug + * Converts "some-auction-title-A1-12345" to "Some Auction Title" + */ + private String extractTitleFromHref(String href) { + // Extract everything between "/a/" and "-A1-" or "-A7-" + java.util.regex.Pattern titlePattern = java.util.regex.Pattern.compile( + "/a/(.+?)-A[17]-"); + java.util.regex.Matcher titleMatcher = titlePattern.matcher(href); + + if (titleMatcher.find()) { + String slug = titleMatcher.group(1); + // Convert kebab-case to Title Case + String[] words = slug.split("-"); + StringBuilder title = new StringBuilder(); + for (String word : words) { + if (!word.isEmpty()) { + title.append(Character.toUpperCase(word.charAt(0))) + .append(word.substring(1)) + .append(" "); + } + } + return title.toString().trim(); + } + + return "Untitled Auction"; + } + + /** + * Loads cached HTML for a page from SQLite database + * Returns null if not cached or cache has expired + * + * @param pageNumber Page number + * @return Cached HTML or null if not found/expired + */ + private String loadFromCache(int pageNumber) { + if (!useCache || cacheDb == null) return null; + + String url = pageNumber == 1 + ? AUCTIONS_BASE_URL + : AUCTIONS_BASE_URL + "?page=" + pageNumber; + + return cacheDb.get(url); + } + + /** + * Saves HTML to SQLite cache database with expiration time + * + * @param pageNumber Page number + * @param html HTML content + */ + private void saveToCache(int pageNumber, String html) { + if (!useCache || cacheDb == null) return; + + String url = pageNumber == 1 + ? AUCTIONS_BASE_URL + : AUCTIONS_BASE_URL + "?page=" + pageNumber; + + cacheDb.put(url, html, CACHE_EXPIRATION_HOURS); + } + + /** + * Filters auctions by location + * + * @param auctions List of auctions + * @param locationFilter Location string to match (e.g., "NL") + * @return Filtered list + */ + public static List filterByLocation(List auctions, String locationFilter) { + return auctions.stream() + .filter(a -> a.location.contains(locationFilter)) + .toList(); + } + + /** + * Entry point for testing + */ + public static void main(String[] args) throws Exception { + System.out.println("=== Troostwijk Auction Extractor ===\n"); + + // Enable caching by default + boolean useCache = true; + TroostwijkAuctionExtractor extractor = new TroostwijkAuctionExtractor(useCache); + + try { + // Initialize browser + extractor.initialize(); + + // Extract all auctions + List allAuctions = extractor.extractAllAuctions(); + + // Filter for Dutch auctions only + List dutchAuctions = filterByLocation(allAuctions, "NL"); + + System.out.println("\n=== Results ==="); + System.out.println("Total auctions found: " + allAuctions.size()); + System.out.println("Dutch auctions (NL): " + dutchAuctions.size()); + + // Display first 10 Dutch auctions + System.out.println("\n=== Sample Dutch Auctions ==="); + dutchAuctions.stream() + .limit(10) + .forEach(System.out::println); + + } finally { + // Always close browser + extractor.close(); + } + } + + /** + * SQLite-based caching system for HTML pages with expiration support + */ + static class CacheDatabase { + private final String dbPath; + private Connection connection; + + public CacheDatabase(String dbPath) { + this.dbPath = dbPath; + } + + /** + * Initialize database and create schema + */ + public void initialize() throws SQLException, IOException { + // Create cache directory if it doesn't exist + Path cacheDir = Paths.get(dbPath).getParent(); + if (cacheDir != null) { + Files.createDirectories(cacheDir); + } + + connection = DriverManager.getConnection("jdbc:sqlite:" + dbPath); + + // Create cache table with URL as primary key + String createTable = """ + CREATE TABLE IF NOT EXISTS page_cache ( + url TEXT PRIMARY KEY, + html TEXT NOT NULL, + cached_at INTEGER NOT NULL, + expires_at INTEGER NOT NULL + ) + """; + + try (Statement stmt = connection.createStatement()) { + stmt.execute(createTable); + // Create index on expires_at for efficient cleanup + stmt.execute("CREATE INDEX IF NOT EXISTS idx_expires_at ON page_cache(expires_at)"); + } + + // Clean up expired entries on initialization + cleanupExpired(); + + System.out.println("✓ Cache database initialized"); + } + + /** + * Get cached HTML for a URL if it exists and hasn't expired + * + * @param url The URL to look up + * @return Cached HTML or null if not found/expired + */ + public synchronized String get(String url) { + String sql = "SELECT html FROM page_cache WHERE url = ? AND expires_at > ?"; + + try (PreparedStatement ps = connection.prepareStatement(sql)) { + ps.setString(1, url); + ps.setLong(2, Instant.now().getEpochSecond()); + + ResultSet rs = ps.executeQuery(); + if (rs.next()) { + return rs.getString("html"); + } + } catch (SQLException e) { + System.err.println("Cache read error: " + e.getMessage()); + } + + return null; + } + + /** + * Store HTML in cache with expiration time + * + * @param url The URL to cache + * @param html The HTML content + * @param expirationHours Hours until cache expires + */ + public synchronized void put(String url, String html, long expirationHours) { + String sql = """ + INSERT OR REPLACE INTO page_cache (url, html, cached_at, expires_at) + VALUES (?, ?, ?, ?) + """; + + long now = Instant.now().getEpochSecond(); + long expiresAt = now + (expirationHours * 3600); + + try (PreparedStatement ps = connection.prepareStatement(sql)) { + ps.setString(1, url); + ps.setString(2, html); + ps.setLong(3, now); + ps.setLong(4, expiresAt); + ps.executeUpdate(); + } catch (SQLException e) { + System.err.println("Cache write error: " + e.getMessage()); + } + } + + /** + * Remove expired cache entries + */ + public synchronized void cleanupExpired() { + String sql = "DELETE FROM page_cache WHERE expires_at <= ?"; + + try (PreparedStatement ps = connection.prepareStatement(sql)) { + ps.setLong(1, Instant.now().getEpochSecond()); + int deleted = ps.executeUpdate(); + if (deleted > 0) { + System.out.println("✓ Cleaned up " + deleted + " expired cache entries"); + } + } catch (SQLException e) { + System.err.println("Cache cleanup error: " + e.getMessage()); + } + } + + /** + * Get cache statistics + */ + public synchronized void printStats() { + String sql = "SELECT COUNT(*) as total, " + + "SUM(CASE WHEN expires_at > ? THEN 1 ELSE 0 END) as valid, " + + "SUM(LENGTH(html)) as total_size " + + "FROM page_cache"; + + try (PreparedStatement ps = connection.prepareStatement(sql)) { + ps.setLong(1, Instant.now().getEpochSecond()); + ResultSet rs = ps.executeQuery(); + + if (rs.next()) { + int total = rs.getInt("total"); + int valid = rs.getInt("valid"); + long size = rs.getLong("total_size"); + + System.out.println("\n=== Cache Statistics ==="); + System.out.println("Total entries: " + total); + System.out.println("Valid entries: " + valid); + System.out.println("Expired entries: " + (total - valid)); + System.out.println("Total size: " + (size / 1024) + " KB"); + } + } catch (SQLException e) { + System.err.println("Cache stats error: " + e.getMessage()); + } + } + + /** + * Close database connection + */ + public void close() { + if (connection != null) { + try { + connection.close(); + } catch (SQLException e) { + System.err.println("Error closing cache database: " + e.getMessage()); + } + } + } + } +}