This commit is contained in:
2025-11-28 05:05:33 +01:00
parent c26264b92a
commit ec2efd4661
3 changed files with 591 additions and 16 deletions

37
pom.xml
View File

@@ -14,8 +14,8 @@
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>11</maven.compiler.source> <maven.compiler.source>21</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target> <maven.compiler.target>21</maven.compiler.target>
<jackson.version>2.17.0</jackson.version> <jackson.version>2.17.0</jackson.version>
<opencv.version>4.9.0-0</opencv.version> <opencv.version>4.9.0-0</opencv.version>
</properties> </properties>
@@ -55,6 +55,11 @@
<artifactId>opencv</artifactId> <artifactId>opencv</artifactId>
<version>${opencv.version}</version> <version>${opencv.version}</version>
</dependency> </dependency>
<dependency>
<groupId>com.microsoft.playwright</groupId>
<artifactId>playwright</artifactId>
<version>1.40.0</version>
</dependency>
</dependencies> </dependencies>
<build> <build>
@@ -65,22 +70,22 @@
<artifactId>maven-compiler-plugin</artifactId> <artifactId>maven-compiler-plugin</artifactId>
<version>3.11.0</version> <version>3.11.0</version>
<configuration> <configuration>
<source>11</source> <source>21</source>
<target>11</target> <target>21</target>
</configuration> </configuration>
</plugin> </plugin>
<!-- <plugin> <!-- <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId> <artifactId>maven-jar-plugin</artifactId>
<version>3.2.0</version> <version>3.2.0</version>
<configuration> <configuration>
<archive> <archive>
<manifest> <manifest>
<mainClass>com.auction.Main</mainClass> <mainClass>com.auction.Main</mainClass>
</manifest> </manifest>
</archive> </archive>
</configuration> </configuration>
</plugin>--> </plugin>-->
<!-- Maven Assembly Plugin for creating executable JAR with dependencies --> <!-- Maven Assembly Plugin for creating executable JAR with dependencies -->
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>

View File

@@ -2,6 +2,13 @@ package com.auction;
public class Main { public class Main {
public static void main(String[] args) { public static void main(String[] args) {
// If arguments are passed, this is likely a one-off command via dokku run
// Just exit immediately to allow the command to run
if (args.length > 0) {
System.out.println("Command mode - exiting to allow shell commands");
return;
}
System.out.println("Starting Troostwijk Auction Scraper..."); System.out.println("Starting Troostwijk Auction Scraper...");
System.out.println("Container is running and healthy."); System.out.println("Container is running and healthy.");

View File

@@ -0,0 +1,563 @@
package com.auction;
import com.microsoft.playwright.*;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.microsoft.playwright.options.WaitUntilState;
import java.io.IOException;
import java.nio.file.*;
import java.sql.*;
import java.time.Instant;
import java.util.*;
/**
* TroostwijkAuctionExtractor
*
* Extracts auction listings from https://www.troostwijkauctions.com/auctions
* using Playwright for Java (headless browser automation).
*
* Features:
* - Uses Playwright for Java to load JavaScript-rendered content
* - Iterates through all pages of auction listings
* - Rate limiting: pauses 200ms between page requests that hit the network (pages served from the cache are not throttled)
* - Caches visited pages in an SQLite database; entries expire after 24 hours
* - Extracts auction metadata: ID, type (A1/A7), title, location, URL
*
* Dependencies (Maven):
* <dependency>
* <groupId>com.microsoft.playwright</groupId>
* <artifactId>playwright</artifactId>
* <version>1.40.0</version>
* </dependency>
* <dependency>
* <groupId>com.fasterxml.jackson.core</groupId>
* <artifactId>jackson-databind</artifactId>
* <version>2.17.0</version>
* </dependency>
* <dependency>
* <groupId>org.xerial</groupId>
* <artifactId>sqlite-jdbc</artifactId>
* <version>3.45.1.0</version>
* </dependency>
*
* After adding dependency, run: mvn exec:java -e -D exec.mainClass=com.microsoft.playwright.CLI -D exec.args="install"
* This downloads the browser binaries needed by Playwright.
*/
public class TroostwijkAuctionExtractor {
// Listing page URL; page N > 1 is requested as this URL plus "?page=N".
private static final String AUCTIONS_BASE_URL = "https://www.troostwijkauctions.com/auctions";
// Pause, in milliseconds, applied after a page has been fetched with the browser.
private static final int RATE_LIMIT_MS = 200;
// SQLite file backing the page cache; its parent directory is created when the cache is initialized.
private static final String CACHE_DB_PATH = "cache/page_cache.db";
private static final long CACHE_EXPIRATION_HOURS = 24; // Cache expires after 24 hours
// Assigned in the constructor; not referenced by any other code visible in this class.
private final ObjectMapper objectMapper;
// When false, cacheDb is null and every page is fetched with the browser.
private final boolean useCache;
// Non-null only when useCache is true.
private final CacheDatabase cacheDb;
// Both stay null until initialize() is called.
private Playwright playwright;
private Browser browser;
/**
 * Mutable data holder for one auction listing discovered on a listing page.
 * Fields are public and are filled in directly by the parser.
 */
public static class Auction {
    public int id;
    public String title;
    public String location;
    public String url;
    public String type; // e.g. "A1" or "A7"

    /**
     * Renders every field in a single line, intended for console output.
     *
     * @return text of the form {@code Auction{id=..., type=..., title='...', location='...', url='...'}}
     */
    @Override
    public String toString() {
        return "Auction{id=%d, type=%s, title='%s', location='%s', url='%s'}"
            .formatted(id, type, title, location, url);
    }
}
/**
 * Creates an extractor, optionally backed by the SQLite page cache.
 * When caching is enabled the cache database is opened and initialized
 * immediately; otherwise no cache object is created at all.
 *
 * @param useCache Enable database caching of visited pages
 * @throws SQLException if the cache database cannot be opened or initialized
 * @throws IOException if the cache directory cannot be created
 */
public TroostwijkAuctionExtractor(boolean useCache) throws SQLException, IOException {
    this.objectMapper = new ObjectMapper();
    this.useCache = useCache;
    if (useCache) {
        this.cacheDb = new CacheDatabase(CACHE_DB_PATH);
        this.cacheDb.initialize();
    } else {
        this.cacheDb = null;
    }
}
/**
 * Starts Playwright and launches a headless Chromium browser.
 * Call this before extracting auctions.
 *
 * <p>If the browser cannot be launched, the Playwright instance created here
 * is closed again before the exception propagates, so a failed start does not
 * leave a half-initialized instance behind.
 *
 * @throws RuntimeException whatever Playwright throws when creation or launch fails
 */
public void initialize() {
    System.out.println("Initializing Playwright browser...");
    Playwright created = Playwright.create();
    try {
        this.browser = created.chromium().launch(new BrowserType.LaunchOptions()
            .setHeadless(true)
            // NOTE(review): sandbox flags are disabled, presumably so Chromium can run
            // inside a container as root - confirm this is still required.
            .setArgs(Arrays.asList("--no-sandbox", "--disable-setuid-sandbox")));
    } catch (RuntimeException e) {
        try {
            created.close();
        } catch (RuntimeException closeError) {
            e.addSuppressed(closeError);
        }
        throw e;
    }
    // Publish the instance only once the browser is usable.
    this.playwright = created;
    System.out.println("✓ Browser ready");
}
/**
 * Closes the browser, the Playwright instance and the cache database.
 * Call this when done extracting.
 *
 * <p>Each resource is closed even if closing an earlier one throws; the first
 * exception still propagates to the caller after the remaining resources
 * have been released.
 */
public void close() {
    try {
        if (browser != null) {
            browser.close();
        }
    } finally {
        try {
            if (playwright != null) {
                playwright.close();
            }
        } finally {
            if (cacheDb != null) {
                cacheDb.close();
            }
        }
    }
    System.out.println("✓ Browser and cache closed");
}
/**
 * Extracts all auctions from all listing pages.
 *
 * <p>Pages are processed in order starting at page 1. Each page is taken from
 * the cache when available, otherwise fetched with the browser. Pagination
 * stops when a page cannot be fetched, contains no auction links, or only
 * repeats auctions already collected. The last check keeps the loop finite
 * if the site serves the same listing for out-of-range page numbers.
 *
 * <p>A freshly fetched page is written to the cache only after it yielded at
 * least one new auction, so a page that failed to render its listings is not
 * served from the cache on later runs.
 *
 * @return List of all discovered auctions, each auction ID appearing once
 * @throws IllegalStateException if initialize() has not been called
 * @throws InterruptedException if the thread is interrupted during the rate-limit pause
 */
public List<Auction> extractAllAuctions() throws InterruptedException {
    if (browser == null) {
        throw new IllegalStateException("Browser not initialized. Call initialize() first.");
    }
    List<Auction> allAuctions = new ArrayList<>();
    Set<Integer> seenIds = new HashSet<>();
    int pageNumber = 1;
    System.out.println("Starting auction extraction from " + AUCTIONS_BASE_URL);
    while (true) {
        System.out.println("\n[Page " + pageNumber + "] Fetching auctions...");
        String html = loadFromCache(pageNumber);
        boolean fromCache = html != null;
        if (fromCache) {
            System.out.println(" ✓ Loaded from cache");
        } else {
            html = fetchPageWithPlaywright(pageNumber);
            if (html == null || html.isEmpty()) {
                System.out.println(" ⚠️ Failed to fetch page, stopping pagination");
                break;
            }
        }

        List<Auction> pageAuctions = parseAuctionsFromHtml(html);
        if (pageAuctions.isEmpty()) {
            System.out.println(" ⚠️ No auctions found on page, stopping pagination");
            break;
        }

        // Keep only auctions not seen on an earlier page.
        int added = 0;
        for (Auction auction : pageAuctions) {
            if (seenIds.add(auction.id)) {
                allAuctions.add(auction);
                added++;
            }
        }
        if (added == 0) {
            System.out.println(" ⚠️ Page only repeats auctions already seen, stopping pagination");
            break;
        }
        System.out.println(" ✓ Found " + added + " auctions");

        if (!fromCache) {
            // saveToCache is a no-op when caching is disabled.
            saveToCache(pageNumber, html);
            // Rate limiting - pause before the next network request.
            Thread.sleep(RATE_LIMIT_MS);
        }
        pageNumber++;
    }
    System.out.println("\n✓ Total auctions extracted: " + allAuctions.size());
    return allAuctions;
}
/**
 * Fetches a single listing page using Playwright.
 *
 * <p>The browser page opened for the request is always closed, including when
 * navigation or content retrieval fails.
 *
 * @param pageNumber Page number (1-indexed); page 1 is the base URL, later
 *                   pages append {@code ?page=N}
 * @return HTML content of the page, or null if the page could not be loaded
 */
private String fetchPageWithPlaywright(int pageNumber) {
    String url = pageNumber == 1
        ? AUCTIONS_BASE_URL
        : AUCTIONS_BASE_URL + "?page=" + pageNumber;
    Page page = null;
    try {
        page = browser.newPage();
        // Set user agent
        page.setExtraHTTPHeaders(Map.of(
            "User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        ));
        // Navigate to page
        page.navigate(url, new Page.NavigateOptions()
            .setTimeout(30000)
            .setWaitUntil(WaitUntilState.NETWORKIDLE));
        // Wait for auction listings to appear
        try {
            page.waitForSelector("a[href^='/a/']", new Page.WaitForSelectorOptions()
                .setTimeout(10000));
        } catch (Exception e) {
            // Best effort: the caller decides what to do with a page without listings.
            System.out.println(" ⚠️ Auction selector not found, attempting to parse anyway");
        }
        return page.content();
    } catch (Exception e) {
        System.err.println(" ⚠️ Playwright error: " + e.getMessage());
        return null;
    } finally {
        if (page != null) {
            try {
                page.close();
            } catch (Exception closeError) {
                // A failed close must not discard HTML that was already retrieved.
                System.err.println(" ⚠️ Failed to close page: " + closeError.getMessage());
            }
        }
    }
}
/**
 * Matches auction links such as {@code href="/a/title-A1-12345"}.
 * Group 1 is the whole href, group 2 the type (A1 or A7), group 3 the numeric ID.
 * Compiled once because the pattern never changes.
 */
private static final java.util.regex.Pattern AUCTION_LINK_PATTERN =
    java.util.regex.Pattern.compile("href=\"(/a/[^\"]+(A[17])-(\\d+)[^\"]*)\"");

/**
 * Parses auction data from HTML content.
 *
 * <p>Every distinct auction ID found in an auction link produces one entry.
 * The type is taken from the same part of the link as the ID, so a slug that
 * happens to contain "A1-" cannot cause a mismatch. Links whose ID does not
 * fit in an {@code int} are skipped instead of aborting the whole page.
 *
 * @param html HTML content
 * @return List of parsed auctions, at most one per auction ID
 */
private List<Auction> parseAuctionsFromHtml(String html) {
    List<Auction> auctions = new ArrayList<>();
    Set<Integer> seenIds = new HashSet<>();
    java.util.regex.Matcher linkMatcher = AUCTION_LINK_PATTERN.matcher(html);
    while (linkMatcher.find()) {
        String href = linkMatcher.group(1);
        String type = linkMatcher.group(2);
        int auctionId;
        try {
            auctionId = Integer.parseInt(linkMatcher.group(3));
        } catch (NumberFormatException e) {
            System.err.println(" ⚠️ Skipping link with out-of-range auction id: " + href);
            continue;
        }
        // Avoid duplicates: add() returns false when the ID was already present.
        if (!seenIds.add(auctionId)) {
            continue;
        }
        Auction auction = new Auction();
        auction.id = auctionId;
        auction.type = type;
        auction.title = extractTitleFromHref(href);
        // Try to find location text near this link
        auction.location = extractLocationNearLink(html, href);
        auction.url = "https://www.troostwijkauctions.com" + href;
        auctions.add(auction);
    }
    return auctions;
}
/**
 * Matches text such as "City, NL": letters and whitespace, a comma, then a
 * two-letter upper-case code. Uses Unicode letter classes so names such as
 * "Zürich" are captured whole rather than from the first ASCII letter.
 */
private static final java.util.regex.Pattern LOCATION_PATTERN =
    java.util.regex.Pattern.compile("([\\p{L}\\p{M}\\s]+),\\s*([A-Z]{2})");

/**
 * Extracts location text near an auction link.
 * Searches the 500 characters starting at the first occurrence of the href
 * for a "City, CC" pattern.
 *
 * <p>Static and package-private because it uses no instance state, which
 * allows it to be tested without a browser.
 *
 * @param html full page HTML
 * @param href href value of the auction link, exactly as it appears in the HTML
 * @return "City, CC" when a match is found, otherwise "Unknown"
 */
static String extractLocationNearLink(String html, String href) {
    int hrefPos = html.indexOf(href);
    if (hrefPos == -1) {
        return "Unknown";
    }
    // Look at 500 characters from the href onwards for location info
    int endPos = Math.min(hrefPos + 500, html.length());
    String context = html.substring(hrefPos, endPos);
    java.util.regex.Matcher locMatcher = LOCATION_PATTERN.matcher(context);
    if (locMatcher.find()) {
        return locMatcher.group(1).trim() + ", " + locMatcher.group(2);
    }
    return "Unknown";
}
/**
 * Captures the slug between "/a/" and the LAST "-A1-" or "-A7-" marker.
 * The greedy match keeps slugs that themselves contain such a marker intact.
 */
private static final java.util.regex.Pattern TITLE_SLUG_PATTERN =
    java.util.regex.Pattern.compile("/a/(.+)-A[17]-");

/**
 * Extracts a human-readable title from a URL slug.
 * Converts "/a/some-auction-title-A1-12345" to "Some Auction Title": the slug
 * is split on hyphens and the first character of each word is upper-cased.
 *
 * <p>Static and package-private because it uses no instance state, which
 * allows it to be tested without a browser.
 *
 * @param href auction link path
 * @return the title, or "Untitled Auction" when the href has no recognizable slug
 */
static String extractTitleFromHref(String href) {
    java.util.regex.Matcher titleMatcher = TITLE_SLUG_PATTERN.matcher(href);
    if (!titleMatcher.find()) {
        return "Untitled Auction";
    }
    // Convert kebab-case to Title Case
    StringBuilder title = new StringBuilder();
    for (String word : titleMatcher.group(1).split("-")) {
        if (!word.isEmpty()) {
            title.append(Character.toUpperCase(word.charAt(0)))
                .append(word.substring(1))
                .append(" ");
        }
    }
    return title.toString().trim();
}
/**
 * Looks up the cached HTML of a listing page in the SQLite cache.
 * The cache key is the page URL: the base URL for page 1, otherwise the base
 * URL followed by {@code ?page=N}.
 *
 * @param pageNumber Page number
 * @return Cached HTML, or null if caching is disabled or no unexpired entry exists
 */
private String loadFromCache(int pageNumber) {
    if (useCache && cacheDb != null) {
        String cacheKey = AUCTIONS_BASE_URL;
        if (pageNumber != 1) {
            cacheKey = AUCTIONS_BASE_URL + "?page=" + pageNumber;
        }
        return cacheDb.get(cacheKey);
    }
    return null;
}
/**
 * Stores the HTML of a listing page in the SQLite cache with the configured
 * expiration time. Does nothing when caching is disabled.
 * The cache key is the page URL: the base URL for page 1, otherwise the base
 * URL followed by {@code ?page=N}.
 *
 * @param pageNumber Page number
 * @param html HTML content
 */
private void saveToCache(int pageNumber, String html) {
    if (useCache && cacheDb != null) {
        String cacheKey = AUCTIONS_BASE_URL;
        if (pageNumber != 1) {
            cacheKey = AUCTIONS_BASE_URL + "?page=" + pageNumber;
        }
        cacheDb.put(cacheKey, html, CACHE_EXPIRATION_HOURS);
    }
}
/**
 * Filters auctions by location.
 *
 * <p>An auction is kept when its location text contains the filter string.
 * Null list elements and auctions whose location is null are skipped rather
 * than causing a NullPointerException, since {@code Auction.location} is a
 * public field that callers may leave unset.
 *
 * @param auctions List of auctions
 * @param locationFilter Location string to match (e.g., "NL")
 * @return Filtered, unmodifiable list in the original order
 */
public static List<Auction> filterByLocation(List<Auction> auctions, String locationFilter) {
    return auctions.stream()
        .filter(a -> a != null && a.location != null && a.location.contains(locationFilter))
        .toList();
}
/**
 * Manual test entry point: runs a full extraction with caching enabled,
 * prints the totals and lists up to ten auctions whose location contains "NL".
 *
 * @param args unused
 * @throws Exception if the extractor cannot be created or the extraction fails
 */
public static void main(String[] args) throws Exception {
    System.out.println("=== Troostwijk Auction Extractor ===\n");
    // Caching is switched on for this run.
    TroostwijkAuctionExtractor extractor = new TroostwijkAuctionExtractor(true);
    try {
        extractor.initialize();
        List<Auction> everything = extractor.extractAllAuctions();
        List<Auction> dutchOnly = filterByLocation(everything, "NL");

        System.out.println("\n=== Results ===");
        System.out.println("Total auctions found: " + everything.size());
        System.out.println("Dutch auctions (NL): " + dutchOnly.size());

        System.out.println("\n=== Sample Dutch Auctions ===");
        int sampleSize = Math.min(10, dutchOnly.size());
        for (Auction auction : dutchOnly.subList(0, sampleSize)) {
            System.out.println(auction);
        }
    } finally {
        // Release browser and cache even when extraction fails.
        extractor.close();
    }
}
/**
 * SQLite-based caching system for HTML pages with expiration support.
 *
 * <p>The cache is best effort: read, write, cleanup and statistics operations
 * log SQL errors to stderr and carry on instead of throwing. Operations called
 * before {@link #initialize()} or after {@link #close()} are skipped with a
 * log message. All operations on the connection are synchronized on this
 * instance.
 */
static class CacheDatabase {
    private final String dbPath;
    // Null until initialize() succeeds and again after close().
    private Connection connection;

    /**
     * @param dbPath path of the SQLite database file
     */
    public CacheDatabase(String dbPath) {
        this.dbPath = dbPath;
    }

    /**
     * Opens the database, creates the schema if needed and removes expired
     * entries. If schema creation fails the connection is closed again.
     *
     * @throws SQLException if the database cannot be opened or the schema cannot be created
     * @throws IOException if the cache directory cannot be created
     */
    public synchronized void initialize() throws SQLException, IOException {
        // Create cache directory if it doesn't exist
        Path cacheDir = Paths.get(dbPath).getParent();
        if (cacheDir != null) {
            Files.createDirectories(cacheDir);
        }
        Connection opened = DriverManager.getConnection("jdbc:sqlite:" + dbPath);
        // Create cache table with URL as primary key
        String createTable = """
            CREATE TABLE IF NOT EXISTS page_cache (
            url TEXT PRIMARY KEY,
            html TEXT NOT NULL,
            cached_at INTEGER NOT NULL,
            expires_at INTEGER NOT NULL
            )
            """;
        try (Statement stmt = opened.createStatement()) {
            stmt.execute(createTable);
            // Create index on expires_at for efficient cleanup
            stmt.execute("CREATE INDEX IF NOT EXISTS idx_expires_at ON page_cache(expires_at)");
        } catch (SQLException e) {
            try {
                opened.close();
            } catch (SQLException closeError) {
                e.addSuppressed(closeError);
            }
            throw e;
        }
        connection = opened;
        // Clean up expired entries on initialization
        cleanupExpired();
        System.out.println("✓ Cache database initialized");
    }

    /**
     * Get cached HTML for a URL if it exists and hasn't expired.
     *
     * @param url The URL to look up
     * @return Cached HTML or null if not found, expired, or the cache is unavailable
     */
    public synchronized String get(String url) {
        if (!isOpen("read")) {
            return null;
        }
        String sql = "SELECT html FROM page_cache WHERE url = ? AND expires_at > ?";
        try (PreparedStatement ps = connection.prepareStatement(sql)) {
            ps.setString(1, url);
            ps.setLong(2, Instant.now().getEpochSecond());
            try (ResultSet rs = ps.executeQuery()) {
                if (rs.next()) {
                    return rs.getString("html");
                }
            }
        } catch (SQLException e) {
            System.err.println("Cache read error: " + e.getMessage());
        }
        return null;
    }

    /**
     * Store HTML in cache with expiration time, replacing any existing entry
     * for the same URL.
     *
     * @param url The URL to cache
     * @param html The HTML content
     * @param expirationHours Hours until cache expires
     */
    public synchronized void put(String url, String html, long expirationHours) {
        if (!isOpen("write")) {
            return;
        }
        String sql = """
            INSERT OR REPLACE INTO page_cache (url, html, cached_at, expires_at)
            VALUES (?, ?, ?, ?)
            """;
        long now = Instant.now().getEpochSecond();
        long expiresAt = now + (expirationHours * 3600);
        try (PreparedStatement ps = connection.prepareStatement(sql)) {
            ps.setString(1, url);
            ps.setString(2, html);
            ps.setLong(3, now);
            ps.setLong(4, expiresAt);
            ps.executeUpdate();
        } catch (SQLException e) {
            System.err.println("Cache write error: " + e.getMessage());
        }
    }

    /**
     * Remove expired cache entries.
     */
    public synchronized void cleanupExpired() {
        if (!isOpen("cleanup")) {
            return;
        }
        String sql = "DELETE FROM page_cache WHERE expires_at <= ?";
        try (PreparedStatement ps = connection.prepareStatement(sql)) {
            ps.setLong(1, Instant.now().getEpochSecond());
            int deleted = ps.executeUpdate();
            if (deleted > 0) {
                System.out.println("✓ Cleaned up " + deleted + " expired cache entries");
            }
        } catch (SQLException e) {
            System.err.println("Cache cleanup error: " + e.getMessage());
        }
    }

    /**
     * Print cache statistics: total, valid and expired entry counts and the
     * summed HTML length in KB.
     */
    public synchronized void printStats() {
        if (!isOpen("stats")) {
            return;
        }
        String sql = "SELECT COUNT(*) as total, " +
            "SUM(CASE WHEN expires_at > ? THEN 1 ELSE 0 END) as valid, " +
            "SUM(LENGTH(html)) as total_size " +
            "FROM page_cache";
        try (PreparedStatement ps = connection.prepareStatement(sql)) {
            ps.setLong(1, Instant.now().getEpochSecond());
            try (ResultSet rs = ps.executeQuery()) {
                if (rs.next()) {
                    int total = rs.getInt("total");
                    int valid = rs.getInt("valid");
                    long size = rs.getLong("total_size");
                    System.out.println("\n=== Cache Statistics ===");
                    System.out.println("Total entries: " + total);
                    System.out.println("Valid entries: " + valid);
                    System.out.println("Expired entries: " + (total - valid));
                    System.out.println("Total size: " + (size / 1024) + " KB");
                }
            }
        } catch (SQLException e) {
            System.err.println("Cache stats error: " + e.getMessage());
        }
    }

    /**
     * Close database connection. Safe to call more than once.
     */
    public synchronized void close() {
        if (connection != null) {
            try {
                connection.close();
            } catch (SQLException e) {
                System.err.println("Error closing cache database: " + e.getMessage());
            } finally {
                connection = null;
            }
        }
    }

    /** Reports whether the connection is available; logs the skipped operation when it is not. */
    private boolean isOpen(String operation) {
        if (connection != null) {
            return true;
        }
        System.err.println("Cache " + operation + " skipped: database not initialized");
        return false;
    }
}
}