Files
auctiora/src/main/java/com/auction/TroostwijkScraper.java
Tour 4bb36e23e9 start
Former-commit-id: 853c3cf53e
2025-12-03 15:09:39 +01:00

687 lines
27 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package com.auction;
/*
* TroostwijkScraper
*
* This example shows how you could build a Java-based scraper for the Dutch
* auctions on Troostwijk Auctions. The scraper uses a combination of
* HTTP requests and HTML parsing with the jsoup library to discover active
* auctions, calls Troostwijk's internal JSON API to fetch lot (kavel) data
* efficiently, writes the results into a local SQLite database, performs
* object detection on lot images using OpenCV's DNN module, and sends
* desktop/email notifications when bids change or lots are about to expire.
* The implementation uses well known open source libraries for each of these
* concerns. You can adjust the API endpoints and CSS selectors as
* Troostwijk's site evolves. The code is organised into small helper
* classes to make it easier to maintain.
*
* Dependencies (add these to your Maven/Gradle project):
*
* - org.jsoup:jsoup:1.17.2 – HTML parser and HTTP client.
* - com.fasterxml.jackson.core:jackson-databind:2.17.0 – JSON parsing.
* - org.xerial:sqlite-jdbc:3.45.1.0 – SQLite JDBC driver.
* - com.sun.mail:javax.mail:1.6.2 – JavaMail for email notifications (free).
* - org.openpnp:opencv:4.9.0-0 (with native libraries) – OpenCV for image
* processing and object detection.
* - com.microsoft.playwright:playwright – headless browser used to render
* the JavaScript-driven auction listing pages.
*
* Before running this program you must ensure that the native OpenCV
* binaries are on your library path (e.g. via -Djava.library.path).
* Desktop notifications work out of the box on Windows, macOS, and Linux.
* For email notifications, you need a Gmail account with an app password
* (free, requires 2FA enabled). See https://support.google.com/accounts/answer/185833
*
* The scraper performs four major tasks:
* 1. Discover all auctions located in the Netherlands.
* 2. For each auction, fetch all lots (kavels) including images and
* bidding information, and persist the data into SQLite tables.
* 3. Monitor bidding and closing times on a schedule and send desktop/email
* notifications when bids change or lots are about to expire.
* 4. Run object detection on downloaded lot images to automatically
* label objects using a YOLO model. The results are stored in the
* database for later search.
*/
import com.fasterxml.jackson.databind.ObjectMapper;
import com.microsoft.playwright.Browser;
import com.microsoft.playwright.BrowserType;
import com.microsoft.playwright.Page;
import com.microsoft.playwright.Playwright;
import com.microsoft.playwright.options.WaitUntilState;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/**
* Main scraper class. It encapsulates the logic for scraping auctions,
* persisting data, scheduling updates, and performing object detection.
*/
public class TroostwijkScraper {
// Base URLs - adjust these if Troostwijk changes their site structure.
// Listing page paginated by discoverDutchAuctions(); "?page=N" is appended from page 2 onwards.
private static final String AUCTIONS_PAGE = "https://www.troostwijkauctions.com/auctions";
// JSON endpoint queried by fetchLotsForSale() and refreshLotBid().
private static final String LOT_API = "https://api.troostwijkauctions.com/lot/7/list";
// Path handed to CacheDatabase when page caching is enabled.
private static final String CACHE_DB_PATH = "cache/page_cache.db";
// Expiration value (hours) passed to cacheDb.put() for every cached listing page.
private static final long CACHE_EXPIRATION_HOURS = 24;
// Pause in milliseconds after each listing page that was fetched live (not from cache).
private static final int RATE_LIMIT_MS = 200;
// HTTP client used for lot API calls and image downloads
private final HttpClient httpClient;
// Parses the JSON bodies returned by the lot API
private final ObjectMapper objectMapper;
// Persistence layer for auctions, lots and images; public, so external code may read it directly
public final DatabaseService db;
// Sends the closing-soon and bid-change notifications
private final NotificationService notifier;
// Labels downloaded lot images (detectObjects is called once per image)
private final ObjectDetectionService detector;
// Listing-page cache; null when useCache is false
private final CacheDatabase cacheDb;
// Whether listing pages are read from / written to cacheDb
private final boolean useCache;
// Playwright runtime and browser; both null until initializeBrowser() runs and again after close()
private Playwright playwright;
private Browser browser;
/**
* Convenience constructor. Delegates to the seven-argument constructor with
* page caching enabled ({@code useCache = true}); all other arguments are
* passed through unchanged.
*
* @param databasePath Path to SQLite database file
* @param notificationConfig Notification configuration string, e.g. "desktop" or
* "smtp:user:pass:toEmail" (the format is interpreted by NotificationService,
* which is not visible in this file)
* @param unused Despite its name this value is forwarded to the delegate constructor;
* the name is kept for compatibility
* @param yoloCfgPath Path to YOLO configuration file
* @param yoloWeightsPath Path to YOLO weights file
* @param classNamesPath Path to file containing class names
* @throws SQLException declared by the delegate constructor
* @throws IOException declared by the delegate constructor
*/
public TroostwijkScraper(String databasePath, String notificationConfig, String unused,
String yoloCfgPath, String yoloWeightsPath, String classNamesPath) throws SQLException, IOException {
this(databasePath, notificationConfig, unused, yoloCfgPath, yoloWeightsPath, classNamesPath, true);
}
/**
 * Primary constructor with explicit cache control. Builds the HTTP client,
 * JSON mapper and the supporting services, then makes sure the database
 * schema exists and, when caching is enabled, initializes the page cache.
 *
 * @param databasePath Path to SQLite database file
 * @param notificationConfig Notification configuration string, e.g. "desktop" or
 *        "smtp:user:pass:toEmail" (interpreted by NotificationService)
 * @param unused Passed as the second argument to NotificationService; the
 *        parameter name is kept for compatibility
 * @param yoloCfgPath Path to YOLO configuration file
 * @param yoloWeightsPath Path to YOLO weights file
 * @param classNamesPath Path to file containing class names
 * @param useCache Enable page caching; when false no CacheDatabase is created
 * @throws SQLException declared for failures while creating the services or schema
 * @throws IOException declared for failures while creating the services
 */
public TroostwijkScraper(String databasePath, String notificationConfig, String unused,
        String yoloCfgPath, String yoloWeightsPath, String classNamesPath,
        boolean useCache) throws SQLException, IOException {
    this.useCache = useCache;
    this.httpClient = HttpClient.newHttpClient();
    this.objectMapper = new ObjectMapper();

    // Services are created in the same order as before: database, notifier, detector, cache.
    this.db = new DatabaseService(databasePath);
    this.notifier = new NotificationService(notificationConfig, unused);
    this.detector = new ObjectDetectionService(yoloCfgPath, yoloWeightsPath, classNamesPath);
    if (useCache) {
        this.cacheDb = new CacheDatabase(CACHE_DB_PATH);
    } else {
        this.cacheDb = null;
    }

    // Schema first, then the optional page cache.
    this.db.ensureSchema();
    if (this.cacheDb != null) {
        this.cacheDb.initialize();
    }
}
/**
 * Initializes the Playwright runtime and a headless Chromium browser used for
 * JavaScript-rendered pages. Does nothing when a browser is already available.
 * discoverDutchAuctions() calls this itself when no browser exists, so calling
 * it up front is optional.
 *
 * If launching the browser fails, the Playwright runtime created here is
 * closed again before the exception is rethrown, so a later call can retry
 * from a clean state instead of silently leaving {@code browser} null.
 */
public void initializeBrowser() {
    // Guard on the browser (not on playwright): that is what callers actually need.
    if (browser != null) {
        return;
    }
    IO.println("Initializing Playwright browser...");
    if (playwright == null) {
        playwright = Playwright.create();
    }
    try {
        browser = playwright.chromium().launch(new BrowserType.LaunchOptions()
                .setHeadless(true)
                .setArgs(Arrays.asList("--no-sandbox", "--disable-setuid-sandbox")));
    } catch (RuntimeException e) {
        // Do not keep a half-initialized state around: release Playwright and allow a retry.
        try {
            playwright.close();
        } catch (RuntimeException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        playwright = null;
        throw e;
    }
    IO.println("✓ Browser ready");
}
/**
 * Closes the browser, the Playwright runtime and the page cache.
 *
 * Each resource is released even if closing an earlier one throws; the first
 * exception still propagates to the caller. The browser and Playwright
 * references are reset to null so initializeBrowser() can be called again.
 */
public void close() {
    try {
        if (browser != null) {
            browser.close();
        }
    } finally {
        browser = null;
        try {
            if (playwright != null) {
                playwright.close();
            }
        } finally {
            playwright = null;
            if (cacheDb != null) {
                cacheDb.close();
            }
        }
    }
}
/**
 * Discovers Dutch auctions by walking the paginated auctions listing.
 *
 * For every page the cached HTML is used when available; otherwise the page is
 * rendered with Playwright, stored in the cache (when caching is enabled) and
 * followed by a short rate-limit pause. The HTML is handed to
 * parseAuctionsFromHtml, which saves auctions whose country is "NL" to the
 * database and adds their ids to the result set.
 *
 * Pagination ends when a page cannot be fetched, when the thread is
 * interrupted during the rate-limit pause, or at the first page that yields
 * no new Dutch auctions.
 *
 * @return a list of sale identifiers for auctions located in NL (legacy compatibility)
 */
public List<Integer> discoverDutchAuctions() {
    // Make sure a browser exists before the first live fetch.
    if (browser == null) {
        initializeBrowser();
    }
    Set<Integer> discovered = new HashSet<>();
    IO.println("Starting Dutch auction discovery from " + AUCTIONS_PAGE);

    for (int currentPage = 1; ; currentPage++) {
        IO.println("\n[Page " + currentPage + "] Fetching auctions...");

        String pageHtml = loadFromCache(currentPage);
        if (pageHtml == null) {
            // Cache miss: render the page in the browser.
            pageHtml = fetchPageWithPlaywright(currentPage);
            if (pageHtml == null || pageHtml.isEmpty()) {
                IO.println(" ⚠️ Failed to fetch page, stopping pagination");
                break;
            }
            IO.println(" ✓ Fetched from website");
            if (useCache) {
                saveToCache(currentPage, pageHtml);
            }
            // Be polite to the site after every live fetch.
            try {
                Thread.sleep(RATE_LIMIT_MS);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                break;
            }
        } else {
            IO.println(" ✓ Loaded from cache");
        }

        // Parsing also persists the Dutch auctions it finds.
        int dutchOnPage = parseAuctionsFromHtml(pageHtml, discovered);
        if (dutchOnPage == 0) {
            IO.println(" ⚠️ No Dutch auctions found on page, stopping pagination");
            break;
        }
        IO.println(" ✓ Found " + dutchOnPage + " Dutch auctions");
    }

    IO.println("\n✓ Total Dutch auctions discovered: " + discovered.size());
    return new ArrayList<>(discovered);
}
/**
 * Fetches a single listing page using Playwright and returns its rendered HTML.
 *
 * The page is opened in try-with-resources so it is closed on every path,
 * including navigation timeouts and other Playwright errors.
 *
 * @param pageNumber 1-based listing page; page 1 is the bare listing URL,
 *        later pages append "?page=N"
 * @return the rendered HTML, or null when the page could not be fetched
 */
private String fetchPageWithPlaywright(int pageNumber) {
    var url = pageNumber == 1
            ? AUCTIONS_PAGE
            : AUCTIONS_PAGE + "?page=" + pageNumber;
    try (var page = browser.newPage()) {
        // Set user agent
        page.setExtraHTTPHeaders(Map.of(
                "User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        ));
        // Navigate to page
        page.navigate(url, new Page.NavigateOptions()
                .setTimeout(30000)
                .setWaitUntil(WaitUntilState.NETWORKIDLE));
        // Wait for auction listings to appear
        try {
            page.waitForSelector("a[href^='/a/']", new Page.WaitForSelectorOptions()
                    .setTimeout(10000));
        } catch (Exception e) {
            // Continue even if selector not found: the HTML is still returned and parsed.
            IO.println(" ⚠️ Auction selector not found");
        }
        // Get HTML content; the page itself is closed by try-with-resources.
        return page.content();
    } catch (Exception e) {
        // Boundary catch: any Playwright failure (or a missing browser) means "no page".
        System.err.println(" ⚠️ Playwright error: " + e.getMessage());
        return null;
    }
}
/**
 * Parses auctions from HTML using JSoup and saves Dutch auctions to database.
 *
 * Every link whose href starts with "/a/" and carries an id of the form
 * "-A1-12345" or "-A7-12345" is inspected. Ids already present in
 * {@code saleIds} are skipped. Auctions whose extracted country is "NL" are
 * added to {@code saleIds}, counted, and upserted into the database; a failed
 * upsert is logged but the auction still counts as found.
 *
 * A link whose numeric id does not fit in an int is skipped on its own rather
 * than aborting the rest of the page.
 *
 * @param html rendered listing HTML
 * @param saleIds set of already discovered auction ids; new Dutch ids are added to it
 * @return number of Dutch auctions found on this page
 */
private int parseAuctionsFromHtml(String html, Set<Integer> saleIds) {
    var foundCount = 0;
    // Compiled once per page instead of once per link.
    var idPattern = java.util.regex.Pattern.compile("/a/.*?-A([17])-(\\d+)");
    try {
        var doc = org.jsoup.Jsoup.parse(html);
        // Find all auction links (format: /a/title-A1-12345 or /a/title-A7-12345)
        for (var link : doc.select("a[href^='/a/']")) {
            var href = link.attr("href");
            var matcher = idPattern.matcher(href);
            if (!matcher.find()) {
                continue;
            }
            var typeNum = matcher.group(1);
            int auctionId;
            try {
                auctionId = Integer.parseInt(matcher.group(2));
            } catch (NumberFormatException e) {
                // Digits only, but too large for an int: ignore this link, keep the page.
                System.err.println(" Skipping auction link with out-of-range id: " + href);
                continue;
            }
            // Skip duplicates
            if (saleIds.contains(auctionId)) {
                continue;
            }
            var auction = extractAuctionInfo(link, href, auctionId, "A" + typeNum);
            // Only keep Dutch auctions
            if (auction == null || !"NL".equals(auction.country)) {
                continue;
            }
            saleIds.add(auctionId);
            foundCount++;
            // Save to database
            try {
                db.upsertAuction(auction);
                IO.println(" Found Dutch auction: " + auctionId + " - " + auction.title + " (" + auction.location + ")");
            } catch (SQLException e) {
                System.err.println(" Failed to save auction: " + e.getMessage());
            }
        }
    } catch (Exception e) {
        // Boundary catch: a parsing failure ends this page but keeps what was found so far.
        System.err.println(" Error parsing HTML: " + e.getMessage());
    }
    return foundCount;
}
// Precompiled patterns used by extractAuctionInfo (previously recompiled on every call).
private static final java.util.regex.Pattern AUCTION_TITLE_SLUG =
        java.util.regex.Pattern.compile("/a/(.+?)-A[17]-");
private static final java.util.regex.Pattern TEXT_WITH_DIGITS =
        java.util.regex.Pattern.compile(".*\\d+.*");
private static final java.util.regex.Pattern TITLE_SEPARATOR =
        java.util.regex.Pattern.compile(",|\\d+");
private static final java.util.regex.Pattern ENDS_WITH_COUNTRY_CODE =
        java.util.regex.Pattern.compile(".*[A-Z]{2}$");
private static final java.util.regex.Pattern TRAILING_COMMA_OR_SPACE =
        java.util.regex.Pattern.compile("[,\\s]+$");
private static final java.util.regex.Pattern SPAN_LOCATION =
        java.util.regex.Pattern.compile(
                "([A-Za-z][A-Za-z\\s,\\-']+?)\\s*(?:<!--.*?-->)?\\s*</span>\\s*([A-Z]{2})(?![A-Za-z])");
private static final java.util.regex.Pattern LOT_COUNT_TEXT =
        java.util.regex.Pattern.compile("(\\d+)\\s+(?:kavel|lot|item)s?.*");

/**
 * Extracts auction information from a link element using JSoup.
 *
 * Populates, as far as the markup allows:
 * - Title: derived from the URL slug, replaced by the link text when that
 *   text contains no digits and its first comma-separated part is longer
 *   than five characters
 * - Location (city and country): from a nested {@code <p>} whose text ends in
 *   two uppercase letters, otherwise from a "city</span>CC" pattern in the raw HTML
 * - Lot count (if available): from the first element whose own text starts
 *   with "N kavel(s)/lot(s)/item(s)"
 *
 * @param link the anchor element of the auction
 * @param href the link's href, expected to look like /a/some-title-A1-12345
 * @param auctionId numeric auction id already parsed from the href
 * @param type auction type prefix ("A1" or "A7")
 * @return the populated AuctionInfo; country, city and location stay unset
 *         when no location could be recognised
 */
private AuctionInfo extractAuctionInfo(org.jsoup.nodes.Element link, String href, int auctionId, String type) {
    var auction = new AuctionInfo();
    auction.auctionId = auctionId;
    auction.type = type;
    auction.url = "https://www.troostwijkauctions.com" + href;

    // Extract title from href (convert kebab-case to title)
    var titleMatcher = AUCTION_TITLE_SLUG.matcher(href);
    if (titleMatcher.find()) {
        auction.title = titleMatcher.group(1).replace("-", " ");
        // Capitalize first letter; Locale.ROOT keeps the result independent of the JVM locale.
        if (!auction.title.isEmpty()) {
            auction.title = auction.title.substring(0, 1).toUpperCase(java.util.Locale.ROOT)
                    + auction.title.substring(1);
        }
    } else {
        auction.title = "Unknown Auction";
    }

    // Try to find title in link text (more accurate)
    var linkText = link.text();
    if (!linkText.isEmpty() && !TEXT_WITH_DIGITS.matcher(linkText).matches()) {
        // If link text doesn't contain numbers, it's likely the title
        var parts = TITLE_SEPARATOR.split(linkText);
        if (parts.length > 0 && parts[0].trim().length() > 5) {
            auction.title = parts[0].trim();
        }
    }

    // Extract location: look for <p> tags whose text ends with a 2-letter country code
    for (var p : link.select("p")) {
        var text = p.text();
        // Pattern: "City, Country" or "City, Region, Country"
        if (ENDS_WITH_COUNTRY_CODE.matcher(text).matches()) {
            var countryCode = text.substring(text.length() - 2);
            var cityPart = text.substring(0, text.length() - 2).trim();
            // Remove trailing comma or whitespace
            cityPart = TRAILING_COMMA_OR_SPACE.matcher(cityPart).replaceAll("");
            auction.country = countryCode;
            auction.city = cityPart;
            auction.location = cityPart + ", " + countryCode;
            break;
        }
    }

    // Fallback: check HTML content directly
    if (auction.country == null) {
        var locMatcher = SPAN_LOCATION.matcher(link.html());
        if (locMatcher.find()) {
            var city = locMatcher.group(1).trim().replaceAll(",$", "");
            var country = locMatcher.group(2);
            auction.city = city;
            auction.country = country;
            auction.location = city + ", " + country;
        }
    }

    // Extract lot count if available (kavels/lots)
    for (var elem : link.select("*")) {
        var countMatcher = LOT_COUNT_TEXT.matcher(elem.ownText());
        if (!countMatcher.matches()) {
            continue;
        }
        try {
            auction.lotCount = Integer.parseInt(countMatcher.group(1));
            break;
        } catch (NumberFormatException ignored) {
            // Count too large for an int: treat as "not available" and keep looking,
            // rather than letting one odd element abort parsing of the whole page.
        }
    }
    return auction;
}
/**
 * Looks up the cached HTML of a listing page.
 *
 * @param pageNumber 1-based listing page number
 * @return whatever the cache returns for the page URL, or null when caching
 *         is disabled or no cache database exists
 */
private String loadFromCache(int pageNumber) {
    if (useCache && cacheDb != null) {
        // The page URL doubles as the cache key; page 1 has no query string.
        String cacheKey = AUCTIONS_PAGE;
        if (pageNumber != 1) {
            cacheKey = AUCTIONS_PAGE + "?page=" + pageNumber;
        }
        return cacheDb.get(cacheKey);
    }
    return null;
}
/**
 * Stores the HTML of a listing page in the cache with the configured
 * expiration. Does nothing when caching is disabled or no cache database exists.
 *
 * @param pageNumber 1-based listing page number
 * @param html page HTML to store
 */
private void saveToCache(int pageNumber, String html) {
    if (useCache && cacheDb != null) {
        // The page URL doubles as the cache key; page 1 has no query string.
        String cacheKey = AUCTIONS_PAGE;
        if (pageNumber != 1) {
            cacheKey = AUCTIONS_PAGE + "?page=" + pageNumber;
        }
        cacheDb.put(cacheKey, html, CACHE_EXPIRATION_HOURS);
    }
}
/**
 * Retrieves all lots for a given sale ID using Troostwijk's internal JSON
 * API. Lots are requested in batches of 200 with an increasing offset until
 * the API returns a non-200 status or an empty/non-array "results" field.
 * Each JSON result is mapped to a Lot, upserted into the database, and its
 * images are downloaded and run through object detection.
 *
 * Error handling:
 * - A database failure affects only the lot being processed: it is logged
 *   and the remaining lots and batches are still handled. (Previously the
 *   failure skipped the offset increment, so the same batch was refetched
 *   in an endless loop.)
 * - An I/O failure or an interrupt stops fetching for this sale; on
 *   interrupt the thread's interrupt flag is restored.
 *
 * @param saleId the sale identifier
 */
public void fetchLotsForSale(int saleId) {
    final int batchSize = 200;
    var offset = 0;
    var totalLots = 0;
    while (true) {
        try {
            var url = LOT_API + "?batchSize=" + batchSize
                    + "&listType=7&offset=" + offset
                    + "&sortOption=0&saleID=" + saleId
                    + "&parentID=0&relationID=0&buildversion=201807311";
            IO.println(" Fetching lots from API (offset=" + offset + ")...");
            var request = HttpRequest.newBuilder()
                    .uri(URI.create(url))
                    .header("Accept", "application/json")
                    .header("User-Agent", "Mozilla/5.0")
                    .GET()
                    .build();
            var response = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
            if (response.statusCode() != 200) {
                System.err.println(" ⚠️ API call failed for sale " + saleId);
                System.err.println(" Status: " + response.statusCode());
                System.err.println(" Response: " + response.body().substring(0, Math.min(200, response.body().length())));
                break;
            }
            var results = objectMapper.readTree(response.body()).path("results");
            if (!results.isArray() || results.isEmpty()) {
                // Only worth reporting when the very first batch is empty.
                if (offset == 0) {
                    IO.println(" ⚠️ No lots found for sale " + saleId);
                    IO.println(" API Response: " + response.body().substring(0, Math.min(500, response.body().length())));
                }
                break;
            }
            IO.println(" Found " + results.size() + " lots in this batch");
            for (var node : results) {
                try {
                    var lot = new Lot();
                    lot.saleId = saleId;
                    lot.lotId = node.path("lotID").asInt();
                    lot.title = node.path("t").asText();
                    lot.description = node.path("d").asText();
                    lot.manufacturer = node.path("mf").asText();
                    lot.type = node.path("typ").asText();
                    lot.year = node.path("yb").asInt();
                    lot.category = node.path("lc").asText();
                    // Current bid; field names may differ (e.g. currentBid or cb)
                    lot.currentBid = node.path("cb").asDouble();
                    lot.currency = node.path("cu").asText();
                    lot.url = "https://www.troostwijkauctions.com/nl" + node.path("url").asText();
                    // Save basic lot info into DB
                    db.upsertLot(lot);
                    totalLots++;
                    // Download and analyze images (optional, can be slow)
                    var imgs = node.path("imgs");
                    if (imgs.isArray()) {
                        for (var imgNode : imgs) {
                            var imgUrl = imgNode.asText();
                            var fileName = downloadImage(imgUrl, saleId, lot.lotId);
                            if (fileName != null) {
                                // run object detection once per image
                                var labels = detector.detectObjects(fileName);
                                db.insertImage(lot.lotId, imgUrl, fileName, labels);
                            }
                        }
                    }
                } catch (SQLException e) {
                    // Skip just this lot; the offset below still advances so the loop terminates.
                    System.err.println("Database error: " + e.getMessage());
                }
            }
            IO.println(" ✓ Processed " + totalLots + " lots so far");
            offset += batchSize;
        } catch (IOException e) {
            System.err.println("Error fetching lots for sale " + saleId + ": " + e.getMessage());
            break;
        } catch (InterruptedException e) {
            // Preserve the interrupt for callers further up the stack.
            Thread.currentThread().interrupt();
            System.err.println("Error fetching lots for sale " + saleId + ": " + e.getMessage());
            break;
        }
    }
}
/**
 * Downloads an image from the given URL to a local directory. Images
 * are stored under "images/<saleId>/<lotId>/" to keep them organised.
 *
 * The local file name is the last segment of the URL's path component, so
 * query strings are not part of the name and the URL is never interpreted
 * as a file-system path (which fails on platforms where ':' is illegal in
 * paths). The response stream is always closed, also for non-200 responses.
 * An existing file with the same name is not overwritten; in that case the
 * copy fails and null is returned, as before.
 *
 * @param imageUrl remote image URL
 * @param saleId sale identifier
 * @param lotId lot identifier
 * @return absolute path to saved file or null on failure
 */
private String downloadImage(String imageUrl, int saleId, int lotId) {
    try {
        var uri = URI.create(imageUrl);
        var request = HttpRequest.newBuilder()
                .uri(uri)
                .GET()
                .build();
        var response = httpClient.send(request, HttpResponse.BodyHandlers.ofInputStream());
        // Closing the body releases the connection even when the image is not saved.
        try (var body = response.body()) {
            if (response.statusCode() != 200) {
                return null;
            }
            var remotePath = uri.getPath();
            var fileName = remotePath == null
                    ? ""
                    : remotePath.substring(remotePath.lastIndexOf('/') + 1);
            // Reject names that cannot denote a file inside the target directory.
            if (fileName.isEmpty() || fileName.equals(".") || fileName.equals("..")) {
                System.err.println("Failed to download image " + imageUrl + ": no usable file name in URL");
                return null;
            }
            var dir = Paths.get("images", String.valueOf(saleId), String.valueOf(lotId));
            Files.createDirectories(dir);
            var dest = dir.resolve(fileName);
            Files.copy(body, dest);
            return dest.toAbsolutePath().toString();
        }
    } catch (IOException e) {
        System.err.println("Failed to download image " + imageUrl + ": " + e.getMessage());
    } catch (InterruptedException e) {
        // Preserve the interrupt for callers further up the stack.
        Thread.currentThread().interrupt();
        System.err.println("Failed to download image " + imageUrl + ": " + e.getMessage());
    } catch (IllegalArgumentException e) {
        // Malformed or non-HTTP URL, or a file name the local file system rejects
        // (InvalidPathException is an IllegalArgumentException).
        System.err.println("Failed to download image " + imageUrl + ": " + e.getMessage());
    }
    return null;
}
/**
 * Schedules periodic monitoring of all active lots. A single-threaded
 * scheduler runs immediately and then every hour: it refreshes each lot's
 * current bid and, for lots with less than 30 minutes left, schedules one
 * additional refresh five minutes later. When a lot has five minutes or
 * less left and has not been announced yet, a notification is sent through
 * the configured NotificationService and the lot's notification flag is
 * persisted.
 *
 * Unexpected runtime exceptions inside a run are caught and logged, because
 * a periodic task that throws is never executed again by the scheduler.
 *
 * Note: the scheduler is not exposed and never shut down here. In
 * production, ensure proper shutdown handling for the scheduler.
 */
public void scheduleMonitoring() {
    var scheduler = Executors.newScheduledThreadPool(1);
    scheduler.scheduleAtFixedRate(() -> {
        try {
            var activeLots = db.getActiveLots();
            for (var lot : activeLots) {
                // refresh the lot's bidding information via API
                refreshLotBid(lot);
                // check closing time to adjust monitoring
                var minutesLeft = lot.minutesUntilClose();
                if (minutesLeft < 30) {
                    // send warning when within 5 minutes
                    if (minutesLeft <= 5 && !lot.closingNotified) {
                        notifier.sendNotification("Kavel " + lot.lotId + " sluit binnen " + minutesLeft + " min.",
                                "Lot nearing closure", 1);
                        lot.closingNotified = true;
                        db.updateLotNotificationFlags(lot);
                    }
                    // schedule additional quick check for this lot
                    scheduler.schedule(() -> refreshLotBid(lot), 5, TimeUnit.MINUTES);
                }
            }
        } catch (SQLException e) {
            System.err.println("Error during scheduled monitoring: " + e.getMessage());
        } catch (RuntimeException e) {
            // Keep the hourly task alive: an exception escaping here would cancel all future runs.
            System.err.println("Error during scheduled monitoring: " + e);
        }
    }, 0, 1, TimeUnit.HOURS);
}
/**
 * Refreshes the bid for a single lot and sends a notification if it has
 * increased since the last check. The method calls the same API used for
 * initial scraping (batch size 1, filtered by sale and lot id) and only
 * reads the current bid ("cb") of the first result. A higher bid is stored
 * on the lot, persisted, and announced; equal or lower values are ignored.
 *
 * Non-200 responses are ignored silently. I/O and database failures are
 * logged; on interrupt the thread's interrupt flag is restored.
 *
 * @param lot the lot to refresh
 */
private void refreshLotBid(Lot lot) {
    try {
        var url = LOT_API + "?batchSize=1&listType=7&offset=0&sortOption=0&saleID=" + lot.saleId
                + "&parentID=0&relationID=0&buildversion=201807311&lotID=" + lot.lotId;
        var request = HttpRequest.newBuilder().uri(URI.create(url)).GET().build();
        var response = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
        if (response.statusCode() != 200) {
            return;
        }
        var results = objectMapper.readTree(response.body()).path("results");
        if (!results.isArray() || results.isEmpty()) {
            return;
        }
        var newBid = results.get(0).path("cb").asDouble();
        // Only an increase counts as a new bid.
        if (Double.compare(newBid, lot.currentBid) > 0) {
            var previous = lot.currentBid;
            lot.currentBid = newBid;
            db.updateLotCurrentBid(lot);
            var msg = String.format("Nieuw bod op kavel %d: €%.2f (was €%.2f)", lot.lotId, newBid, previous);
            notifier.sendNotification(msg, "Kavel bieding update", 0);
        }
    } catch (InterruptedException e) {
        // Preserve the interrupt so the executor/caller can react to it.
        Thread.currentThread().interrupt();
        System.err.println("Failed to refresh bid for lot " + lot.lotId + ": " + e.getMessage());
    } catch (IOException | SQLException e) {
        System.err.println("Failed to refresh bid for lot " + lot.lotId + ": " + e.getMessage());
    }
}
/**
 * Prints summary statistics about the stored data: the number of lots, the
 * number of images, and (when at least one lot exists) the sum of all
 * current bids. A database failure is reported on stderr instead.
 */
public void printDatabaseStats() {
    try {
        // Both queries run before anything is printed, so a failure prints only the error.
        var lots = db.getAllLots();
        var images = db.getImageCount();
        IO.println(" Total lots in database: " + lots.size());
        IO.println(" Total images downloaded: " + images);
        if (lots.isEmpty()) {
            return;
        }
        var bidSum = lots.stream()
                .mapToDouble(entry -> entry.currentBid)
                .sum();
        IO.println(" Total current bids: €" + "%.2f".formatted(bidSum));
    } catch (SQLException e) {
        System.err.println(" ⚠️ Could not retrieve database stats: " + e.getMessage());
    }
}
// ----------------------------------------------------------------------
// Domain classes and services
// ----------------------------------------------------------------------
}