package com.auction;

/*
 * TroostwijkScraper
 *
 * This example shows how you could build a Java-based scraper for the Dutch
 * auctions on Troostwijk Auctions. The scraper renders the auction listing
 * pages in a headless browser (Playwright), parses the resulting HTML with
 * the jsoup library to discover active auctions, calls Troostwijk's internal
 * JSON API to fetch lot (kavel) data efficiently, writes the results into a
 * local SQLite database, performs object detection on lot images using
 * OpenCV's DNN module, and sends desktop/email notifications when bids change
 * or lots are about to expire. The implementation uses well-known open source
 * libraries for each of these concerns. You can adjust the API endpoints and
 * CSS selectors as Troostwijk's site evolves. The code is organised into
 * small helper classes to make it easier to maintain.
 *
 * Dependencies (add these to your Maven/Gradle project):
 *
 *  - org.jsoup:jsoup:1.17.2 – HTML parser and HTTP client.
 *  - com.fasterxml.jackson.core:jackson-databind:2.17.0 – JSON parsing.
 *  - org.xerial:sqlite-jdbc:3.45.1.0 – SQLite JDBC driver.
 *  - com.sun.mail:javax.mail:1.6.2 – JavaMail for email notifications (free).
 *  - org.openpnp:opencv:4.9.0-0 (with native libraries) – OpenCV for image
 *    processing and object detection.
 *  - com.microsoft.playwright:playwright – headless browser used to render the
 *    listing pages (imported below; choose a version that suits your build).
 *
 * Before running this program you must ensure that the native OpenCV
 * binaries are on your library path (e.g. via -Djava.library.path).
 * Desktop notifications work out of the box on Windows, macOS, and Linux.
 * For email notifications, you need a Gmail account with an app password
 * (free, requires 2FA enabled). See https://support.google.com/accounts/answer/185833
 *
 * The scraper performs four major tasks:
 *  1. Discover all auctions located in the Netherlands.
 *  2. For each auction, fetch all lots (kavels) including images and
 *     bidding information, and persist the data into SQLite tables.
 *  3. Monitor bidding and closing times on a schedule and send desktop/email
 *     notifications when bids change or lots are about to expire.
 *  4. Run object detection on downloaded lot images to automatically
 *     label objects using a YOLO model. The results are stored in the
 *     database for later search.
 */

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.microsoft.playwright.Browser;
import com.microsoft.playwright.BrowserType;
import com.microsoft.playwright.Page;
import com.microsoft.playwright.Playwright;
import com.microsoft.playwright.options.WaitUntilState;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;

/**
 * Main scraper class. It encapsulates the logic for scraping auctions,
 * persisting data, scheduling updates, and performing object detection.
 *
 * <p>Implements {@link AutoCloseable} so the browser and cache can be released
 * with try-with-resources. Closing does not stop a monitoring schedule started
 * by {@link #scheduleMonitoring()}; use {@link #stopMonitoring()} for that.
 */
public class TroostwijkScraper implements AutoCloseable {

    // Base URLs – adjust these if Troostwijk changes their site structure
    private static final String SITE_ROOT = "https://www.troostwijkauctions.com";
    private static final String AUCTIONS_PAGE = "https://www.troostwijkauctions.com/auctions";
    private static final String LOT_API = "https://api.troostwijkauctions.com/lot/7/list";
    private static final String CACHE_DB_PATH = "cache/page_cache.db";
    private static final long CACHE_EXPIRATION_HOURS = 24;
    private static final int RATE_LIMIT_MS = 200;

    // Regular expressions are compiled once instead of on every loop iteration.

    /** Auction links look like /a/title-A1-12345 or /a/title-A7-12345. */
    private static final Pattern AUCTION_ID_PATTERN = Pattern.compile("/a/.*?-A([17])-(\\d+)");
    /** Captures the kebab-case title slug that precedes the auction id. */
    private static final Pattern TITLE_SLUG_PATTERN = Pattern.compile("/a/(.+?)-A[17]-");
    /** Link text containing at least one digit. */
    private static final Pattern CONTAINS_DIGITS_PATTERN = Pattern.compile(".*\\d+.*");
    /** Text that ends with a two-letter upper-case code, e.g. "Amsterdam, NL". */
    private static final Pattern ENDS_WITH_COUNTRY_PATTERN = Pattern.compile(".*[A-Z]{2}$");
    // NOTE(review): the "(?:)?\\s*\\s*" part of this pattern looks like it lost
    // HTML markup at some point (an empty optional group and a doubled \s* are
    // no-ops). The intended markup cannot be recovered from this file, so the
    // pattern is kept byte-for-byte – confirm against the site's HTML.
    private static final Pattern LOCATION_FALLBACK_PATTERN = Pattern.compile(
        "([A-Za-z][A-Za-z\\s,\\-']+?)\\s*(?:)?\\s*\\s*([A-Z]{2})(?![A-Za-z])");
    /** Text such as "12 kavels", "3 lots" or "1 item ...". */
    private static final Pattern LOT_COUNT_TEXT_PATTERN =
        Pattern.compile("\\d+\\s+(?:kavel|lot|item)s?.*");
    /** First run of digits in a string. */
    private static final Pattern FIRST_NUMBER_PATTERN = Pattern.compile("(\\d+)");

    // HTTP client used for API calls
    private final HttpClient httpClient;
    private final ObjectMapper objectMapper;
    public final DatabaseService db;
    private final NotificationService notifier;
    private final ObjectDetectionService detector;
    private final CacheDatabase cacheDb;
    private final boolean useCache;
    private Playwright playwright;
    private Browser browser;
    /** Scheduler created by {@link #scheduleMonitoring()}; null while monitoring is not running. */
    private ScheduledExecutorService monitorScheduler;

    /**
     * Constructor. Creates supporting services and ensures the database
     * tables exist. Page caching is enabled.
     *
     * @param databasePath       Path to SQLite database file
     * @param notificationConfig "desktop" for desktop only, or "smtp:user:pass:toEmail" for email
     * @param unused             Unused parameter (kept for compatibility)
     * @param yoloCfgPath        Path to YOLO configuration file
     * @param yoloWeightsPath    Path to YOLO weights file
     * @param classNamesPath     Path to file containing class names
     * @throws SQLException if a supporting service reports a database error
     * @throws IOException  if a supporting service reports an I/O error
     */
    public TroostwijkScraper(String databasePath, String notificationConfig, String unused,
                             String yoloCfgPath, String yoloWeightsPath, String classNamesPath)
            throws SQLException, IOException {
        this(databasePath, notificationConfig, unused, yoloCfgPath, yoloWeightsPath, classNamesPath, true);
    }

    /**
     * Constructor with cache control.
     *
     * @param databasePath       Path to SQLite database file
     * @param notificationConfig "desktop" for desktop only, or "smtp:user:pass:toEmail" for email
     * @param unused             Unused parameter (kept for compatibility)
     * @param yoloCfgPath        Path to YOLO configuration file
     * @param yoloWeightsPath    Path to YOLO weights file
     * @param classNamesPath     Path to file containing class names
     * @param useCache           Enable page caching
     * @throws SQLException if a supporting service reports a database error
     * @throws IOException  if a supporting service reports an I/O error
     */
    public TroostwijkScraper(String databasePath, String notificationConfig, String unused,
                             String yoloCfgPath, String yoloWeightsPath, String classNamesPath,
                             boolean useCache) throws SQLException, IOException {
        this.httpClient = HttpClient.newHttpClient();
        this.objectMapper = new ObjectMapper();
        this.db = new DatabaseService(databasePath);
        this.notifier = new NotificationService(notificationConfig, unused);
        this.detector = new ObjectDetectionService(yoloCfgPath, yoloWeightsPath, classNamesPath);
        this.useCache = useCache;
        this.cacheDb = useCache ? new CacheDatabase(CACHE_DB_PATH) : null;

        // initialize DB
        db.ensureSchema();
        if (useCache) {
            cacheDb.initialize();
        }
    }

    /**
     * Initializes Playwright browser for JavaScript-rendered pages.
     * {@link #discoverDutchAuctions()} calls this automatically when needed.
     *
     * <p>Does nothing when a browser is already available. If launching the
     * browser fails, the Playwright instance is released again so that a later
     * call can retry from a clean state, and the failure is rethrown.
     */
    public void initializeBrowser() {
        if (browser != null) {
            return;
        }
        IO.println("Initializing Playwright browser...");
        if (playwright == null) {
            playwright = Playwright.create();
        }
        try {
            browser = playwright.chromium().launch(new BrowserType.LaunchOptions()
                .setHeadless(true)
                .setArgs(Arrays.asList("--no-sandbox", "--disable-setuid-sandbox")));
        } catch (RuntimeException e) {
            // Do not keep a half-initialised state (playwright set, browser null).
            playwright.close();
            playwright = null;
            throw e;
        }
        IO.println("✓ Browser ready");
    }

    /**
     * Closes browser and cache resources. Each resource is closed even if
     * closing an earlier one throws. A monitoring schedule started with
     * {@link #scheduleMonitoring()} is left running.
     */
    @Override
    public void close() {
        try {
            if (browser != null) {
                browser.close();
            }
        } finally {
            browser = null;
            try {
                if (playwright != null) {
                    playwright.close();
                }
            } finally {
                playwright = null;
                if (cacheDb != null) {
                    cacheDb.close();
                }
            }
        }
    }

    /**
     * Discovers all active Dutch auctions by crawling the auctions page.
     *
     * <p>Uses Playwright to render JavaScript-heavy pages and extract auction data.
     * Supports caching to avoid unnecessary page fetches. Keeps auctions whose
     * parsed country code is "NL". Each auction link contains a unique sale ID
     * in the format A1-xxxxx or A7-xxxxx.
     *
     * <p>Pagination stops at the first page that cannot be fetched or that
     * yields no new Dutch auction. Dutch auctions are saved to the database as
     * they are found.
     *
     * @return a list of sale identifiers for auctions located in NL (legacy compatibility)
     */
    public List<Integer> discoverDutchAuctions() {
        Set<Integer> saleIds = new HashSet<>();

        // Check if browser is initialized
        if (browser == null) {
            initializeBrowser();
        }

        var pageNumber = 1;
        var hasMorePages = true;
        IO.println("Starting Dutch auction discovery from " + AUCTIONS_PAGE);

        while (hasMorePages) {
            IO.println("\n[Page " + pageNumber + "] Fetching auctions...");

            // Check cache first
            var html = loadFromCache(pageNumber);
            if (html != null) {
                IO.println(" ✓ Loaded from cache");
            } else {
                // Fetch with Playwright
                html = fetchPageWithPlaywright(pageNumber);
                if (html == null || html.isEmpty()) {
                    IO.println(" ⚠️ Failed to fetch page, stopping pagination");
                    break;
                }
                IO.println(" ✓ Fetched from website");

                // Save to cache
                if (useCache) {
                    saveToCache(pageNumber, html);
                }

                // Rate limiting (only after a real network fetch)
                try {
                    Thread.sleep(RATE_LIMIT_MS);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break;
                }
            }

            // Parse auctions from HTML (saves Dutch auctions to database)
            var foundOnPage = parseAuctionsFromHtml(html, saleIds);
            if (foundOnPage == 0) {
                IO.println(" ⚠️ No Dutch auctions found on page, stopping pagination");
                hasMorePages = false;
            } else {
                IO.println(" ✓ Found " + foundOnPage + " Dutch auctions");
                pageNumber++;
            }
        }

        IO.println("\n✓ Total Dutch auctions discovered: " + saleIds.size());
        return new ArrayList<>(saleIds);
    }

    /**
     * Builds the listing URL for a page; page 1 is the bare auctions URL.
     */
    private static String pageUrl(int pageNumber) {
        return pageNumber == 1 ? AUCTIONS_PAGE : AUCTIONS_PAGE + "?page=" + pageNumber;
    }

    /**
     * Fetches a single listing page using Playwright.
     *
     * @param pageNumber 1-based page number
     * @return the rendered HTML, or null when Playwright reports an error
     */
    private String fetchPageWithPlaywright(int pageNumber) {
        var url = pageUrl(pageNumber);

        // try-with-resources closes the page even when navigation or content() throws
        try (var page = browser.newPage()) {
            // Set user agent
            page.setExtraHTTPHeaders(Map.of(
                "User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            ));

            // Navigate to page
            page.navigate(url, new Page.NavigateOptions()
                .setTimeout(30000)
                .setWaitUntil(WaitUntilState.NETWORKIDLE));

            // Wait for auction listings to appear
            try {
                page.waitForSelector("a[href^='/a/']", new Page.WaitForSelectorOptions()
                    .setTimeout(10000));
            } catch (Exception e) {
                // Continue even if selector not found
                IO.println(" ⚠️ Auction selector not found");
            }

            // Get HTML content
            return page.content();
        } catch (Exception e) {
            System.err.println(" ⚠️ Playwright error: " + e.getMessage());
            return null;
        }
    }

    /**
     * Parses auctions from HTML using JSoup and saves Dutch auctions to database.
     * Uses proper HTML parsing instead of regex for more reliable extraction.
     *
     * @param html    rendered HTML of one listing page
     * @param saleIds ids discovered so far; newly found Dutch auction ids are added
     * @return number of new Dutch auctions found on this page
     */
    private int parseAuctionsFromHtml(String html, Set<Integer> saleIds) {
        var foundCount = 0;
        try {
            var doc = Jsoup.parse(html);

            // Find all auction links (format: /a/title-A1-12345 or /a/title-A7-12345)
            var auctionLinks = doc.select("a[href^='/a/']");
            for (var link : auctionLinks) {
                var href = link.attr("href");

                // Extract auction ID from URL
                var matcher = AUCTION_ID_PATTERN.matcher(href);
                if (!matcher.find()) {
                    continue;
                }
                var typeNum = matcher.group(1);
                int auctionId;
                try {
                    auctionId = Integer.parseInt(matcher.group(2));
                } catch (NumberFormatException e) {
                    // An id too large for int should not abort the rest of the page.
                    System.err.println(" Skipping auction link with unparsable id: " + href);
                    continue;
                }

                // Skip duplicates
                if (saleIds.contains(auctionId)) {
                    continue;
                }

                // Extract auction info using JSoup
                var auction = extractAuctionInfo(link, href, auctionId, "A" + typeNum);

                // Only keep Dutch auctions
                if (auction != null && "NL".equals(auction.country)) {
                    saleIds.add(auctionId);
                    foundCount++;

                    // Save to database
                    try {
                        db.upsertAuction(auction);
                        IO.println(" Found Dutch auction: " + auctionId + " - " + auction.title
                            + " (" + auction.location + ")");
                    } catch (SQLException e) {
                        System.err.println(" Failed to save auction: " + e.getMessage());
                    }
                }
            }
        } catch (Exception e) {
            System.err.println(" Error parsing HTML: " + e.getMessage());
        }
        return foundCount;
    }

    /**
     * Extracts auction information from a link element using JSoup.
     * This method parses the HTML structure to extract:
     * - Title
     * - Location (city and country)
     * - Lot count (if available)
     *
     * @param link      the anchor element of the auction
     * @param href      the anchor's href attribute
     * @param auctionId numeric auction id parsed from the href
     * @param type      auction type prefix, "A1" or "A7"
     * @return the populated auction info; country/city/location stay unset when
     *         no location could be recognised
     */
    private AuctionInfo extractAuctionInfo(Element link, String href, int auctionId, String type) {
        var auction = new AuctionInfo();
        auction.auctionId = auctionId;
        auction.type = type;
        auction.url = SITE_ROOT + href;

        // Extract title from href (convert kebab-case to title)
        var titleMatcher = TITLE_SLUG_PATTERN.matcher(href);
        if (titleMatcher.find()) {
            var slug = titleMatcher.group(1);
            auction.title = slug.replace("-", " ");
            // Capitalize first letter
            if (!auction.title.isEmpty()) {
                auction.title = auction.title.substring(0, 1).toUpperCase() + auction.title.substring(1);
            }
        } else {
            auction.title = "Unknown Auction";
        }

        // Try to find title in link text (more accurate)
        var linkText = link.text();
        if (!linkText.isEmpty() && !CONTAINS_DIGITS_PATTERN.matcher(linkText).matches()) {
            // If link text doesn't contain numbers, it's likely the title
            var parts = linkText.split(",|\\d+");
            if (parts.length > 0 && parts[0].trim().length() > 5) {
                auction.title = parts[0].trim();
            }
        }

        // Extract location using JSoup selectors
        // Look for <p> tags that contain location info
        var locationElements = link.select("p");
        for (var p : locationElements) {
            var text = p.text();
            // Pattern: "City, Country" or "City, Region, Country"
            if (ENDS_WITH_COUNTRY_PATTERN.matcher(text).matches()) {
                // Ends with 2-letter country code
                var countryCode = text.substring(text.length() - 2);
                var cityPart = text.substring(0, text.length() - 2).trim();
                // Remove trailing comma or whitespace
                cityPart = cityPart.replaceAll("[,\\s]+$", "");
                auction.country = countryCode;
                auction.city = cityPart;
                auction.location = cityPart + ", " + countryCode;
                break;
            }
        }

        // Fallback: check HTML content directly
        if (auction.country == null) {
            var html = link.html();
            var locMatcher = LOCATION_FALLBACK_PATTERN.matcher(html);
            if (locMatcher.find()) {
                var city = locMatcher.group(1).trim().replaceAll(",$", "");
                var country = locMatcher.group(2);
                auction.city = city;
                auction.country = country;
                auction.location = city + ", " + country;
            }
        }

        // Extract lot count if available (kavels/lots)
        var textElements = link.select("*");
        for (var elem : textElements) {
            var text = elem.ownText();
            if (LOT_COUNT_TEXT_PATTERN.matcher(text).matches()) {
                var countMatcher = FIRST_NUMBER_PATTERN.matcher(text);
                if (countMatcher.find()) {
                    auction.lotCount = Integer.parseInt(countMatcher.group(1));
                    break;
                }
            }
        }

        return auction;
    }

    /**
     * Loads cached HTML for a page.
     *
     * @return the cached HTML, or null when caching is disabled or the cache has no entry
     */
    private String loadFromCache(int pageNumber) {
        if (!useCache || cacheDb == null) {
            return null;
        }
        return cacheDb.get(pageUrl(pageNumber));
    }

    /**
     * Saves HTML to cache under the page's URL; a no-op when caching is disabled.
     */
    private void saveToCache(int pageNumber, String html) {
        if (!useCache || cacheDb == null) {
            return;
        }
        cacheDb.put(pageUrl(pageNumber), html, CACHE_EXPIRATION_HOURS);
    }

    /**
     * Retrieves all lots for a given sale ID using Troostwijk's internal JSON
     * API. The API accepts parameters such as batchSize, offset, and saleID.
     * A large batchSize returns many lots at once. We loop until no further
     * results are returned. Each JSON result is mapped to our Lot domain
     * object and persisted to the database.
     *
     * <p>A database error affects only the lot being processed: it is logged
     * and the loop continues with the next lot, so the offset always advances
     * and the method terminates. Network errors, non-200 responses and thread
     * interruption stop the fetch.
     *
     * @param saleId the sale identifier
     */
    public void fetchLotsForSale(int saleId) {
        final var batchSize = 200;
        var offset = 0;
        var totalLots = 0;
        var more = true;

        while (more) {
            try {
                var url = LOT_API + "?batchSize=" + batchSize + "&listType=7&offset=" + offset
                    + "&sortOption=0&saleID=" + saleId
                    + "&parentID=0&relationID=0&buildversion=201807311";
                IO.println(" Fetching lots from API (offset=" + offset + ")...");

                var request = HttpRequest.newBuilder()
                    .uri(URI.create(url))
                    .header("Accept", "application/json")
                    .header("User-Agent", "Mozilla/5.0")
                    .GET()
                    .build();
                var response = httpClient.send(request, HttpResponse.BodyHandlers.ofString());

                if (response.statusCode() != 200) {
                    System.err.println(" ⚠️ API call failed for sale " + saleId);
                    System.err.println(" Status: " + response.statusCode());
                    System.err.println(" Response: " + truncate(response.body(), 200));
                    break;
                }

                var root = objectMapper.readTree(response.body());
                var results = root.path("results");
                if (!results.isArray() || results.isEmpty()) {
                    if (offset == 0) {
                        IO.println(" ⚠️ No lots found for sale " + saleId);
                        IO.println(" API Response: " + truncate(response.body(), 500));
                    }
                    break;
                }

                IO.println(" Found " + results.size() + " lots in this batch");
                for (var node : results) {
                    var lot = toLot(node, saleId);
                    try {
                        // Save basic lot info into DB
                        db.upsertLot(lot);
                        totalLots++;
                        // Download and analyze images (optional, can be slow)
                        processLotImages(node, lot);
                    } catch (SQLException e) {
                        // Handled per lot so that one failing row cannot make the
                        // outer loop re-request the same batch forever.
                        System.err.println("Database error: " + e.getMessage());
                    }
                }

                IO.println(" ✓ Processed " + totalLots + " lots so far");
                offset += batchSize;
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt(); // preserve the interrupt for callers
                System.err.println("Error fetching lots for sale " + saleId + ": " + e.getMessage());
                more = false;
            } catch (IOException e) {
                System.err.println("Error fetching lots for sale " + saleId + ": " + e.getMessage());
                more = false;
            }
        }
    }

    /**
     * Returns at most the first {@code maxLength} characters of {@code text}.
     */
    private static String truncate(String text, int maxLength) {
        return text.substring(0, Math.min(maxLength, text.length()));
    }

    /**
     * Maps one entry of the API's "results" array to a Lot.
     */
    private static Lot toLot(JsonNode node, int saleId) {
        var lot = new Lot();
        lot.saleId = saleId;
        lot.lotId = node.path("lotID").asInt();
        lot.title = node.path("t").asText();
        lot.description = node.path("d").asText();
        lot.manufacturer = node.path("mf").asText();
        lot.type = node.path("typ").asText();
        lot.year = node.path("yb").asInt();
        lot.category = node.path("lc").asText();
        // Current bid; field names may differ (e.g. currentBid or cb)
        lot.currentBid = node.path("cb").asDouble();
        lot.currency = node.path("cu").asText();
        lot.url = SITE_ROOT + "/nl" + node.path("url").asText();
        return lot;
    }

    /**
     * Downloads every image listed under "imgs" for a lot, runs object
     * detection once per downloaded image and stores the result.
     */
    private void processLotImages(JsonNode node, Lot lot) throws SQLException {
        var imgs = node.path("imgs");
        if (!imgs.isArray()) {
            return;
        }
        for (var imgNode : imgs) {
            var imgUrl = imgNode.asText();
            var fileName = downloadImage(imgUrl, lot.saleId, lot.lotId);
            if (fileName != null) {
                // run object detection once per image
                var labels = detector.detectObjects(fileName);
                db.insertImage(lot.lotId, imgUrl, fileName, labels);
            }
        }
    }

    /**
     * Downloads an image from the given URL to a local directory. Images
     * are stored under "images/{saleId}/{lotId}/" to keep them organised.
     * An existing file with the same name is overwritten, so re-running the
     * scraper does not fail on images that were downloaded before.
     *
     * @param imageUrl remote image URL
     * @param saleId   sale identifier
     * @param lotId    lot identifier
     * @return absolute path to saved file or null on failure
     */
    private String downloadImage(String imageUrl, int saleId, int lotId) {
        try {
            var request = HttpRequest.newBuilder()
                .uri(URI.create(imageUrl))
                .GET()
                .build();
            var response = httpClient.send(request, HttpResponse.BodyHandlers.ofInputStream());

            // Always close the body stream, also for non-200 responses, so the
            // underlying connection is released.
            try (var body = response.body()) {
                if (response.statusCode() != 200) {
                    return null;
                }
                var dir = Paths.get("images", String.valueOf(saleId), String.valueOf(lotId));
                Files.createDirectories(dir);
                var dest = dir.resolve(imageFileName(imageUrl));
                Files.copy(body, dest, StandardCopyOption.REPLACE_EXISTING);
                return dest.toAbsolutePath().toString();
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve the interrupt for callers
            System.err.println("Failed to download image " + imageUrl + ": " + e.getMessage());
        } catch (IOException | IllegalArgumentException e) {
            // IllegalArgumentException: malformed or non-HTTP URL (URI.create / request builder)
            System.err.println("Failed to download image " + imageUrl + ": " + e.getMessage());
        }
        return null;
    }

    /**
     * Derives a file name from the text after the last '/' of the URL.
     * Characters outside [A-Za-z0-9._-] are replaced with '_' so the name is a
     * valid single path segment on every platform; if nothing usable remains,
     * a name based on the URL's hash code is used instead.
     */
    private static String imageFileName(String imageUrl) {
        var lastSegment = imageUrl.substring(imageUrl.lastIndexOf('/') + 1);
        var name = lastSegment.replaceAll("[^A-Za-z0-9._-]", "_");
        if (name.isEmpty() || name.equals(".") || name.equals("..")) {
            name = "image_" + Integer.toHexString(imageUrl.hashCode());
        }
        return name;
    }

    /**
     * Schedules periodic monitoring of all active lots. The scheduler runs
     * every hour, starting immediately, and refreshes the current bid of each
     * active lot. For a lot that is within 30 minutes of closing, one extra
     * refresh is scheduled 5 minutes later. When a lot is within 5 minutes of
     * closing and no closing notification was sent yet, a notification is sent
     * through the configured notification service and the flag is persisted.
     *
     * <p>Calling this method while monitoring is already running has no
     * effect. The scheduler thread keeps running until
     * {@link #stopMonitoring()} is called.
     */
    public void scheduleMonitoring() {
        if (monitorScheduler != null && !monitorScheduler.isShutdown()) {
            return;
        }
        var scheduler = Executors.newScheduledThreadPool(1);
        monitorScheduler = scheduler;

        scheduler.scheduleAtFixedRate(() -> {
            try {
                var activeLots = db.getActiveLots();
                for (var lot : activeLots) {
                    // refresh the lot's bidding information via API
                    refreshLotBid(lot);

                    // check closing time to adjust monitoring
                    var minutesLeft = lot.minutesUntilClose();
                    if (minutesLeft < 30) {
                        // send warning when within 5 minutes
                        if (minutesLeft <= 5 && !lot.closingNotified) {
                            notifier.sendNotification(
                                "Kavel " + lot.lotId + " sluit binnen " + minutesLeft + " min.",
                                "Lot nearing closure", 1);
                            lot.closingNotified = true;
                            db.updateLotNotificationFlags(lot);
                        }
                        // schedule additional quick check for this lot
                        scheduler.schedule(() -> refreshLotBid(lot), 5, TimeUnit.MINUTES);
                    }
                }
            } catch (SQLException e) {
                System.err.println("Error during scheduled monitoring: " + e.getMessage());
            } catch (RuntimeException e) {
                // An exception escaping a scheduleAtFixedRate task cancels all
                // future runs, so unexpected failures are logged instead.
                System.err.println("Error during scheduled monitoring: " + e);
            }
        }, 0, 1, TimeUnit.HOURS);
    }

    /**
     * Stops the monitoring started by {@link #scheduleMonitoring()}, cancelling
     * pending checks. Does nothing when monitoring is not running.
     */
    public void stopMonitoring() {
        if (monitorScheduler != null) {
            monitorScheduler.shutdownNow();
            monitorScheduler = null;
        }
    }

    /**
     * Refreshes the bid for a single lot and sends a notification if the bid
     * is higher than the one currently stored on the lot. The method calls
     * the same API used for initial scraping but only extracts the current
     * bid for the given lot. Non-200 responses are ignored.
     *
     * @param lot the lot to refresh
     */
    private void refreshLotBid(Lot lot) {
        try {
            var url = LOT_API + "?batchSize=1&listType=7&offset=0&sortOption=0&saleID=" + lot.saleId
                + "&parentID=0&relationID=0&buildversion=201807311&lotID=" + lot.lotId;
            var request = HttpRequest.newBuilder().uri(URI.create(url)).GET().build();
            var response = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
            if (response.statusCode() != 200) {
                return;
            }

            var root = objectMapper.readTree(response.body());
            var results = root.path("results");
            if (results.isArray() && !results.isEmpty()) {
                var node = results.get(0);
                var newBid = node.path("cb").asDouble();
                if (Double.compare(newBid, lot.currentBid) > 0) {
                    var previous = lot.currentBid;
                    lot.currentBid = newBid;
                    db.updateLotCurrentBid(lot);
                    var msg = String.format("Nieuw bod op kavel %d: €%.2f (was €%.2f)",
                        lot.lotId, newBid, previous);
                    notifier.sendNotification(msg, "Kavel bieding update", 0);
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve the interrupt for the executor
            System.err.println("Failed to refresh bid for lot " + lot.lotId + ": " + e.getMessage());
        } catch (IOException | SQLException e) {
            System.err.println("Failed to refresh bid for lot " + lot.lotId + ": " + e.getMessage());
        }
    }

    /**
     * Prints statistics about the data in the database: the number of lots,
     * the number of images and, when there is at least one lot, the sum of
     * all current bids.
     */
    public void printDatabaseStats() {
        try {
            var allLots = db.getAllLots();
            var imageCount = db.getImageCount();

            IO.println(" Total lots in database: " + allLots.size());
            IO.println(" Total images downloaded: " + imageCount);

            if (!allLots.isEmpty()) {
                var totalBids = allLots.stream().mapToDouble(l -> l.currentBid).sum();
                IO.println(" Total current bids: €" + String.format("%.2f", totalBids));
            }
        } catch (SQLException e) {
            System.err.println(" ⚠️ Could not retrieve database stats: " + e.getMessage());
        }
    }

    // ----------------------------------------------------------------------
    // Domain classes and services
    // ----------------------------------------------------------------------
}