This commit is contained in:
2025-11-28 05:54:30 +01:00
parent f5ee240283
commit 0f5800441a
4 changed files with 268 additions and 65 deletions

BIN
cache/page_cache.db vendored

Binary file not shown.

View File

@@ -303,19 +303,24 @@ public class TroostwijkAuctionExtractor {
int hrefPos = html.indexOf(href); int hrefPos = html.indexOf(href);
if (hrefPos == -1) return "Unknown"; if (hrefPos == -1) return "Unknown";
// Look at 500 characters after the href for location info // Look at 1000 characters before AND after the href for location info
int endPos = Math.min(hrefPos + 500, html.length()); int startPos = Math.max(hrefPos - 500, 0);
String context = html.substring(hrefPos, endPos); int endPos = Math.min(hrefPos + 1000, html.length());
String context = html.substring(startPos, endPos);
// Try to find location pattern like "City, NL" or "City, Country" // Try to find location pattern like "City, NL" or "City, Country"
// More flexible pattern to catch various location formats
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile( java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
"([A-Za-z\\s]+),\\s*([A-Z]{2})"); "([A-Za-z][A-Za-z\\s\\-']+),\\s*([A-Z]{2})(?![A-Za-z])");
java.util.regex.Matcher locMatcher = locPattern.matcher(context); java.util.regex.Matcher locMatcher = locPattern.matcher(context);
if (locMatcher.find()) { if (locMatcher.find()) {
return locMatcher.group(1).trim() + ", " + locMatcher.group(2); String location = locMatcher.group(1).trim() + ", " + locMatcher.group(2);
System.out.println(" Found location: " + location + " for auction " + href);
return location;
} }
System.out.println(" ⚠️ No location found for auction " + href);
return "Unknown"; return "Unknown";
} }

View File

@@ -56,19 +56,28 @@ import java.sql.PreparedStatement;
import java.sql.ResultSet; import java.sql.ResultSet;
import java.sql.SQLException; import java.sql.SQLException;
import java.sql.Statement; import java.sql.Statement;
import java.time.Instant;
import java.time.LocalDateTime; import java.time.LocalDateTime;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import com.auction.TroostwijkAuctionExtractor.CacheDatabase;
import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.microsoft.playwright.Browser;
import com.microsoft.playwright.BrowserType;
import com.microsoft.playwright.Page;
import com.microsoft.playwright.Playwright;
import com.microsoft.playwright.options.WaitUntilState;
import net.bytebuddy.build.Plugin.Engine.Source.Element;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.opencv.core.Core; import org.opencv.core.Core;
import org.opencv.core.Mat; import org.opencv.core.Mat;
import org.opencv.core.Scalar; import org.opencv.core.Scalar;
@@ -76,6 +85,7 @@ import org.opencv.core.Size;
import org.opencv.dnn.Dnn; import org.opencv.dnn.Dnn;
import org.opencv.dnn.Net; import org.opencv.dnn.Net;
import org.opencv.imgcodecs.Imgcodecs; import org.opencv.imgcodecs.Imgcodecs;
import org.w3c.dom.Document;
import static org.opencv.dnn.Dnn.DNN_BACKEND_OPENCV; import static org.opencv.dnn.Dnn.DNN_BACKEND_OPENCV;
import static org.opencv.dnn.Dnn.DNN_TARGET_CPU; import static org.opencv.dnn.Dnn.DNN_TARGET_CPU;
@@ -84,17 +94,24 @@ import static org.opencv.dnn.Dnn.DNN_TARGET_CPU;
* persisting data, scheduling updates, and performing object detection. * persisting data, scheduling updates, and performing object detection.
*/ */
public class TroostwijkScraper { public class TroostwijkScraper {
// Base URLs adjust these if Troostwijk changes their site structure // Base URLs adjust these if Troostwijk changes their site structure
private static final String AUCTIONS_PAGE = "https://www.troostwijkauctions.com/nl/auctions"; private static final String AUCTIONS_PAGE = "https://www.troostwijkauctions.com/auctions";
private static final String LOT_API = "https://api.troostwijkauctions.com/lot/7/list"; private static final String LOT_API = "https://api.troostwijkauctions.com/lot/7/list";
private static final String CACHE_DB_PATH = "cache/page_cache.db";
private static final long CACHE_EXPIRATION_HOURS = 24;
private static final int RATE_LIMIT_MS = 200;
// HTTP client used for API calls // HTTP client used for API calls
private final HttpClient httpClient; private final HttpClient httpClient;
private final ObjectMapper objectMapper; private final ObjectMapper objectMapper;
public final DatabaseService db; public final DatabaseService db;
private final NotificationService notifier; private final NotificationService notifier;
private final ObjectDetectionService detector; private final ObjectDetectionService detector;
private final CacheDatabase cacheDb;
private final boolean useCache;
private Playwright playwright;
private Browser browser;
/** /**
* Constructor. Creates supporting services and ensures the database * Constructor. Creates supporting services and ensures the database
@@ -109,80 +126,256 @@ public class TroostwijkScraper {
*/ */
public TroostwijkScraper(String databasePath, String notificationConfig, String unused, public TroostwijkScraper(String databasePath, String notificationConfig, String unused,
String yoloCfgPath, String yoloWeightsPath, String classNamesPath) throws SQLException, IOException { String yoloCfgPath, String yoloWeightsPath, String classNamesPath) throws SQLException, IOException {
this(databasePath, notificationConfig, unused, yoloCfgPath, yoloWeightsPath, classNamesPath, true);
}
/**
* Constructor with cache control.
*
* @param databasePath Path to SQLite database file
* @param notificationConfig "desktop" for desktop only, or "smtp:user:pass:toEmail" for email
* @param unused Unused parameter (kept for compatibility)
* @param yoloCfgPath Path to YOLO configuration file
* @param yoloWeightsPath Path to YOLO weights file
* @param classNamesPath Path to file containing class names
* @param useCache Enable page caching
*/
public TroostwijkScraper(String databasePath, String notificationConfig, String unused,
String yoloCfgPath, String yoloWeightsPath, String classNamesPath,
boolean useCache) throws SQLException, IOException {
this.httpClient = HttpClient.newHttpClient(); this.httpClient = HttpClient.newHttpClient();
this.objectMapper = new ObjectMapper(); this.objectMapper = new ObjectMapper();
this.db = new DatabaseService(databasePath); this.db = new DatabaseService(databasePath);
this.notifier = new NotificationService(notificationConfig, unused); this.notifier = new NotificationService(notificationConfig, unused);
this.detector = new ObjectDetectionService(yoloCfgPath, yoloWeightsPath, classNamesPath); this.detector = new ObjectDetectionService(yoloCfgPath, yoloWeightsPath, classNamesPath);
this.useCache = useCache;
this.cacheDb = useCache ? new TroostwijkAuctionExtractor.CacheDatabase(CACHE_DB_PATH) : null;
// initialize DB // initialize DB
db.ensureSchema(); db.ensureSchema();
if (useCache) {
cacheDb.initialize();
}
}
/**
 * Starts the headless Chromium browser used to render JavaScript-heavy pages.
 *
 * <p>The call is a no-op when a browser is already available. The Playwright
 * instance and the browser are checked separately so that a failed
 * {@code launch()} (which leaves {@code playwright} set but {@code browser}
 * still {@code null}) can be retried by calling this method again instead of
 * being silently skipped.
 *
 * <p>{@link #discoverDutchAuctions()} calls this itself when no browser exists,
 * so calling it up front is optional.
 */
public void initializeBrowser() {
    if (browser != null) {
        return;
    }
    System.out.println("Initializing Playwright browser...");
    if (playwright == null) {
        // Reuse a Playwright instance left over from an earlier failed launch.
        this.playwright = Playwright.create();
    }
    this.browser = playwright.chromium().launch(new BrowserType.LaunchOptions()
            .setHeadless(true)
            .setArgs(Arrays.asList("--no-sandbox", "--disable-setuid-sandbox")));
    System.out.println("✓ Browser ready");
}
/**
 * Releases the resources held by this scraper: the browser, the Playwright
 * instance and the page cache database, in that order.
 *
 * <p>Each resource is skipped when it was never created. The browser and
 * Playwright fields are reset to {@code null} afterwards, so
 * {@link #initializeBrowser()} can start a fresh browser later.
 */
public void close() {
// Close the browser before the Playwright instance that launched it
if (browser != null) {
browser.close();
browser = null;
}
if (playwright != null) {
playwright.close();
playwright = null;
}
// cacheDb is final and therefore not reset; it is null only when caching is disabled.
// NOTE(review): a second close() call will invoke cacheDb.close() again - confirm CacheDatabase tolerates that
if (cacheDb != null) {
cacheDb.close();
}
} }
/** /**
* Discovers all active Dutch auctions by crawling the auctions page. * Discovers all active Dutch auctions by crawling the auctions page.
* *
* Troostwijk lists auctions for many countries on one page. We parse * Uses Playwright to render JavaScript-heavy pages and extract auction data.
* the page with jsoup and filter auctions whose location contains ", NL" * Supports caching to avoid unnecessary page fetches. Filters auctions whose
* (indicating the Netherlands). Each auction link contains a unique sale ID * location contains ", NL" (indicating the Netherlands). Each auction link
* in the format A1-xxxxx or A7-xxxxx which we extract from the URL. * contains a unique sale ID in the format A1-xxxxx or A7-xxxxx.
* *
* @return a list of sale identifiers for auctions located in NL * @return a list of sale identifiers for auctions located in NL
*/ */
public List<Integer> discoverDutchAuctions() { public List<Integer> discoverDutchAuctions() {
List<Integer> saleIds = new ArrayList<>(); Set<Integer> saleIds = new HashSet<>();
try {
// Fetch the auctions overview page // Check if browser is initialized
Document doc = Jsoup.connect(AUCTIONS_PAGE).get(); if (browser == null) {
initializeBrowser();
// Select all anchor elements that link to auction pages }
// The URL pattern is: /a/auction-title-A1-xxxxx or /a/auction-title-A7-xxxxx
Elements auctionLinks = doc.select("a[href^='/a/']"); int pageNumber = 1;
boolean hasMorePages = true;
System.out.println("Found " + auctionLinks.size() + " potential auction links");
System.out.println("Starting Dutch auction discovery from " + AUCTIONS_PAGE);
for (Element link : auctionLinks) {
// Get the href to extract the auction ID while (hasMorePages) {
String href = link.attr("href"); System.out.println("\n[Page " + pageNumber + "] Fetching auctions...");
// Check if this link contains location text with ", NL" // Check cache first
String linkText = link.text(); String html = loadFromCache(pageNumber);
// Look for location in any div inside the link if (html != null) {
Elements divs = link.select("div"); System.out.println(" ✓ Loaded from cache");
boolean isDutch = false; } else {
for (Element div : divs) { // Fetch with Playwright
String text = div.text(); html = fetchPageWithPlaywright(pageNumber);
if (text.contains(", NL")) {
isDutch = true; if (html == null || html.isEmpty()) {
break; System.out.println(" ⚠️ Failed to fetch page, stopping pagination");
} break;
} }
if (isDutch) { System.out.println(" ✓ Fetched from website");
// Extract auction ID from URL
// Format: /a/title-A1-38375 or /a/title-A7-12345 // Save to cache
// We want the number after A1- or A7- if (useCache) {
java.util.regex.Pattern pattern = java.util.regex.Pattern.compile("A[17]-(\\d+)"); saveToCache(pageNumber, html);
java.util.regex.Matcher matcher = pattern.matcher(href); }
if (matcher.find()) { // Rate limiting
try { try {
int saleId = Integer.parseInt(matcher.group(1)); Thread.sleep(RATE_LIMIT_MS);
if (!saleIds.contains(saleId)) { } catch (InterruptedException e) {
saleIds.add(saleId); Thread.currentThread().interrupt();
System.out.println(" Found Dutch auction: " + saleId + " - " + href); break;
}
} catch (NumberFormatException e) {
// Skip invalid IDs
}
}
} }
} }
} catch (IOException e) {
System.err.println("Failed to discover auctions: " + e.getMessage()); // Parse auctions from HTML
e.printStackTrace(); int foundOnPage = parseAuctionsFromHtml(html, saleIds);
if (foundOnPage == 0) {
System.out.println(" ⚠️ No Dutch auctions found on page, stopping pagination");
hasMorePages = false;
} else {
System.out.println(" ✓ Found " + foundOnPage + " Dutch auctions");
pageNumber++;
}
} }
return saleIds;
System.out.println("\n✓ Total Dutch auctions discovered: " + saleIds.size());
return new ArrayList<>(saleIds);
}
/**
 * Renders one page of the auctions overview in the Playwright browser and
 * returns the resulting HTML.
 *
 * <p>Page 1 is the bare overview URL; later pages append {@code ?page=N}.
 * Navigation waits for network idle (30 s timeout), then waits up to 10 s for
 * at least one auction link to appear. A missing auction link is logged but
 * not treated as a failure, so the HTML that did render is still returned.
 *
 * <p>The page is opened in a try-with-resources block so that it is closed
 * even when navigation or content retrieval throws; otherwise every failed
 * fetch would leave an open tab behind in the shared browser.
 *
 * @param pageNumber 1-based page index of the auctions overview
 * @return the rendered HTML, or {@code null} if Playwright reported any error
 */
private String fetchPageWithPlaywright(int pageNumber) {
    String url = pageNumber == 1
            ? AUCTIONS_PAGE
            : AUCTIONS_PAGE + "?page=" + pageNumber;

    try (Page page = browser.newPage()) {
        // Set user agent
        page.setExtraHTTPHeaders(Map.of(
                "User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        ));

        // Navigate to page
        page.navigate(url, new Page.NavigateOptions()
                .setTimeout(30000)
                .setWaitUntil(WaitUntilState.NETWORKIDLE));

        // Wait for auction listings to appear
        try {
            page.waitForSelector("a[href^='/a/']", new Page.WaitForSelectorOptions()
                    .setTimeout(10000));
        } catch (Exception e) {
            // Best effort: continue with whatever rendered, the caller decides
            // what to do with a page that contains no auctions.
            System.out.println(" ⚠️ Auction selector not found");
        }

        // Get HTML content; the page is closed automatically on the way out
        return page.content();
    } catch (Exception e) {
        System.err.println(" ⚠️ Playwright error: " + e.getMessage());
        return null;
    }
}
/**
 * Scans rendered HTML for auction links and records the Dutch ones.
 *
 * <p>A link qualifies when its {@code href} attribute starts with {@code /a/}
 * and contains {@code A1-} or {@code A7-} followed by digits; those digits
 * are the sale id. Ids already present in {@code saleIds} are skipped, so the
 * return value counts only auctions that are new to the set.
 *
 * <p>A digit run that does not fit in an {@code int} is skipped rather than
 * allowed to throw, so a single malformed link cannot abort the whole
 * discovery run.
 *
 * @param html    rendered HTML of one overview page
 * @param saleIds accumulator of sale ids found so far; newly found Dutch
 *                auctions are added to it
 * @return number of Dutch auctions added to {@code saleIds} from this page
 */
private int parseAuctionsFromHtml(String html, Set<Integer> saleIds) {
    int foundCount = 0;

    // Simple regex-based parsing for auction links
    java.util.regex.Pattern linkPattern = java.util.regex.Pattern.compile(
            "href=\"(/a/[^\"]+A[17]-(\\d+)[^\"]*)\"");
    java.util.regex.Matcher linkMatcher = linkPattern.matcher(html);

    while (linkMatcher.find()) {
        String href = linkMatcher.group(1);

        int auctionId;
        try {
            auctionId = Integer.parseInt(linkMatcher.group(2));
        } catch (NumberFormatException ignored) {
            // \d+ is unbounded, so the capture may overflow int; skip this link.
            continue;
        }

        // Avoid duplicates
        if (saleIds.contains(auctionId)) {
            continue;
        }

        // Check if this auction is Dutch (location contains ", NL")
        if (isDutchAuction(html, href)) {
            saleIds.add(auctionId);
            foundCount++;
            System.out.println(" Found Dutch auction: " + auctionId + " - " + href);
        }
    }

    return foundCount;
}
/**
 * Decides whether the auction behind {@code href} is located in the
 * Netherlands.
 *
 * <p>This is a text heuristic: it takes the first occurrence of {@code href}
 * in {@code html}, cuts a window from 500 characters before it to 1000
 * characters after it (clamped to the bounds of the document), and reports
 * whether that window contains the literal {@code ", NL"}.
 *
 * @param html rendered HTML of the overview page
 * @param href auction link to look up inside {@code html}
 * @return {@code true} if {@code ", NL"} appears in the window around the
 *         link; {@code false} if it does not or if the link is not in the HTML
 */
private boolean isDutchAuction(String html, String href) {
    final int anchorIndex = html.indexOf(href);
    if (anchorIndex < 0) {
        return false;
    }

    // Asymmetric window: 500 characters of lead-in, 1000 characters of trail.
    final int windowStart = Math.max(0, anchorIndex - 500);
    final int windowEnd = Math.min(html.length(), anchorIndex + 1000);

    final String surroundings = html.substring(windowStart, windowEnd);
    return surroundings.contains(", NL");
}
/**
 * Looks up the cached HTML of one auctions-overview page.
 *
 * <p>The cache key is the page URL: the bare overview URL for page 1 and the
 * overview URL with {@code ?page=N} appended for every other page.
 *
 * @param pageNumber 1-based page index of the auctions overview
 * @return whatever the cache database returns for that URL, or {@code null}
 *         when caching is disabled or no cache database exists
 */
private String loadFromCache(int pageNumber) {
    if (useCache && cacheDb != null) {
        String cacheKey = AUCTIONS_PAGE;
        if (pageNumber != 1) {
            cacheKey = AUCTIONS_PAGE + "?page=" + pageNumber;
        }
        return cacheDb.get(cacheKey);
    }
    return null;
}
/**
 * Stores the HTML of one auctions-overview page in the page cache.
 *
 * <p>The entry is keyed by the page URL and written with
 * {@code CACHE_EXPIRATION_HOURS} as its expiration argument.
 *
 * @param pageNumber 1-based page index of the auctions overview
 * @param html       rendered HTML to store
 */
private void saveToCache(int pageNumber, String html) {
// Nothing to do when caching is disabled or no cache database was created
if (!useCache || cacheDb == null) return;
// Cache key: bare overview URL for page 1, "?page=N" appended for all other pages
String url = pageNumber == 1
? AUCTIONS_PAGE
: AUCTIONS_PAGE + "?page=" + pageNumber;
cacheDb.put(url, html, CACHE_EXPIRATION_HOURS);
} }
/** /**

View File

@@ -55,6 +55,11 @@ public class TroostwijkScraperTest {
@AfterEach @AfterEach
public void tearDown() { public void tearDown() {
// Clean up browser and cache
if (scraper != null) {
scraper.close();
}
// Clean up test database // Clean up test database
File dbFile = new File(testDatabasePath); File dbFile = new File(testDatabasePath);
if (dbFile.exists()) { if (dbFile.exists()) {