This commit is contained in:
2025-11-28 05:54:30 +01:00
parent f5ee240283
commit 0f5800441a
4 changed files with 268 additions and 65 deletions

BIN
cache/page_cache.db vendored

Binary file not shown.

View File

@@ -303,19 +303,24 @@ public class TroostwijkAuctionExtractor {
int hrefPos = html.indexOf(href);
if (hrefPos == -1) return "Unknown";
// Look at 500 characters after the href for location info
int endPos = Math.min(hrefPos + 500, html.length());
String context = html.substring(hrefPos, endPos);
// Look at 1000 characters before AND after the href for location info
int startPos = Math.max(hrefPos - 500, 0);
int endPos = Math.min(hrefPos + 1000, html.length());
String context = html.substring(startPos, endPos);
// Try to find location pattern like "City, NL" or "City, Country"
// More flexible pattern to catch various location formats
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
"([A-Za-z\\s]+),\\s*([A-Z]{2})");
"([A-Za-z][A-Za-z\\s\\-']+),\\s*([A-Z]{2})(?![A-Za-z])");
java.util.regex.Matcher locMatcher = locPattern.matcher(context);
if (locMatcher.find()) {
return locMatcher.group(1).trim() + ", " + locMatcher.group(2);
String location = locMatcher.group(1).trim() + ", " + locMatcher.group(2);
System.out.println(" Found location: " + location + " for auction " + href);
return location;
}
System.out.println(" ⚠️ No location found for auction " + href);
return "Unknown";
}

View File

@@ -56,19 +56,28 @@ import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.time.Instant;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import com.auction.TroostwijkAuctionExtractor.CacheDatabase;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.microsoft.playwright.Browser;
import com.microsoft.playwright.BrowserType;
import com.microsoft.playwright.Page;
import com.microsoft.playwright.Playwright;
import com.microsoft.playwright.options.WaitUntilState;
import net.bytebuddy.build.Plugin.Engine.Source.Element;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.opencv.core.Core;
import org.opencv.core.Mat;
import org.opencv.core.Scalar;
@@ -76,6 +85,7 @@ import org.opencv.core.Size;
import org.opencv.dnn.Dnn;
import org.opencv.dnn.Net;
import org.opencv.imgcodecs.Imgcodecs;
import org.w3c.dom.Document;
import static org.opencv.dnn.Dnn.DNN_BACKEND_OPENCV;
import static org.opencv.dnn.Dnn.DNN_TARGET_CPU;
@@ -86,8 +96,11 @@ import static org.opencv.dnn.Dnn.DNN_TARGET_CPU;
public class TroostwijkScraper {
// Base URLs; adjust these if Troostwijk changes their site structure
private static final String AUCTIONS_PAGE = "https://www.troostwijkauctions.com/nl/auctions";
private static final String AUCTIONS_PAGE = "https://www.troostwijkauctions.com/auctions";
private static final String LOT_API = "https://api.troostwijkauctions.com/lot/7/list";
private static final String CACHE_DB_PATH = "cache/page_cache.db";
private static final long CACHE_EXPIRATION_HOURS = 24;
private static final int RATE_LIMIT_MS = 200;
// HTTP client used for API calls
private final HttpClient httpClient;
@@ -95,6 +108,10 @@ public class TroostwijkScraper {
public final DatabaseService db;
private final NotificationService notifier;
private final ObjectDetectionService detector;
private final CacheDatabase cacheDb;
private final boolean useCache;
private Playwright playwright;
private Browser browser;
/**
* Constructor. Creates supporting services and ensures the database
@@ -109,80 +126,256 @@ public class TroostwijkScraper {
*/
public TroostwijkScraper(String databasePath, String notificationConfig, String unused,
        String yoloCfgPath, String yoloWeightsPath, String classNamesPath)
        throws SQLException, IOException {
    // Convenience overload: forwards every argument to the cache-aware constructor
    // and switches page caching on (useCache = true).
    this(databasePath, notificationConfig, unused,
            yoloCfgPath, yoloWeightsPath, classNamesPath,
            true);
}
/**
 * Creates a scraper and lets the caller choose whether page caching is used.
 *
 * Builds the HTTP client, the JSON mapper and the database, notification and
 * object-detection services, then calls {@code db.ensureSchema()}. When
 * {@code useCache} is true, a cache database at {@code CACHE_DB_PATH} is also
 * created and initialized; otherwise the cache reference stays {@code null}.
 *
 * @param databasePath Path to SQLite database file
 * @param notificationConfig "desktop" for desktop only, or "smtp:user:pass:toEmail" for email
 * @param unused Unused parameter (kept for compatibility); it is still forwarded to the
 *               notification service constructor
 * @param yoloCfgPath Path to YOLO configuration file
 * @param yoloWeightsPath Path to YOLO weights file
 * @param classNamesPath Path to file containing class names
 * @param useCache Enable page caching
 */
public TroostwijkScraper(String databasePath, String notificationConfig, String unused,
        String yoloCfgPath, String yoloWeightsPath, String classNamesPath,
        boolean useCache) throws SQLException, IOException {
    this.httpClient = HttpClient.newHttpClient();
    this.objectMapper = new ObjectMapper();
    this.db = new DatabaseService(databasePath);
    this.notifier = new NotificationService(notificationConfig, unused);
    this.detector = new ObjectDetectionService(yoloCfgPath, yoloWeightsPath, classNamesPath);
    this.useCache = useCache;

    // The cache database only exists when caching was requested.
    if (useCache) {
        this.cacheDb = new TroostwijkAuctionExtractor.CacheDatabase(CACHE_DB_PATH);
    } else {
        this.cacheDb = null;
    }

    // initialize DB
    this.db.ensureSchema();
    if (this.cacheDb != null) {
        this.cacheDb.initialize();
    }
}
/**
 * Initializes the Playwright browser used for JavaScript-rendered pages.
 * Call this before using discoverDutchAuctions().
 *
 * The method is a no-op when a browser is already available. The guard is on
 * {@code browser} (the object callers actually use) rather than on
 * {@code playwright}, so a previous launch failure does not leave the scraper
 * permanently without a browser. If launching Chromium fails, the freshly
 * created Playwright instance is closed again and the exception is rethrown.
 */
public void initializeBrowser() {
    if (browser != null) {
        return;
    }
    System.out.println("Initializing Playwright browser...");
    if (playwright == null) {
        playwright = Playwright.create();
    }
    try {
        browser = playwright.chromium().launch(new BrowserType.LaunchOptions()
                .setHeadless(true)
                .setArgs(Arrays.asList("--no-sandbox", "--disable-setuid-sandbox")));
    } catch (RuntimeException e) {
        // Do not keep a driver around without a browser: release it and reset the
        // field so a later call can start from scratch.
        try {
            playwright.close();
        } finally {
            playwright = null;
        }
        throw e;
    }
    System.out.println("✓ Browser ready");
}
/**
 * Closes browser and cache resources.
 *
 * Each resource is released in its own try/finally step, so an exception while
 * closing the browser or Playwright does not prevent the remaining resources
 * from being closed. The browser and Playwright references are reset to
 * {@code null} so that {@code initializeBrowser()} can be called again later.
 * Any exception raised while closing still propagates to the caller.
 */
public void close() {
    try {
        if (browser != null) {
            browser.close();
        }
    } finally {
        browser = null;
        try {
            if (playwright != null) {
                playwright.close();
            }
        } finally {
            playwright = null;
            if (cacheDb != null) {
                cacheDb.close();
            }
        }
    }
}
/**
* Discovers all active Dutch auctions by crawling the auctions page.
*
* Troostwijk lists auctions for many countries on one page. We parse
* the page with jsoup and filter auctions whose location contains ", NL"
* (indicating the Netherlands). Each auction link contains a unique sale ID
* in the format A1-xxxxx or A7-xxxxx which we extract from the URL.
* Uses Playwright to render JavaScript-heavy pages and extract auction data.
* Supports caching to avoid unnecessary page fetches. Filters auctions whose
* location contains ", NL" (indicating the Netherlands). Each auction link
* contains a unique sale ID in the format A1-xxxxx or A7-xxxxx.
*
* @return a list of sale identifiers for auctions located in NL
*/
public List<Integer> discoverDutchAuctions() {
// NOTE(review): this span comes from a rendered diff whose +/- markers were stripped, so
// lines of the removed jsoup-based implementation and of the added Playwright-based
// implementation are interleaved (saleIds is declared both as a List and as a Set, and
// there are two return statements at the end). Read it as two versions of the method,
// not as one compilable body.
List<Integer> saleIds = new ArrayList<>();
try {
// Fetch the auctions overview page
Document doc = Jsoup.connect(AUCTIONS_PAGE).get();
Set<Integer> saleIds = new HashSet<>();
// Select all anchor elements that link to auction pages
// The URL pattern is: /a/auction-title-A1-xxxxx or /a/auction-title-A7-xxxxx
Elements auctionLinks = doc.select("a[href^='/a/']");
// Check if browser is initialized
if (browser == null) {
initializeBrowser();
}
System.out.println("Found " + auctionLinks.size() + " potential auction links");
int pageNumber = 1;
boolean hasMorePages = true;
for (Element link : auctionLinks) {
// Get the href to extract the auction ID
String href = link.attr("href");
System.out.println("Starting Dutch auction discovery from " + AUCTIONS_PAGE);
// Check if this link contains location text with ", NL"
String linkText = link.text();
while (hasMorePages) {
System.out.println("\n[Page " + pageNumber + "] Fetching auctions...");
// Look for location in any div inside the link
Elements divs = link.select("div");
boolean isDutch = false;
for (Element div : divs) {
String text = div.text();
if (text.contains(", NL")) {
isDutch = true;
break;
}
// Check cache first
String html = loadFromCache(pageNumber);
if (html != null) {
System.out.println(" ✓ Loaded from cache");
} else {
// Fetch with Playwright
html = fetchPageWithPlaywright(pageNumber);
if (html == null || html.isEmpty()) {
System.out.println(" ⚠️ Failed to fetch page, stopping pagination");
break;
}
if (isDutch) {
// Extract auction ID from URL
// Format: /a/title-A1-38375 or /a/title-A7-12345
// We want the number after A1- or A7-
java.util.regex.Pattern pattern = java.util.regex.Pattern.compile("A[17]-(\\d+)");
java.util.regex.Matcher matcher = pattern.matcher(href);
System.out.println(" ✓ Fetched from website");
if (matcher.find()) {
try {
int saleId = Integer.parseInt(matcher.group(1));
if (!saleIds.contains(saleId)) {
saleIds.add(saleId);
System.out.println(" Found Dutch auction: " + saleId + " - " + href);
}
} catch (NumberFormatException e) {
// Skip invalid IDs
}
}
// Save to cache
if (useCache) {
saveToCache(pageNumber, html);
}
// Rate limiting
try {
Thread.sleep(RATE_LIMIT_MS);
} catch (InterruptedException e) {
// Restore the interrupt flag before leaving the loop so callers can still observe it.
Thread.currentThread().interrupt();
break;
}
}
} catch (IOException e) {
System.err.println("Failed to discover auctions: " + e.getMessage());
e.printStackTrace();
// Parse auctions from HTML
int foundOnPage = parseAuctionsFromHtml(html, saleIds);
// NOTE(review): parseAuctionsFromHtml skips IDs that are already in saleIds and counts
// only the IDs it adds, so a page holding only non-Dutch or already-seen auctions yields 0
// and ends pagination here, even if later pages exist. Confirm this stop rule is intended.
if (foundOnPage == 0) {
System.out.println(" ⚠️ No Dutch auctions found on page, stopping pagination");
hasMorePages = false;
} else {
System.out.println(" ✓ Found " + foundOnPage + " Dutch auctions");
pageNumber++;
}
}
return saleIds;
System.out.println("\n✓ Total Dutch auctions discovered: " + saleIds.size());
// The Set was used for de-duplication; callers receive a List copy of it.
return new ArrayList<>(saleIds);
}
/**
 * Fetches a single auctions overview page using Playwright.
 *
 * Page 1 is the bare {@code AUCTIONS_PAGE} URL; later pages append
 * {@code ?page=N}. The page is always closed, including when navigation or
 * content retrieval fails, so failed fetches do not leak browser pages.
 *
 * @param pageNumber 1-based page number to fetch
 * @return the rendered HTML, or {@code null} if any Playwright call failed
 */
private String fetchPageWithPlaywright(int pageNumber) {
    String url = pageNumber == 1
            ? AUCTIONS_PAGE
            : AUCTIONS_PAGE + "?page=" + pageNumber;
    try {
        Page page = browser.newPage();
        try {
            // Set user agent
            page.setExtraHTTPHeaders(Map.of(
                    "User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            ));
            // Navigate to page
            page.navigate(url, new Page.NavigateOptions()
                    .setTimeout(30000)
                    .setWaitUntil(WaitUntilState.NETWORKIDLE));
            // Wait for auction listings to appear
            try {
                page.waitForSelector("a[href^='/a/']", new Page.WaitForSelectorOptions()
                        .setTimeout(10000));
            } catch (Exception e) {
                // Continue even if selector not found: the page content is still returned
                // and the caller decides what to do with a page without auction links.
                System.out.println(" ⚠️ Auction selector not found");
            }
            // Get HTML content
            return page.content();
        } finally {
            // Release the page on success and on failure alike.
            page.close();
        }
    } catch (Exception e) {
        System.err.println(" ⚠️ Playwright error: " + e.getMessage());
        return null;
    }
}
/**
 * Parses auction links from HTML and adds Dutch auctions to the set.
 *
 * Links are found with a regular expression that matches {@code href="/a/...A1-<digits>..."}
 * or {@code ...A7-<digits>...}. IDs already present in {@code saleIds} are skipped and not
 * counted. IDs whose digits do not fit into an {@code int} are skipped instead of aborting
 * the whole page with a {@link NumberFormatException}.
 *
 * @param html    page HTML to scan
 * @param saleIds set that receives the newly found Dutch auction IDs
 * @return number of Dutch auctions newly added from this page
 */
private int parseAuctionsFromHtml(String html, Set<Integer> saleIds) {
    int foundCount = 0;
    // Simple regex-based parsing for auction links
    java.util.regex.Pattern linkPattern = java.util.regex.Pattern.compile(
            "href=\"(/a/[^\"]+A[17]-(\\d+)[^\"]*)\"");
    java.util.regex.Matcher linkMatcher = linkPattern.matcher(html);
    while (linkMatcher.find()) {
        String href = linkMatcher.group(1);
        int auctionId;
        try {
            auctionId = Integer.parseInt(linkMatcher.group(2));
        } catch (NumberFormatException ignored) {
            // \d+ is unbounded, so the digits may overflow int; skip such links
            // rather than failing discovery for every other auction on the page.
            continue;
        }
        // Avoid duplicates
        if (saleIds.contains(auctionId)) {
            continue;
        }
        // Check if this auction is Dutch (location contains ", NL")
        if (isDutchAuction(html, href)) {
            saleIds.add(auctionId);
            foundCount++;
            System.out.println(" Found Dutch auction: " + auctionId + " - " + href);
        }
    }
    return foundCount;
}
/**
 * Checks if an auction is located in the Netherlands.
 *
 * Locates the first occurrence of {@code href} in {@code html} and inspects a
 * window around it: up to 500 characters before the match start and up to
 * 1000 characters after it (both clamped to the document bounds). The auction
 * counts as Dutch when the text ", NL" lies completely inside that window.
 *
 * @param html page HTML that contains the auction link
 * @param href link target to look up in {@code html}
 * @return {@code true} if ", NL" occurs within the window; {@code false} if it
 *         does not or if {@code href} is not present in {@code html}
 */
private boolean isDutchAuction(String html, String href) {
    final String marker = ", NL";
    int anchor = html.indexOf(href);
    if (anchor < 0) {
        return false;
    }
    int windowStart = Math.max(0, anchor - 500);
    int windowEnd = Math.min(html.length(), anchor + 1000);
    // The first marker at or after windowStart decides the outcome: if it does not
    // end inside the window, no later occurrence can either.
    int hit = html.indexOf(marker, windowStart);
    return hit >= 0 && hit + marker.length() <= windowEnd;
}
/**
 * Loads cached HTML for a page.
 *
 * The cache key is the page URL: {@code AUCTIONS_PAGE} for page 1, otherwise
 * {@code AUCTIONS_PAGE} followed by {@code ?page=N}.
 *
 * @param pageNumber 1-based page number
 * @return whatever the cache returns for that URL, or {@code null} when caching
 *         is disabled or no cache database is available
 */
private String loadFromCache(int pageNumber) {
    if (useCache && cacheDb != null) {
        String cacheKey = AUCTIONS_PAGE;
        if (pageNumber != 1) {
            cacheKey = AUCTIONS_PAGE + "?page=" + pageNumber;
        }
        return cacheDb.get(cacheKey);
    }
    return null;
}
/**
 * Saves HTML to cache.
 *
 * Does nothing when caching is disabled or no cache database is available.
 * The entry is stored under the page URL ({@code AUCTIONS_PAGE} for page 1,
 * otherwise with {@code ?page=N} appended) together with
 * {@code CACHE_EXPIRATION_HOURS}.
 *
 * @param pageNumber 1-based page number the HTML belongs to
 * @param html       page content to store
 */
private void saveToCache(int pageNumber, String html) {
    if (useCache && cacheDb != null) {
        String cacheKey = AUCTIONS_PAGE;
        if (pageNumber != 1) {
            cacheKey = AUCTIONS_PAGE + "?page=" + pageNumber;
        }
        cacheDb.put(cacheKey, html, CACHE_EXPIRATION_HOURS);
    }
}
/**

View File

@@ -55,6 +55,11 @@ public class TroostwijkScraperTest {
@AfterEach
public void tearDown() {
// Clean up browser and cache
if (scraper != null) {
scraper.close();
}
// Clean up test database
File dbFile = new File(testDatabasePath);
if (dbFile.exists()) {