This commit is contained in:
2025-11-28 05:16:51 +01:00
parent ec2efd4661
commit b560240c17
6 changed files with 307 additions and 3 deletions

View File

@@ -53,6 +53,8 @@ public class TroostwijkAuctionExtractor {
private final ObjectMapper objectMapper;
private final boolean useCache;
private final CacheDatabase cacheDb;
// Upper bound on pages actually fetched from the website; pages served from the
// cache are not checked against it. Any value <= 0 disables the limit.
private final int maxPageVisits;
// Number of live fetch attempts made so far (cache hits excluded). Incremented right
// after each fetch call, before the result is validated, so a fetch that returns
// null/empty HTML is counted as well.
private int pageVisitCount;
private Playwright playwright;
private Browser browser;
@@ -77,10 +79,13 @@ public class TroostwijkAuctionExtractor {
* Constructor
*
* @param useCache Enable database caching of visited pages
* @param maxPageVisits Maximum number of actual page fetches (0 = unlimited)
*/
public TroostwijkAuctionExtractor(boolean useCache) throws SQLException, IOException {
public TroostwijkAuctionExtractor(boolean useCache, int maxPageVisits) throws SQLException, IOException {
this.objectMapper = new ObjectMapper();
this.useCache = useCache;
this.maxPageVisits = maxPageVisits;
this.pageVisitCount = 0;
this.cacheDb = useCache ? new CacheDatabase(CACHE_DB_PATH) : null;
if (useCache) {
@@ -88,6 +93,15 @@ public class TroostwijkAuctionExtractor {
}
}
/**
* Creates an extractor with no limit on the number of page visits.
*
* <p>Delegates to {@link #TroostwijkAuctionExtractor(boolean, int)} with
* {@code maxPageVisits = 0}, the value that constructor documents as unlimited.
*
* @param useCache Enable database caching of visited pages
* @throws SQLException propagated from the two-argument constructor
* @throws IOException propagated from the two-argument constructor
*/
public TroostwijkAuctionExtractor(boolean useCache) throws SQLException, IOException {
this(useCache, 0); // 0 = unlimited
}
/**
* Initializes Playwright and browser instance
* Call this before extracting auctions
@@ -145,14 +159,24 @@ public class TroostwijkAuctionExtractor {
System.out.println(" ✓ Loaded from cache");
html = cachedHtml;
} else {
// Check if we've reached the maximum page visit limit
if (maxPageVisits > 0 && pageVisitCount >= maxPageVisits) {
System.out.println(" ⚠️ Reached maximum page visit limit (" + maxPageVisits + "), stopping");
break;
}
// Fetch with Playwright
html = fetchPageWithPlaywright(pageNumber);
pageVisitCount++; // Increment actual page fetch counter
if (html == null || html.isEmpty()) {
System.out.println(" ⚠️ Failed to fetch page, stopping pagination");
break;
}
System.out.println(" ✓ Fetched from website (visit " + pageVisitCount +
(maxPageVisits > 0 ? "/" + maxPageVisits : "") + ")");
// Save to cache
if (useCache) {
saveToCache(pageNumber, html);
@@ -371,13 +395,41 @@ public class TroostwijkAuctionExtractor {
/**
* Entry point for testing
*
* Arguments:
* --max-visits <number> : Maximum number of actual page fetches; pages loaded from cache are not counted (0 = unlimited, default)
* --no-cache : Disable caching
* --help : Print usage information and exit
*/
public static void main(String[] args) throws Exception {
System.out.println("=== Troostwijk Auction Extractor ===\n");
// Enable caching by default
// Parse command line arguments
boolean useCache = true;
TroostwijkAuctionExtractor extractor = new TroostwijkAuctionExtractor(useCache);
int maxVisits = 0; // 0 = unlimited
for (int i = 0; i < args.length; i++) {
switch (args[i]) {
case "--max-visits":
if (i + 1 < args.length) {
maxVisits = Integer.parseInt(args[++i]);
System.out.println("Max page visits set to: " + maxVisits);
}
break;
case "--no-cache":
useCache = false;
System.out.println("Caching disabled");
break;
case "--help":
System.out.println("Usage: java TroostwijkAuctionExtractor [options]");
System.out.println("Options:");
System.out.println(" --max-visits <n> : Limit actual page fetches to n (0 = unlimited)");
System.out.println(" --no-cache : Disable page caching");
System.out.println(" --help : Show this help message");
return;
}
}
TroostwijkAuctionExtractor extractor = new TroostwijkAuctionExtractor(useCache, maxVisits);
try {
// Initialize browser
@@ -392,6 +444,7 @@ public class TroostwijkAuctionExtractor {
System.out.println("\n=== Results ===");
System.out.println("Total auctions found: " + allAuctions.size());
System.out.println("Dutch auctions (NL): " + dutchAuctions.size());
System.out.println("Actual page visits: " + extractor.pageVisitCount);
// Display first 10 Dutch auctions
System.out.println("\n=== Sample Dutch Auctions ===");