diff --git a/cache/page_cache.db b/cache/page_cache.db index 1631f99..62a5c7e 100644 Binary files a/cache/page_cache.db and b/cache/page_cache.db differ diff --git a/src/main/java/com/auction/TroostwijkAuctionExtractor.java b/src/main/java/com/auction/TroostwijkAuctionExtractor.java index 8045f3d..7891615 100644 --- a/src/main/java/com/auction/TroostwijkAuctionExtractor.java +++ b/src/main/java/com/auction/TroostwijkAuctionExtractor.java @@ -303,19 +303,24 @@ public class TroostwijkAuctionExtractor { int hrefPos = html.indexOf(href); if (hrefPos == -1) return "Unknown"; - // Look at 500 characters after the href for location info - int endPos = Math.min(hrefPos + 500, html.length()); - String context = html.substring(hrefPos, endPos); + // Look at 500 characters before and 1000 characters after the href for location info + int startPos = Math.max(hrefPos - 500, 0); + int endPos = Math.min(hrefPos + 1000, html.length()); + String context = html.substring(startPos, endPos); // Try to find location pattern like "City, NL" or "City, Country" + // More flexible pattern to catch various location formats java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile( - "([A-Za-z\\s]+),\\s*([A-Z]{2})"); + "([A-Za-z][A-Za-z\\s\\-']+),\\s*([A-Z]{2})(?![A-Za-z])"); java.util.regex.Matcher locMatcher = locPattern.matcher(context); if (locMatcher.find()) { - return locMatcher.group(1).trim() + ", " + locMatcher.group(2); + String location = locMatcher.group(1).trim() + ", " + locMatcher.group(2); + System.out.println(" Found location: " + location + " for auction " + href); + return location; } + System.out.println(" ⚠️ No location found for auction " + href); return "Unknown"; } diff --git a/src/main/java/com/auction/TroostwijkScraper.java b/src/main/java/com/auction/TroostwijkScraper.java index bff5531..59fa79c 100644 --- a/src/main/java/com/auction/TroostwijkScraper.java +++ b/src/main/java/com/auction/TroostwijkScraper.java @@ -56,19 +56,28 @@ import java.sql.PreparedStatement; 
import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; +import java.time.Instant; import java.time.LocalDateTime; import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; import java.util.List; +import java.util.Map; +import java.util.Set; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; +import com.auction.TroostwijkAuctionExtractor.CacheDatabase; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import com.microsoft.playwright.Browser; +import com.microsoft.playwright.BrowserType; +import com.microsoft.playwright.Page; +import com.microsoft.playwright.Playwright; +import com.microsoft.playwright.options.WaitUntilState; +import net.bytebuddy.build.Plugin.Engine.Source.Element; import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; import org.opencv.core.Core; import org.opencv.core.Mat; import org.opencv.core.Scalar; @@ -76,6 +85,7 @@ import org.opencv.core.Size; import org.opencv.dnn.Dnn; import org.opencv.dnn.Net; import org.opencv.imgcodecs.Imgcodecs; +import org.w3c.dom.Document; import static org.opencv.dnn.Dnn.DNN_BACKEND_OPENCV; import static org.opencv.dnn.Dnn.DNN_TARGET_CPU; @@ -84,17 +94,24 @@ import static org.opencv.dnn.Dnn.DNN_TARGET_CPU; * persisting data, scheduling updates, and performing object detection. 
*/ public class TroostwijkScraper { - + // Base URLs – adjust these if Troostwijk changes their site structure - private static final String AUCTIONS_PAGE = "https://www.troostwijkauctions.com/nl/auctions"; + private static final String AUCTIONS_PAGE = "https://www.troostwijkauctions.com/auctions"; private static final String LOT_API = "https://api.troostwijkauctions.com/lot/7/list"; - + private static final String CACHE_DB_PATH = "cache/page_cache.db"; + private static final long CACHE_EXPIRATION_HOURS = 24; + private static final int RATE_LIMIT_MS = 200; + // HTTP client used for API calls private final HttpClient httpClient; private final ObjectMapper objectMapper; public final DatabaseService db; private final NotificationService notifier; private final ObjectDetectionService detector; + private final CacheDatabase cacheDb; + private final boolean useCache; + private Playwright playwright; + private Browser browser; /** * Constructor. Creates supporting services and ensures the database @@ -109,80 +126,256 @@ public class TroostwijkScraper { */ public TroostwijkScraper(String databasePath, String notificationConfig, String unused, String yoloCfgPath, String yoloWeightsPath, String classNamesPath) throws SQLException, IOException { + this(databasePath, notificationConfig, unused, yoloCfgPath, yoloWeightsPath, classNamesPath, true); + } + + /** + * Constructor with cache control. 
+ * + * @param databasePath Path to SQLite database file + * @param notificationConfig "desktop" for desktop only, or "smtp:user:pass:toEmail" for email + * @param unused Unused parameter (kept for compatibility) + * @param yoloCfgPath Path to YOLO configuration file + * @param yoloWeightsPath Path to YOLO weights file + * @param classNamesPath Path to file containing class names + * @param useCache Enable page caching + */ + public TroostwijkScraper(String databasePath, String notificationConfig, String unused, + String yoloCfgPath, String yoloWeightsPath, String classNamesPath, + boolean useCache) throws SQLException, IOException { this.httpClient = HttpClient.newHttpClient(); this.objectMapper = new ObjectMapper(); this.db = new DatabaseService(databasePath); this.notifier = new NotificationService(notificationConfig, unused); this.detector = new ObjectDetectionService(yoloCfgPath, yoloWeightsPath, classNamesPath); + this.useCache = useCache; + this.cacheDb = useCache ? new TroostwijkAuctionExtractor.CacheDatabase(CACHE_DB_PATH) : null; + // initialize DB db.ensureSchema(); + if (useCache) { + cacheDb.initialize(); + } + } + + /** + * Initializes Playwright browser for JavaScript-rendered pages. + * Optional: discoverDutchAuctions() calls this itself when no browser is open. + */ + public void initializeBrowser() { + if (playwright == null) { + System.out.println("Initializing Playwright browser..."); + this.playwright = Playwright.create(); + this.browser = playwright.chromium().launch(new BrowserType.LaunchOptions() + .setHeadless(true) + .setArgs(Arrays.asList("--no-sandbox", "--disable-setuid-sandbox"))); + System.out.println("✓ Browser ready"); + } + } + + /** + * Closes browser and cache resources. 
+ */ + public void close() { + if (browser != null) { + browser.close(); + browser = null; + } + if (playwright != null) { + playwright.close(); + playwright = null; + } + if (cacheDb != null) { + cacheDb.close(); + } } /** * Discovers all active Dutch auctions by crawling the auctions page. * - * Troostwijk lists auctions for many countries on one page. We parse - * the page with jsoup and filter auctions whose location contains ", NL" - * (indicating the Netherlands). Each auction link contains a unique sale ID - * in the format A1-xxxxx or A7-xxxxx which we extract from the URL. + * Uses Playwright to render JavaScript-heavy pages and extract auction data. + * Supports caching to avoid unnecessary page fetches. Filters auctions whose + * location contains ", NL" (indicating the Netherlands). Each auction link + * contains a unique sale ID in the format A1-xxxxx or A7-xxxxx. * * @return a list of sale identifiers for auctions located in NL */ public List discoverDutchAuctions() { - List saleIds = new ArrayList<>(); - try { - // Fetch the auctions overview page - Document doc = Jsoup.connect(AUCTIONS_PAGE).get(); - - // Select all anchor elements that link to auction pages - // The URL pattern is: /a/auction-title-A1-xxxxx or /a/auction-title-A7-xxxxx - Elements auctionLinks = doc.select("a[href^='/a/']"); - - System.out.println("Found " + auctionLinks.size() + " potential auction links"); - - for (Element link : auctionLinks) { - // Get the href to extract the auction ID - String href = link.attr("href"); - - // Check if this link contains location text with ", NL" - String linkText = link.text(); - - // Look for location in any div inside the link - Elements divs = link.select("div"); - boolean isDutch = false; - for (Element div : divs) { - String text = div.text(); - if (text.contains(", NL")) { - isDutch = true; - break; - } + Set saleIds = new HashSet<>(); + + // Check if browser is initialized + if (browser == null) { + initializeBrowser(); + } + + int 
pageNumber = 1; + boolean hasMorePages = true; + + System.out.println("Starting Dutch auction discovery from " + AUCTIONS_PAGE); + + while (hasMorePages) { + System.out.println("\n[Page " + pageNumber + "] Fetching auctions..."); + + // Check cache first + String html = loadFromCache(pageNumber); + + if (html != null) { + System.out.println(" ✓ Loaded from cache"); + } else { + // Fetch with Playwright + html = fetchPageWithPlaywright(pageNumber); + + if (html == null || html.isEmpty()) { + System.out.println(" ⚠️ Failed to fetch page, stopping pagination"); + break; } - - if (isDutch) { - // Extract auction ID from URL - // Format: /a/title-A1-38375 or /a/title-A7-12345 - // We want the number after A1- or A7- - java.util.regex.Pattern pattern = java.util.regex.Pattern.compile("A[17]-(\\d+)"); - java.util.regex.Matcher matcher = pattern.matcher(href); - - if (matcher.find()) { - try { - int saleId = Integer.parseInt(matcher.group(1)); - if (!saleIds.contains(saleId)) { - saleIds.add(saleId); - System.out.println(" Found Dutch auction: " + saleId + " - " + href); - } - } catch (NumberFormatException e) { - // Skip invalid IDs - } - } + + System.out.println(" ✓ Fetched from website"); + + // Save to cache + if (useCache) { + saveToCache(pageNumber, html); + } + + // Rate limiting + try { + Thread.sleep(RATE_LIMIT_MS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + break; } } - } catch (IOException e) { - System.err.println("Failed to discover auctions: " + e.getMessage()); - e.printStackTrace(); + + // Parse auctions from HTML + int foundOnPage = parseAuctionsFromHtml(html, saleIds); + + if (foundOnPage == 0) { + System.out.println(" ⚠️ No Dutch auctions found on page, stopping pagination"); + hasMorePages = false; + } else { + System.out.println(" ✓ Found " + foundOnPage + " Dutch auctions"); + pageNumber++; + } } - return saleIds; + + System.out.println("\n✓ Total Dutch auctions discovered: " + saleIds.size()); + return new 
ArrayList<>(saleIds); + } + + /** + * Fetches a single page using Playwright + */ + private String fetchPageWithPlaywright(int pageNumber) { + String url = pageNumber == 1 + ? AUCTIONS_PAGE + : AUCTIONS_PAGE + "?page=" + pageNumber; + + try { + Page page = browser.newPage(); + + // Set user agent + page.setExtraHTTPHeaders(Map.of( + "User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + )); + + // Navigate to page + page.navigate(url, new Page.NavigateOptions() + .setTimeout(30000) + .setWaitUntil(WaitUntilState.NETWORKIDLE)); + + // Wait for auction listings to appear + try { + page.waitForSelector("a[href^='/a/']", new Page.WaitForSelectorOptions() + .setTimeout(10000)); + } catch (Exception e) { + // Continue even if selector not found + System.out.println(" ⚠️ Auction selector not found"); + } + + // Get HTML content + String html = page.content(); + page.close(); + + return html; + + } catch (Exception e) { + System.err.println(" ⚠️ Playwright error: " + e.getMessage()); + return null; + } + } + + /** + * Parses auctions from HTML and adds Dutch auctions to the set + * @return number of Dutch auctions found on this page + */ + private int parseAuctionsFromHtml(String html, Set saleIds) { + int foundCount = 0; + + // Simple regex-based parsing for auction links + java.util.regex.Pattern linkPattern = java.util.regex.Pattern.compile( + "href=\"(/a/[^\"]+A[17]-(\\d+)[^\"]*)\""); + java.util.regex.Matcher linkMatcher = linkPattern.matcher(html); + + while (linkMatcher.find()) { + String href = linkMatcher.group(1); + int auctionId = Integer.parseInt(linkMatcher.group(2)); + + // Avoid duplicates + if (saleIds.contains(auctionId)) { + continue; + } + + // Check if this auction is Dutch (location contains ", NL") + if (isDutchAuction(html, href)) { + saleIds.add(auctionId); + foundCount++; + System.out.println(" Found Dutch auction: " + auctionId + " - " + href); + } + } + + return foundCount; + } + + /** + * Checks if an auction is located 
in the Netherlands + */ + private boolean isDutchAuction(String html, String href) { + int hrefPos = html.indexOf(href); + if (hrefPos == -1) return false; + + // Look at 500 characters before and 1000 characters after the href for location info + int startPos = Math.max(hrefPos - 500, 0); + int endPos = Math.min(hrefPos + 1000, html.length()); + String context = html.substring(startPos, endPos); + + // Look for ", NL" pattern + return context.contains(", NL"); + } + + /** + * Loads cached HTML for a page + */ + private String loadFromCache(int pageNumber) { + if (!useCache || cacheDb == null) return null; + + String url = pageNumber == 1 + ? AUCTIONS_PAGE + : AUCTIONS_PAGE + "?page=" + pageNumber; + + return cacheDb.get(url); + } + + /** + * Saves HTML to cache + */ + private void saveToCache(int pageNumber, String html) { + if (!useCache || cacheDb == null) return; + + String url = pageNumber == 1 + ? AUCTIONS_PAGE + : AUCTIONS_PAGE + "?page=" + pageNumber; + + cacheDb.put(url, html, CACHE_EXPIRATION_HOURS); } /** diff --git a/src/test/java/com/auction/TroostwijkScraperTest.java b/src/test/java/com/auction/TroostwijkScraperTest.java index 4bb4344..df9362c 100644 --- a/src/test/java/com/auction/TroostwijkScraperTest.java +++ b/src/test/java/com/auction/TroostwijkScraperTest.java @@ -55,6 +55,11 @@ public class TroostwijkScraperTest { @AfterEach public void tearDown() { + // Clean up browser and cache + if (scraper != null) { + scraper.close(); + } + // Clean up test database File dbFile = new File(testDatabasePath); if (dbFile.exists()) {