start
This commit is contained in:
BIN
cache/page_cache.db
vendored
BIN
cache/page_cache.db
vendored
Binary file not shown.
@@ -303,19 +303,24 @@ public class TroostwijkAuctionExtractor {
|
||||
int hrefPos = html.indexOf(href);
|
||||
if (hrefPos == -1) return "Unknown";
|
||||
|
||||
// Look at 500 characters after the href for location info
|
||||
int endPos = Math.min(hrefPos + 500, html.length());
|
||||
String context = html.substring(hrefPos, endPos);
|
||||
// Look at 1000 characters before AND after the href for location info
|
||||
int startPos = Math.max(hrefPos - 500, 0);
|
||||
int endPos = Math.min(hrefPos + 1000, html.length());
|
||||
String context = html.substring(startPos, endPos);
|
||||
|
||||
// Try to find location pattern like "City, NL" or "City, Country"
|
||||
// More flexible pattern to catch various location formats
|
||||
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
|
||||
"([A-Za-z\\s]+),\\s*([A-Z]{2})");
|
||||
"([A-Za-z][A-Za-z\\s\\-']+),\\s*([A-Z]{2})(?![A-Za-z])");
|
||||
java.util.regex.Matcher locMatcher = locPattern.matcher(context);
|
||||
|
||||
if (locMatcher.find()) {
|
||||
return locMatcher.group(1).trim() + ", " + locMatcher.group(2);
|
||||
String location = locMatcher.group(1).trim() + ", " + locMatcher.group(2);
|
||||
System.out.println(" Found location: " + location + " for auction " + href);
|
||||
return location;
|
||||
}
|
||||
|
||||
System.out.println(" ⚠️ No location found for auction " + href);
|
||||
return "Unknown";
|
||||
}
|
||||
|
||||
|
||||
@@ -56,19 +56,28 @@ import java.sql.PreparedStatement;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
import java.time.Instant;
|
||||
import java.time.LocalDateTime;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import com.auction.TroostwijkAuctionExtractor.CacheDatabase;
|
||||
import com.fasterxml.jackson.databind.JsonNode;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.microsoft.playwright.Browser;
|
||||
import com.microsoft.playwright.BrowserType;
|
||||
import com.microsoft.playwright.Page;
|
||||
import com.microsoft.playwright.Playwright;
|
||||
import com.microsoft.playwright.options.WaitUntilState;
|
||||
import net.bytebuddy.build.Plugin.Engine.Source.Element;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import org.opencv.core.Core;
|
||||
import org.opencv.core.Mat;
|
||||
import org.opencv.core.Scalar;
|
||||
@@ -76,6 +85,7 @@ import org.opencv.core.Size;
|
||||
import org.opencv.dnn.Dnn;
|
||||
import org.opencv.dnn.Net;
|
||||
import org.opencv.imgcodecs.Imgcodecs;
|
||||
import org.w3c.dom.Document;
|
||||
import static org.opencv.dnn.Dnn.DNN_BACKEND_OPENCV;
|
||||
import static org.opencv.dnn.Dnn.DNN_TARGET_CPU;
|
||||
|
||||
@@ -84,17 +94,24 @@ import static org.opencv.dnn.Dnn.DNN_TARGET_CPU;
|
||||
* persisting data, scheduling updates, and performing object detection.
|
||||
*/
|
||||
public class TroostwijkScraper {
|
||||
|
||||
|
||||
// Base URLs – adjust these if Troostwijk changes their site structure
|
||||
private static final String AUCTIONS_PAGE = "https://www.troostwijkauctions.com/nl/auctions";
|
||||
private static final String AUCTIONS_PAGE = "https://www.troostwijkauctions.com/auctions";
|
||||
private static final String LOT_API = "https://api.troostwijkauctions.com/lot/7/list";
|
||||
|
||||
private static final String CACHE_DB_PATH = "cache/page_cache.db";
|
||||
private static final long CACHE_EXPIRATION_HOURS = 24;
|
||||
private static final int RATE_LIMIT_MS = 200;
|
||||
|
||||
// HTTP client used for API calls
|
||||
private final HttpClient httpClient;
|
||||
private final ObjectMapper objectMapper;
|
||||
public final DatabaseService db;
|
||||
private final NotificationService notifier;
|
||||
private final ObjectDetectionService detector;
|
||||
private final CacheDatabase cacheDb;
|
||||
private final boolean useCache;
|
||||
private Playwright playwright;
|
||||
private Browser browser;
|
||||
|
||||
/**
|
||||
* Constructor. Creates supporting services and ensures the database
|
||||
@@ -109,80 +126,256 @@ public class TroostwijkScraper {
|
||||
*/
|
||||
// Convenience overload: forwards every argument unchanged to the seven-argument
// constructor and passes true for useCache, so page caching is enabled by default.
public TroostwijkScraper(String databasePath, String notificationConfig, String unused,
|
||||
String yoloCfgPath, String yoloWeightsPath, String classNamesPath) throws SQLException, IOException {
|
||||
this(databasePath, notificationConfig, unused, yoloCfgPath, yoloWeightsPath, classNamesPath, true);
|
||||
}
|
||||
|
||||
/**
 * Creates a scraper, wires up its supporting services and prepares the
 * database schema, optionally backed by an on-disk page cache.
 *
 * @param databasePath       Path to SQLite database file
 * @param notificationConfig "desktop" for desktop only, or "smtp:user:pass:toEmail" for email
 * @param unused             Kept for compatibility; forwarded as the second argument
 *                           of the NotificationService constructor
 * @param yoloCfgPath        Path to YOLO configuration file
 * @param yoloWeightsPath    Path to YOLO weights file
 * @param classNamesPath     Path to file containing class names
 * @param useCache           When true, a page cache stored at CACHE_DB_PATH is
 *                           created and initialized; when false no cache is used
 * @throws SQLException declared for failures while setting up the services or schema
 * @throws IOException  declared for failures while setting up the services
 */
public TroostwijkScraper(String databasePath, String notificationConfig, String unused,
        String yoloCfgPath, String yoloWeightsPath, String classNamesPath,
        boolean useCache) throws SQLException, IOException {
    this.httpClient = HttpClient.newHttpClient();
    this.objectMapper = new ObjectMapper();
    this.db = new DatabaseService(databasePath);
    this.notifier = new NotificationService(notificationConfig, unused);
    this.detector = new ObjectDetectionService(yoloCfgPath, yoloWeightsPath, classNamesPath);
    this.useCache = useCache;

    // The cache database only exists when caching was requested.
    if (useCache) {
        this.cacheDb = new TroostwijkAuctionExtractor.CacheDatabase(CACHE_DB_PATH);
    } else {
        this.cacheDb = null;
    }

    // Make sure the persistent stores are ready before the scraper is used.
    db.ensureSchema();
    if (cacheDb != null) {
        cacheDb.initialize();
    }
}
|
||||
|
||||
/**
|
||||
* Initializes Playwright browser for JavaScript-rendered pages.
|
||||
* Call this before using discoverDutchAuctions().
|
||||
*/
|
||||
public void initializeBrowser() {
|
||||
if (playwright == null) {
|
||||
System.out.println("Initializing Playwright browser...");
|
||||
this.playwright = Playwright.create();
|
||||
this.browser = playwright.chromium().launch(new BrowserType.LaunchOptions()
|
||||
.setHeadless(true)
|
||||
.setArgs(Arrays.asList("--no-sandbox", "--disable-setuid-sandbox")));
|
||||
System.out.println("✓ Browser ready");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes browser and cache resources.
|
||||
*/
|
||||
public void close() {
|
||||
if (browser != null) {
|
||||
browser.close();
|
||||
browser = null;
|
||||
}
|
||||
if (playwright != null) {
|
||||
playwright.close();
|
||||
playwright = null;
|
||||
}
|
||||
if (cacheDb != null) {
|
||||
cacheDb.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Discovers all active Dutch auctions by crawling the auctions page.
|
||||
*
|
||||
* Troostwijk lists auctions for many countries on one page. We parse
|
||||
* the page with jsoup and filter auctions whose location contains ", NL"
|
||||
* (indicating the Netherlands). Each auction link contains a unique sale ID
|
||||
* in the format A1-xxxxx or A7-xxxxx which we extract from the URL.
|
||||
* Uses Playwright to render JavaScript-heavy pages and extract auction data.
|
||||
* Supports caching to avoid unnecessary page fetches. Filters auctions whose
|
||||
* location contains ", NL" (indicating the Netherlands). Each auction link
|
||||
* contains a unique sale ID in the format A1-xxxxx or A7-xxxxx.
|
||||
*
|
||||
* @return a list of sale identifiers for auctions located in NL
|
||||
*/
|
||||
public List<Integer> discoverDutchAuctions() {
|
||||
List<Integer> saleIds = new ArrayList<>();
|
||||
try {
|
||||
// Fetch the auctions overview page
|
||||
Document doc = Jsoup.connect(AUCTIONS_PAGE).get();
|
||||
|
||||
// Select all anchor elements that link to auction pages
|
||||
// The URL pattern is: /a/auction-title-A1-xxxxx or /a/auction-title-A7-xxxxx
|
||||
Elements auctionLinks = doc.select("a[href^='/a/']");
|
||||
|
||||
System.out.println("Found " + auctionLinks.size() + " potential auction links");
|
||||
|
||||
for (Element link : auctionLinks) {
|
||||
// Get the href to extract the auction ID
|
||||
String href = link.attr("href");
|
||||
|
||||
// Check if this link contains location text with ", NL"
|
||||
String linkText = link.text();
|
||||
|
||||
// Look for location in any div inside the link
|
||||
Elements divs = link.select("div");
|
||||
boolean isDutch = false;
|
||||
for (Element div : divs) {
|
||||
String text = div.text();
|
||||
if (text.contains(", NL")) {
|
||||
isDutch = true;
|
||||
break;
|
||||
}
|
||||
Set<Integer> saleIds = new HashSet<>();
|
||||
|
||||
// Check if browser is initialized
|
||||
if (browser == null) {
|
||||
initializeBrowser();
|
||||
}
|
||||
|
||||
int pageNumber = 1;
|
||||
boolean hasMorePages = true;
|
||||
|
||||
System.out.println("Starting Dutch auction discovery from " + AUCTIONS_PAGE);
|
||||
|
||||
while (hasMorePages) {
|
||||
System.out.println("\n[Page " + pageNumber + "] Fetching auctions...");
|
||||
|
||||
// Check cache first
|
||||
String html = loadFromCache(pageNumber);
|
||||
|
||||
if (html != null) {
|
||||
System.out.println(" ✓ Loaded from cache");
|
||||
} else {
|
||||
// Fetch with Playwright
|
||||
html = fetchPageWithPlaywright(pageNumber);
|
||||
|
||||
if (html == null || html.isEmpty()) {
|
||||
System.out.println(" ⚠️ Failed to fetch page, stopping pagination");
|
||||
break;
|
||||
}
|
||||
|
||||
if (isDutch) {
|
||||
// Extract auction ID from URL
|
||||
// Format: /a/title-A1-38375 or /a/title-A7-12345
|
||||
// We want the number after A1- or A7-
|
||||
java.util.regex.Pattern pattern = java.util.regex.Pattern.compile("A[17]-(\\d+)");
|
||||
java.util.regex.Matcher matcher = pattern.matcher(href);
|
||||
|
||||
if (matcher.find()) {
|
||||
try {
|
||||
int saleId = Integer.parseInt(matcher.group(1));
|
||||
if (!saleIds.contains(saleId)) {
|
||||
saleIds.add(saleId);
|
||||
System.out.println(" Found Dutch auction: " + saleId + " - " + href);
|
||||
}
|
||||
} catch (NumberFormatException e) {
|
||||
// Skip invalid IDs
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println(" ✓ Fetched from website");
|
||||
|
||||
// Save to cache
|
||||
if (useCache) {
|
||||
saveToCache(pageNumber, html);
|
||||
}
|
||||
|
||||
// Rate limiting
|
||||
try {
|
||||
Thread.sleep(RATE_LIMIT_MS);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
System.err.println("Failed to discover auctions: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
|
||||
// Parse auctions from HTML
|
||||
int foundOnPage = parseAuctionsFromHtml(html, saleIds);
|
||||
|
||||
if (foundOnPage == 0) {
|
||||
System.out.println(" ⚠️ No Dutch auctions found on page, stopping pagination");
|
||||
hasMorePages = false;
|
||||
} else {
|
||||
System.out.println(" ✓ Found " + foundOnPage + " Dutch auctions");
|
||||
pageNumber++;
|
||||
}
|
||||
}
|
||||
return saleIds;
|
||||
|
||||
System.out.println("\n✓ Total Dutch auctions discovered: " + saleIds.size());
|
||||
return new ArrayList<>(saleIds);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches a single page using Playwright
|
||||
*/
|
||||
private String fetchPageWithPlaywright(int pageNumber) {
|
||||
String url = pageNumber == 1
|
||||
? AUCTIONS_PAGE
|
||||
: AUCTIONS_PAGE + "?page=" + pageNumber;
|
||||
|
||||
try {
|
||||
Page page = browser.newPage();
|
||||
|
||||
// Set user agent
|
||||
page.setExtraHTTPHeaders(Map.of(
|
||||
"User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||
));
|
||||
|
||||
// Navigate to page
|
||||
page.navigate(url, new Page.NavigateOptions()
|
||||
.setTimeout(30000)
|
||||
.setWaitUntil(WaitUntilState.NETWORKIDLE));
|
||||
|
||||
// Wait for auction listings to appear
|
||||
try {
|
||||
page.waitForSelector("a[href^='/a/']", new Page.WaitForSelectorOptions()
|
||||
.setTimeout(10000));
|
||||
} catch (Exception e) {
|
||||
// Continue even if selector not found
|
||||
System.out.println(" ⚠️ Auction selector not found");
|
||||
}
|
||||
|
||||
// Get HTML content
|
||||
String html = page.content();
|
||||
page.close();
|
||||
|
||||
return html;
|
||||
|
||||
} catch (Exception e) {
|
||||
System.err.println(" ⚠️ Playwright error: " + e.getMessage());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses auctions from HTML and adds Dutch auctions to the set
|
||||
* @return number of Dutch auctions found on this page
|
||||
*/
|
||||
private int parseAuctionsFromHtml(String html, Set<Integer> saleIds) {
|
||||
int foundCount = 0;
|
||||
|
||||
// Simple regex-based parsing for auction links
|
||||
java.util.regex.Pattern linkPattern = java.util.regex.Pattern.compile(
|
||||
"href=\"(/a/[^\"]+A[17]-(\\d+)[^\"]*)\"");
|
||||
java.util.regex.Matcher linkMatcher = linkPattern.matcher(html);
|
||||
|
||||
while (linkMatcher.find()) {
|
||||
String href = linkMatcher.group(1);
|
||||
int auctionId = Integer.parseInt(linkMatcher.group(2));
|
||||
|
||||
// Avoid duplicates
|
||||
if (saleIds.contains(auctionId)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if this auction is Dutch (location contains ", NL")
|
||||
if (isDutchAuction(html, href)) {
|
||||
saleIds.add(auctionId);
|
||||
foundCount++;
|
||||
System.out.println(" Found Dutch auction: " + auctionId + " - " + href);
|
||||
}
|
||||
}
|
||||
|
||||
return foundCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if an auction is located in the Netherlands
|
||||
*/
|
||||
private boolean isDutchAuction(String html, String href) {
|
||||
int hrefPos = html.indexOf(href);
|
||||
if (hrefPos == -1) return false;
|
||||
|
||||
// Look at 1000 characters before and after the href for location info
|
||||
int startPos = Math.max(hrefPos - 500, 0);
|
||||
int endPos = Math.min(hrefPos + 1000, html.length());
|
||||
String context = html.substring(startPos, endPos);
|
||||
|
||||
// Look for ", NL" pattern
|
||||
return context.contains(", NL");
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads cached HTML for a page
|
||||
*/
|
||||
private String loadFromCache(int pageNumber) {
|
||||
if (!useCache || cacheDb == null) return null;
|
||||
|
||||
String url = pageNumber == 1
|
||||
? AUCTIONS_PAGE
|
||||
: AUCTIONS_PAGE + "?page=" + pageNumber;
|
||||
|
||||
return cacheDb.get(url);
|
||||
}
|
||||
|
||||
/**
|
||||
* Saves HTML to cache
|
||||
*/
|
||||
private void saveToCache(int pageNumber, String html) {
|
||||
if (!useCache || cacheDb == null) return;
|
||||
|
||||
String url = pageNumber == 1
|
||||
? AUCTIONS_PAGE
|
||||
: AUCTIONS_PAGE + "?page=" + pageNumber;
|
||||
|
||||
cacheDb.put(url, html, CACHE_EXPIRATION_HOURS);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -55,6 +55,11 @@ public class TroostwijkScraperTest {
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() {
|
||||
// Clean up browser and cache
|
||||
if (scraper != null) {
|
||||
scraper.close();
|
||||
}
|
||||
|
||||
// Clean up test database
|
||||
File dbFile = new File(testDatabasePath);
|
||||
if (dbFile.exists()) {
|
||||
|
||||
Reference in New Issue
Block a user