@@ -56,19 +56,28 @@ import java.sql.PreparedStatement;
import java.sql.ResultSet ;
import java.sql.SQLException ;
import java.sql.Statement ;
import java.time.Instant ;
import java.time.LocalDateTime ;
import java.util.ArrayList ;
import java.util.Arrays ;
import java.util.HashSet ;
import java.util.List ;
import java.util.Map ;
import java.util.Set ;
import java.util.concurrent.Executors ;
import java.util.concurrent.ScheduledExecutorService ;
import java.util.concurrent.TimeUnit ;
import com.auction.TroostwijkAuctionExtractor.CacheDatabase ;
import com.fasterxml.jackson.databind.JsonNode ;
import com.fasterxml.jackson.databind.ObjectMapper ;
import com.microsoft.playwright.Browser ;
import com.microsoft.playwright.BrowserType ;
import com.microsoft.playwright.Page ;
import com.microsoft.playwright.Playwright ;
import com.microsoft.playwright.options.WaitUntilState ;
import net.bytebuddy.build.Plugin.Engine.Source.Element ;
import org.jsoup.Jsoup ;
import org.jsoup.nodes.Document ;
import org.jsoup.nodes.Element ;
import org.jsoup.select.Elements ;
import org.opencv.core.Core ;
import org.opencv.core.Mat ;
import org.opencv.core.Scalar ;
@@ -76,6 +85,7 @@ import org.opencv.core.Size;
import org.opencv.dnn.Dnn ;
import org.opencv.dnn.Net ;
import org.opencv.imgcodecs.Imgcodecs ;
import org.w3c.dom.Document ;
import static org.opencv.dnn.Dnn.DNN_BACKEND_OPENCV ;
import static org.opencv.dnn.Dnn.DNN_TARGET_CPU ;
@@ -84,17 +94,24 @@ import static org.opencv.dnn.Dnn.DNN_TARGET_CPU;
* persisting data, scheduling updates, and performing object detection.
*/
public class TroostwijkScraper {
// Base URLs – adjust these if Troostwijk changes their site structure
// Overview page that auction discovery paginates through ("?page=N" is appended for N > 1).
private static final String AUCTIONS_PAGE = "https://www.troostwijkauctions.com/auctions";
// NOTE(review): not referenced in the visible part of this file – presumably the lot-listing endpoint; confirm.
private static final String LOT_API = "https://api.troostwijkauctions.com/lot/7/list";
// Path handed to CacheDatabase when page caching is enabled
private static final String CACHE_DB_PATH = "cache/page_cache.db";
// Expiration, in hours, passed along with every page written to the cache
private static final long CACHE_EXPIRATION_HOURS = 24;
// Pause, in milliseconds, after each page fetched from the website
private static final int RATE_LIMIT_MS = 200;
// HTTP client used for API calls
private final HttpClient httpClient;
private final ObjectMapper objectMapper;
public final DatabaseService db;
private final NotificationService notifier;
private final ObjectDetectionService detector;
// null when caching is disabled
private final CacheDatabase cacheDb;
private final boolean useCache;
// Created lazily by initializeBrowser() and reset to null by close()
private Playwright playwright;
private Browser browser;
/**
* Constructor. Creates supporting services and ensures the database
@@ -109,80 +126,256 @@ public class TroostwijkScraper {
*/
public TroostwijkScraper ( String databasePath , String notificationConfig , String unused ,
String yoloCfgPath , String yoloWeightsPath , String classNamesPath ) throws SQLException , IOException {
// Delegates to the seven-argument constructor with page caching switched on (useCache = true).
this ( databasePath , notificationConfig , unused , yoloCfgPath , yoloWeightsPath , classNamesPath , true ) ;
}
/**
 * Builds a scraper with explicit control over page caching, creating the
 * supporting services and making sure the database schema exists.
 *
 * @param databasePath       Path to SQLite database file
 * @param notificationConfig "desktop" for desktop only, or "smtp:user:pass:toEmail" for email
 * @param unused             Unused parameter (kept for compatibility); forwarded unchanged to NotificationService
 * @param yoloCfgPath        Path to YOLO configuration file
 * @param yoloWeightsPath    Path to YOLO weights file
 * @param classNamesPath     Path to file containing class names
 * @param useCache           Enable page caching
 * @throws SQLException declared by the constructor; raised by the services it sets up
 * @throws IOException  declared by the constructor; raised by the services it sets up
 */
public TroostwijkScraper(String databasePath, String notificationConfig, String unused,
                         String yoloCfgPath, String yoloWeightsPath, String classNamesPath,
                         boolean useCache) throws SQLException, IOException {
    this.useCache = useCache;

    // Services are created in the same order as before: their constructors may have side effects.
    this.httpClient = HttpClient.newHttpClient();
    this.objectMapper = new ObjectMapper();
    this.db = new DatabaseService(databasePath);
    this.notifier = new NotificationService(notificationConfig, unused);
    this.detector = new ObjectDetectionService(yoloCfgPath, yoloWeightsPath, classNamesPath);

    // The page cache only exists when caching was requested.
    if (useCache) {
        this.cacheDb = new TroostwijkAuctionExtractor.CacheDatabase(CACHE_DB_PATH);
    } else {
        this.cacheDb = null;
    }

    // Prepare persistent storage: main schema first, then the cache.
    this.db.ensureSchema();
    if (this.cacheDb != null) {
        this.cacheDb.initialize();
    }
}
/**
* Initializes Playwright browser for JavaScript-rendered pages.
* Call this before using discoverDutchAuctions().
*/
public void initializeBrowser ( ) {
if ( playwright = = null ) {
System . out . println ( " Initializing Playwright browser... " ) ;
this . playwright = Playwright . create ( ) ;
this . browser = playwright . chromium ( ) . launch ( new BrowserType . LaunchOptions ( )
. setHeadless ( true )
. setArgs ( Arrays . asList ( " --no-sandbox " , " --disable-setuid-sandbox " ) ) ) ;
System . out . println ( " ✓ Browser ready " ) ;
}
}
/**
* Closes browser and cache resources.
*/
public void close ( ) {
if ( browser ! = null ) {
browser . close ( ) ;
browser = null ;
}
if ( playwright ! = null ) {
playwright . close ( ) ;
playwright = null ;
}
if ( cacheDb ! = null ) {
cacheDb . close ( ) ;
}
}
/**
* Discovers all active Dutch auctions by crawling the auctions page.
*
* Troostwijk lists auctions for many countries on one page. We parse
* the page with jsoup and f ilter auctions whose location contains ", NL"
* (indicating the Netherlands). Each auction link contains a unique sale ID
* in the format A1-xxxxx or A7-xxxxx which we extract from the URL .
* Uses Playwright to render JavaScript-heavy pages and extract auction data.
* Supports caching to avoid unnecessary page fetches. F ilters auctions whose
* location contains ", NL" (indicating the Netherlands). Each auction link
* contains a unique sale ID in the format A1-xxxxx or A7-xxxxx.
*
* @return a list of sale identifiers for auctions located in NL
*/
public List < Integer > discoverDutchAuctions ( ) {
Lis t< Integer > saleIds = new ArrayLis t< > ( ) ;
try {
// Fetch the auctions overview page
Document doc = Jsoup . connect ( AUCTIONS_PAGE ) . get ( ) ;
// Select all anchor elements that link to auction pages
// The URL pattern is: /a/auction-title-A1-xxxxx or /a/auction-title-A7-xxxxx
Eleme nts auctionLinks = doc . select ( " a[href^='/a/'] " ) ;
System . out . println ( " Found " + auctionLinks . size ( ) + " potential auction links " ) ;
for ( Element link : auctionLinks ) {
// Get the href to extract the auction ID
String href = link . attr ( " href " ) ;
// Check if this link contains location text with ", NL"
String linkText = link . text ( ) ;
// Look for location in any div inside the link
Elements divs = link . select ( " div " ) ;
boolean isDutch = false ;
for ( Element div : divs ) {
String text = div . text ( ) ;
if ( text . contains ( " , NL " ) ) {
isDutch = true ;
break ;
}
Se t< Integer > saleIds = new HashSe t< > ( ) ;
// Check if browser is initialized
if ( browser = = null ) {
initializeBrowser ( ) ;
}
i nt pageNumber = 1 ;
boolean hasMorePages = true ;
System . out . println ( " Starting Dutch auction discovery from " + AUCTIONS_PAGE ) ;
while ( hasMorePages ) {
System . out . println ( " \ n[Page " + pageNumber + " ] Fetching auctions... " ) ;
// Check cache first
String html = loadFromCache ( pageNumber ) ;
if ( html ! = null ) {
System . out . println ( " ✓ Loaded from cache " ) ;
} else {
// Fetch with Playwright
html = fetchPageWithPlaywright ( pageNumber ) ;
if ( html = = null | | html . isEmpty ( ) ) {
System . out . println ( " ⚠️ Failed to fetch page, stopping pagination " ) ;
break ;
}
if ( isDutch ) {
// Extract auction ID from URL
// Format: /a/title-A1-38375 or /a/title-A7-12345
// We want the number after A1- or A7-
j ava . util . regex . Pattern pattern = java . util . regex . Pattern . compile ( " A[17]-( \\ d+) " ) ;
java . util . regex . Matcher matcher = pattern . matcher ( href ) ;
if ( matcher . find ( ) ) {
try {
int saleId = Integer . parseInt ( matcher . group ( 1 ) ) ;
if ( ! saleIds . contains ( saleId ) ) {
saleIds . add ( saleId ) ;
System . out . println ( " Found Dutch auction: " + saleId + " - " + href ) ;
}
} catch ( NumberFormatException e ) {
// Skip invalid IDs
}
}
System . out . println ( " ✓ Fetched from website " ) ;
// Save to cache
if ( useCache ) {
s aveToCache ( pageNumber , html ) ;
}
// Rate limiting
try {
Thread . sleep ( RATE_LIMIT_MS ) ;
} catch ( InterruptedException e ) {
Thread . currentThread ( ) . interrupt ( ) ;
break ;
}
}
} catch ( IOException e ) {
System . err . println ( " Failed to discover auctions: " + e . getMessage ( ) ) ;
e . printStackTrace ( ) ;
// Parse auctions from HTML
int foundOnPage = parseAuctionsFromHtml ( html , saleIds ) ;
if ( foundOnPage = = 0 ) {
System . out . println ( " ⚠️ No Dutch auctions found on page, stopping pagination " ) ;
hasMorePages = false ;
} else {
System . out . println ( " ✓ Found " + foundOnPage + " Dutch auctions " ) ;
pageNumber + + ;
}
}
return saleIds ;
System . out . println ( " \ n✓ Total Dutch auctions discovered: " + saleIds . size ( ) ) ;
return new ArrayList < > ( saleIds ) ;
}
/**
* Fetches a single page using Playwright
*/
private String fetchPageWithPlaywright ( int pageNumber ) {
String url = pageNumber = = 1
? AUCTIONS_PAGE
: AUCTIONS_PAGE + " ?page= " + pageNumber ;
try {
Page page = browser . newPage ( ) ;
// Set user agent
page . setExtraHTTPHeaders ( Map . of (
" User-Agent " , " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
) ) ;
// Navigate to page
page . navigate ( url , new Page . NavigateOptions ( )
. setTimeout ( 30000 )
. setWaitUntil ( WaitUntilState . NETWORKIDLE ) ) ;
// Wait for auction listings to appear
try {
page . waitForSelector ( " a[href^='/a/'] " , new Page . WaitForSelectorOptions ( )
. setTimeout ( 10000 ) ) ;
} catch ( Exception e ) {
// Continue even if selector not found
System . out . println ( " ⚠️ Auction selector not found " ) ;
}
// Get HTML content
String html = page . content ( ) ;
page . close ( ) ;
return html ;
} catch ( Exception e ) {
System . err . println ( " ⚠️ Playwright error: " + e . getMessage ( ) ) ;
return null ;
}
}
/**
* Parses auctions from HTML and adds Dutch auctions to the set
* @return number of Dutch auctions found on this page
*/
private int parseAuctionsFromHtml ( String html , Set < Integer > saleIds ) {
int foundCount = 0 ;
// Simple regex-based parsing for auction links
java . util . regex . Pattern linkPattern = java . util . regex . Pattern . compile (
" href= \" (/a/[^ \" ]+A[17]-( \\ d+)[^ \" ]*) \" " ) ;
java . util . regex . Matcher linkMatcher = linkPattern . matcher ( html ) ;
while ( linkMatcher . find ( ) ) {
String href = linkMatcher . group ( 1 ) ;
int auctionId = Integer . parseInt ( linkMatcher . group ( 2 ) ) ;
// Avoid duplicates
if ( saleIds . contains ( auctionId ) ) {
continue ;
}
// Check if this auction is Dutch (location contains ", NL")
if ( isDutchAuction ( html , href ) ) {
saleIds . add ( auctionId ) ;
foundCount + + ;
System . out . println ( " Found Dutch auction: " + auctionId + " - " + href ) ;
}
}
return foundCount ;
}
/**
* Checks if an auction is located in the Netherlands
*/
private boolean isDutchAuction ( String html , String href ) {
int hrefPos = html . indexOf ( href ) ;
if ( hrefPos = = - 1 ) return false ;
// Look at 1000 characters before and after the href for location info
int startPos = Math . max ( hrefPos - 500 , 0 ) ;
int endPos = Math . min ( hrefPos + 1000 , html . length ( ) ) ;
String context = html . substring ( startPos , endPos ) ;
// Look for ", NL" pattern
return context . contains ( " , NL " ) ;
}
/**
* Loads cached HTML for a page
*/
private String loadFromCache ( int pageNumber ) {
if ( ! useCache | | cacheDb = = null ) return null ;
String url = pageNumber = = 1
? AUCTIONS_PAGE
: AUCTIONS_PAGE + " ?page= " + pageNumber ;
return cacheDb . get ( url ) ;
}
/**
* Saves HTML to cache
*/
private void saveToCache ( int pageNumber , String html ) {
if ( ! useCache | | cacheDb = = null ) return ;
String url = pageNumber = = 1
? AUCTIONS_PAGE
: AUCTIONS_PAGE + " ?page= " + pageNumber ;
cacheDb . put ( url , html , CACHE_EXPIRATION_HOURS ) ;
}
/**