This commit is contained in:
2025-11-28 06:23:30 +01:00
parent 0f5800441a
commit b174f77f6c
6 changed files with 512 additions and 37 deletions

View File

@@ -2,5 +2,6 @@
<project version="4">
<component name="DataSourcePerFileMappings">
<file url="file://$APPLICATION_CONFIG_DIR$/consoles/db/9ac1bd58-f7d6-4a86-a165-f0a24a2b7cb9/console.sql" value="9ac1bd58-f7d6-4a86-a165-f0a24a2b7cb9" />
<file url="file://$APPLICATION_CONFIG_DIR$/consoles/db/9cb4e997-fbca-4426-9093-d308871c5d5e/console.sql" value="9cb4e997-fbca-4426-9093-d308871c5d5e" />
</component>
</project>

BIN
cache/page_cache.db vendored

Binary file not shown.

View File

@@ -308,8 +308,7 @@ public class TroostwijkAuctionExtractor {
int endPos = Math.min(hrefPos + 1000, html.length());
String context = html.substring(startPos, endPos);
// Try to find location pattern like "City, NL" or "City, Country"
// More flexible pattern to catch various location formats
// Pattern 1: Classic format "City, NL"
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
"([A-Za-z][A-Za-z\\s\\-']+),\\s*([A-Z]{2})(?![A-Za-z])");
java.util.regex.Matcher locMatcher = locPattern.matcher(context);
@@ -320,6 +319,31 @@ public class TroostwijkAuctionExtractor {
return location;
}
// Pattern 2: HTML format like "<span>City,<!-- --> </span>NL"
// Extract city and country code separately
java.util.regex.Pattern htmlPattern = java.util.regex.Pattern.compile(
"<span[^>]*>([A-Za-z][A-Za-z\\s\\-',]+?)(?:,)?\\s*(?:<!--.*?-->)?\\s*</span>\\s*([A-Z]{2})(?![A-Za-z])");
java.util.regex.Matcher htmlMatcher = htmlPattern.matcher(context);
if (htmlMatcher.find()) {
String city = htmlMatcher.group(1).trim().replaceAll(",$", ""); // Remove trailing comma
String country = htmlMatcher.group(2);
String location = city + ", " + country;
System.out.println(" Found location (HTML): " + location + " for auction " + href);
return location;
}
// Pattern 3: Fallback - just find country code after HTML tags
java.util.regex.Pattern countryPattern = java.util.regex.Pattern.compile(
"(?:-->|</span>|</div>)\\s*([A-Z]{2})(?![A-Za-z])");
java.util.regex.Matcher countryMatcher = countryPattern.matcher(context);
if (countryMatcher.find()) {
String country = countryMatcher.group(1);
System.out.println(" Found country code: " + country + " for auction " + href);
return "Unknown, " + country;
}
System.out.println(" ⚠️ No location found for auction " + href);
return "Unknown";
}

View File

@@ -198,7 +198,9 @@ public class TroostwijkScraper {
* location contains ", NL" (indicating the Netherlands). Each auction link
* contains a unique sale ID in the format A1-xxxxx or A7-xxxxx.
*
* @return a list of sale identifiers for auctions located in NL
* Auctions are saved to the database and can be retrieved with getDutchAuctions().
*
* @return a list of sale identifiers for auctions located in NL (legacy compatibility)
*/
public List<Integer> discoverDutchAuctions() {
Set<Integer> saleIds = new HashSet<>();
@@ -246,7 +248,7 @@ public class TroostwijkScraper {
}
}
// Parse auctions from HTML
// Parse auctions from HTML (saves Dutch auctions to database)
int foundOnPage = parseAuctionsFromHtml(html, saleIds);
if (foundOnPage == 0) {
@@ -262,6 +264,21 @@ public class TroostwijkScraper {
return new ArrayList<>(saleIds);
}
/**
 * Loads the auctions stored in the database under country code "NL".
 * The database is only populated by discoverDutchAuctions(), so call that
 * first if fresh results are needed.
 *
 * A database failure is not propagated: the error message is written to
 * stderr and an empty list is returned instead.
 *
 * @return the stored Dutch auctions with full metadata, or an empty list
 *         when the lookup fails
 */
public List<AuctionInfo> getDutchAuctions() {
    List<AuctionInfo> dutchAuctions;
    try {
        dutchAuctions = db.getAuctionsByCountry("NL");
    } catch (SQLException e) {
        System.err.println("Failed to retrieve Dutch auctions from database: " + e.getMessage());
        dutchAuctions = new ArrayList<>();
    }
    return dutchAuctions;
}
/**
* Fetches a single page using Playwright
*/
@@ -305,51 +322,152 @@ public class TroostwijkScraper {
}
/**
* Parses auctions from HTML and adds Dutch auctions to the set
* Parses auctions from HTML using JSoup and saves Dutch auctions to database.
* Uses proper HTML parsing instead of regex for more reliable extraction.
* @return number of Dutch auctions found on this page
*/
private int parseAuctionsFromHtml(String html, Set<Integer> saleIds) {
int foundCount = 0;
// Simple regex-based parsing for auction links
java.util.regex.Pattern linkPattern = java.util.regex.Pattern.compile(
"href=\"(/a/[^\"]+A[17]-(\\d+)[^\"]*)\"");
java.util.regex.Matcher linkMatcher = linkPattern.matcher(html);
try {
org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(html);
while (linkMatcher.find()) {
String href = linkMatcher.group(1);
int auctionId = Integer.parseInt(linkMatcher.group(2));
// Find all auction links (format: /a/title-A1-12345 or /a/title-A7-12345)
org.jsoup.select.Elements auctionLinks = doc.select("a[href^='/a/']");
// Avoid duplicates
if (saleIds.contains(auctionId)) {
continue;
}
// Check if this auction is Dutch (location contains ", NL")
if (isDutchAuction(html, href)) {
saleIds.add(auctionId);
foundCount++;
System.out.println(" Found Dutch auction: " + auctionId + " - " + href);
for (org.jsoup.nodes.Element link : auctionLinks) {
String href = link.attr("href");
// Extract auction ID from URL
java.util.regex.Pattern pattern = java.util.regex.Pattern.compile("/a/.*?-A([17])-(\\d+)");
java.util.regex.Matcher matcher = pattern.matcher(href);
if (!matcher.find()) {
continue;
}
String typeNum = matcher.group(1);
int auctionId = Integer.parseInt(matcher.group(2));
// Skip duplicates
if (saleIds.contains(auctionId)) {
continue;
}
// Extract auction info using JSoup
AuctionInfo auction = extractAuctionInfo(link, href, auctionId, "A" + typeNum);
// Only keep Dutch auctions
if (auction != null && "NL".equals(auction.country)) {
saleIds.add(auctionId);
foundCount++;
// Save to database
try {
db.upsertAuction(auction);
System.out.println(" Found Dutch auction: " + auctionId + " - " + auction.title + " (" + auction.location + ")");
} catch (SQLException e) {
System.err.println(" Failed to save auction: " + e.getMessage());
}
}
}
} catch (Exception e) {
System.err.println(" Error parsing HTML: " + e.getMessage());
}
return foundCount;
}
/**
* Checks if an auction is located in the Netherlands
* Extracts auction information from a link element using JSoup
* This method intelligently parses the HTML structure to extract:
* - Title
* - Location (city and country)
* - Lot count (if available)
*/
private boolean isDutchAuction(String html, String href) {
int hrefPos = html.indexOf(href);
if (hrefPos == -1) return false;
private AuctionInfo extractAuctionInfo(org.jsoup.nodes.Element link, String href, int auctionId, String type) {
AuctionInfo auction = new AuctionInfo();
auction.auctionId = auctionId;
auction.type = type;
auction.url = "https://www.troostwijkauctions.com" + href;
// Look at 1000 characters before and after the href for location info
int startPos = Math.max(hrefPos - 500, 0);
int endPos = Math.min(hrefPos + 1000, html.length());
String context = html.substring(startPos, endPos);
// Extract title from href (convert kebab-case to title)
java.util.regex.Pattern titlePattern = java.util.regex.Pattern.compile("/a/(.+?)-A[17]-");
java.util.regex.Matcher titleMatcher = titlePattern.matcher(href);
if (titleMatcher.find()) {
String slug = titleMatcher.group(1);
auction.title = slug.replace("-", " ");
// Capitalize first letter
if (!auction.title.isEmpty()) {
auction.title = auction.title.substring(0, 1).toUpperCase() + auction.title.substring(1);
}
} else {
auction.title = "Unknown Auction";
}
// Look for ", NL" pattern
return context.contains(", NL");
// Try to find title in link text (more accurate)
String linkText = link.text();
if (!linkText.isEmpty() && !linkText.matches(".*\\d+.*")) {
// If link text doesn't contain numbers, it's likely the title
String[] parts = linkText.split(",|\\d+");
if (parts.length > 0 && parts[0].trim().length() > 5) {
auction.title = parts[0].trim();
}
}
// Extract location using JSoup selectors
// Look for <p> tags that contain location info
org.jsoup.select.Elements locationElements = link.select("p");
for (org.jsoup.nodes.Element p : locationElements) {
String text = p.text();
// Pattern: "City, Country" or "City, Region, Country"
if (text.matches(".*[A-Z]{2}$")) {
// Ends with 2-letter country code
String countryCode = text.substring(text.length() - 2);
String cityPart = text.substring(0, text.length() - 2).trim();
// Remove trailing comma or whitespace
cityPart = cityPart.replaceAll("[,\\s]+$", "");
auction.country = countryCode;
auction.city = cityPart;
auction.location = cityPart + ", " + countryCode;
break;
}
}
// Fallback: check HTML content directly
if (auction.country == null) {
String html = link.html();
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
"([A-Za-z][A-Za-z\\s,\\-']+?)\\s*(?:<!--.*?-->)?\\s*</span>\\s*([A-Z]{2})(?![A-Za-z])");
java.util.regex.Matcher locMatcher = locPattern.matcher(html);
if (locMatcher.find()) {
String city = locMatcher.group(1).trim().replaceAll(",$", "");
String country = locMatcher.group(2);
auction.city = city;
auction.country = country;
auction.location = city + ", " + country;
}
}
// Extract lot count if available (kavels/lots)
org.jsoup.select.Elements textElements = link.select("*");
for (org.jsoup.nodes.Element elem : textElements) {
String text = elem.ownText();
if (text.matches("\\d+\\s+(?:kavel|lot|item)s?.*")) {
java.util.regex.Pattern countPattern = java.util.regex.Pattern.compile("(\\d+)");
java.util.regex.Matcher countMatcher = countPattern.matcher(text);
if (countMatcher.find()) {
auction.lotCount = Integer.parseInt(countMatcher.group(1));
break;
}
}
}
return auction;
}
/**
@@ -664,6 +782,27 @@ public class TroostwijkScraper {
// Domain classes and services
// ----------------------------------------------------------------------
/**
 * Represents auction metadata (veiling informatie).
 *
 * Plain mutable data holder: every field is public and is written directly
 * by the code that builds or loads an auction. Reference-typed fields have
 * no initializer and are therefore null until assigned; lotCount starts at 0.
 */
public static class AuctionInfo {
public int auctionId; // Unique auction ID (from URL)
public String title; // Auction title
public String location; // Location (e.g., "Amsterdam, NL")
public String city; // City name
public String country; // Country code (e.g., "NL")
public String url; // Full auction URL
public String type; // Auction type (A1 or A7)
public int lotCount; // Number of lots/kavels
public LocalDateTime closingTime; // Closing time if available (null when unknown)
/**
 * Returns a one-line summary for logging. Only id, type, title, location,
 * lot count and url are included; city, country and closingTime are omitted.
 */
@Override
public String toString() {
return String.format("Auction{id=%d, type=%s, title='%s', location='%s', lots=%d, url='%s'}",
auctionId, type, title, location, lotCount, url);
}
}
/**
* Simple POJO representing a lot (kavel) in an auction. It keeps track
* of the sale it belongs to, current bid and closing time. The method
@@ -704,18 +843,33 @@ public class TroostwijkScraper {
}
/**
* Creates tables if they do not already exist. The schema includes
* tables for sales, lots, images, and object labels. This method is
* tables for auctions, lots, images, and object labels. This method is
* idempotent; it can be called multiple times.
*/
void ensureSchema() throws SQLException {
try (Connection conn = DriverManager.getConnection(url); Statement stmt = conn.createStatement()) {
// Sales table
// Auctions table (veilingen)
stmt.execute("CREATE TABLE IF NOT EXISTS auctions ("
+ "auction_id INTEGER PRIMARY KEY,"
+ "title TEXT NOT NULL,"
+ "location TEXT,"
+ "city TEXT,"
+ "country TEXT,"
+ "url TEXT NOT NULL,"
+ "type TEXT,"
+ "lot_count INTEGER DEFAULT 0,"
+ "closing_time TEXT,"
+ "discovered_at INTEGER" // Unix timestamp
+ ")");
// Sales table (legacy - keep for compatibility)
stmt.execute("CREATE TABLE IF NOT EXISTS sales ("
+ "sale_id INTEGER PRIMARY KEY,"
+ "title TEXT,"
+ "location TEXT,"
+ "closing_time TEXT"
+ ")");
// Lots table
stmt.execute("CREATE TABLE IF NOT EXISTS lots ("
+ "lot_id INTEGER PRIMARY KEY,"
@@ -731,8 +885,9 @@ public class TroostwijkScraper {
+ "url TEXT,"
+ "closing_time TEXT,"
+ "closing_notified INTEGER DEFAULT 0,"
+ "FOREIGN KEY (sale_id) REFERENCES sales(sale_id)"
+ "FOREIGN KEY (sale_id) REFERENCES auctions(auction_id)"
+ ")");
// Images table
stmt.execute("CREATE TABLE IF NOT EXISTS images ("
+ "id INTEGER PRIMARY KEY AUTOINCREMENT,"
@@ -742,9 +897,99 @@ public class TroostwijkScraper {
+ "labels TEXT,"
+ "FOREIGN KEY (lot_id) REFERENCES lots(lot_id)"
+ ")");
// Create indexes for better query performance
stmt.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country)");
stmt.execute("CREATE INDEX IF NOT EXISTS idx_lots_sale_id ON lots(sale_id)");
}
}
/**
 * Inserts an auction row, or updates the existing row with the same
 * auction_id.
 *
 * On conflict every column except discovered_at is overwritten with the new
 * values; discovered_at keeps the timestamp of the first insert because it
 * is not listed in the DO UPDATE clause. A new connection is opened for the
 * call and closed when it completes.
 *
 * @param auction the auction to persist; closingTime may be null, in which
 *                case NULL is stored for closing_time
 * @throws SQLException if the connection cannot be opened or the statement fails
 */
synchronized void upsertAuction(AuctionInfo auction) throws SQLException {
    final String upsertSql =
            "INSERT INTO auctions (auction_id, title, location, city, country, url, type, lot_count, closing_time, discovered_at)"
            + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
            + " ON CONFLICT(auction_id) DO UPDATE SET "
            + "title = excluded.title, location = excluded.location, city = excluded.city, "
            + "country = excluded.country, url = excluded.url, type = excluded.type, "
            + "lot_count = excluded.lot_count, closing_time = excluded.closing_time";

    try (Connection connection = DriverManager.getConnection(url);
         PreparedStatement statement = connection.prepareStatement(upsertSql)) {
        // Running index keeps the bindings aligned with the column list above.
        int slot = 0;
        statement.setInt(++slot, auction.auctionId);
        statement.setString(++slot, auction.title);
        statement.setString(++slot, auction.location);
        statement.setString(++slot, auction.city);
        statement.setString(++slot, auction.country);
        statement.setString(++slot, auction.url);
        statement.setString(++slot, auction.type);
        statement.setInt(++slot, auction.lotCount);

        String closingText;
        if (auction.closingTime != null) {
            closingText = auction.closingTime.toString();
        } else {
            closingText = null;
        }
        statement.setString(++slot, closingText);

        // discovered_at: Unix timestamp in seconds, only stored on first insert.
        statement.setLong(++slot, Instant.now().getEpochSecond());
        statement.executeUpdate();
    }
}
/**
 * Retrieves all auctions from the database.
 *
 * @return one AuctionInfo per row of the auctions table; empty (never null)
 *         when the table has no rows
 * @throws SQLException if the connection cannot be opened or the query fails
 */
synchronized List<AuctionInfo> getAllAuctions() throws SQLException {
    List<AuctionInfo> auctions = new ArrayList<>();
    String sql = "SELECT auction_id, title, location, city, country, url, type, lot_count, closing_time FROM auctions";
    // The ResultSet is part of the resource list so it is closed explicitly
    // rather than relying on the Statement's close to release it.
    try (Connection conn = DriverManager.getConnection(url);
         Statement stmt = conn.createStatement();
         ResultSet rs = stmt.executeQuery(sql)) {
        while (rs.next()) {
            auctions.add(readAuctionRow(rs));
        }
    }
    return auctions;
}

/**
 * Retrieves auctions by country code.
 *
 * @param countryCode value compared for equality against the country column
 *                    (e.g. "NL")
 * @return the matching auctions; empty (never null) when nothing matches
 * @throws SQLException if the connection cannot be opened or the query fails
 */
synchronized List<AuctionInfo> getAuctionsByCountry(String countryCode) throws SQLException {
    List<AuctionInfo> auctions = new ArrayList<>();
    String sql = "SELECT auction_id, title, location, city, country, url, type, lot_count, closing_time "
            + "FROM auctions WHERE country = ?";
    try (Connection conn = DriverManager.getConnection(url);
         PreparedStatement ps = conn.prepareStatement(sql)) {
        ps.setString(1, countryCode);
        try (ResultSet rs = ps.executeQuery()) {
            while (rs.next()) {
                auctions.add(readAuctionRow(rs));
            }
        }
    }
    return auctions;
}

/**
 * Maps the current row of an auctions query to an AuctionInfo. The result
 * set must expose the columns auction_id, title, location, city, country,
 * url, type, lot_count and closing_time.
 *
 * @param rs result set positioned on the row to read
 * @return a new AuctionInfo; closingTime stays null when closing_time is NULL
 * @throws SQLException if a column cannot be read
 */
private AuctionInfo readAuctionRow(ResultSet rs) throws SQLException {
    AuctionInfo auction = new AuctionInfo();
    auction.auctionId = rs.getInt("auction_id");
    auction.title = rs.getString("title");
    auction.location = rs.getString("location");
    auction.city = rs.getString("city");
    auction.country = rs.getString("country");
    auction.url = rs.getString("url");
    auction.type = rs.getString("type");
    auction.lotCount = rs.getInt("lot_count");
    // closing_time is stored as the text form of LocalDateTime (see upsertAuction).
    String closing = rs.getString("closing_time");
    if (closing != null) {
        auction.closingTime = LocalDateTime.parse(closing);
    }
    return auction;
}
/**
* Inserts or updates a lot record. Uses INSERT OR REPLACE to
* implement upsert semantics so that existing rows are replaced.

File diff suppressed because one or more lines are too long