This commit is contained in:
2025-11-28 06:23:30 +01:00
parent 0f5800441a
commit b174f77f6c
6 changed files with 512 additions and 37 deletions

View File

@@ -2,5 +2,6 @@
<project version="4"> <project version="4">
<component name="DataSourcePerFileMappings"> <component name="DataSourcePerFileMappings">
<file url="file://$APPLICATION_CONFIG_DIR$/consoles/db/9ac1bd58-f7d6-4a86-a165-f0a24a2b7cb9/console.sql" value="9ac1bd58-f7d6-4a86-a165-f0a24a2b7cb9" /> <file url="file://$APPLICATION_CONFIG_DIR$/consoles/db/9ac1bd58-f7d6-4a86-a165-f0a24a2b7cb9/console.sql" value="9ac1bd58-f7d6-4a86-a165-f0a24a2b7cb9" />
<file url="file://$APPLICATION_CONFIG_DIR$/consoles/db/9cb4e997-fbca-4426-9093-d308871c5d5e/console.sql" value="9cb4e997-fbca-4426-9093-d308871c5d5e" />
</component> </component>
</project> </project>

BIN
cache/page_cache.db vendored

Binary file not shown.

View File

@@ -308,8 +308,7 @@ public class TroostwijkAuctionExtractor {
int endPos = Math.min(hrefPos + 1000, html.length()); int endPos = Math.min(hrefPos + 1000, html.length());
String context = html.substring(startPos, endPos); String context = html.substring(startPos, endPos);
// Try to find location pattern like "City, NL" or "City, Country" // Pattern 1: Classic format "City, NL"
// More flexible pattern to catch various location formats
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile( java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
"([A-Za-z][A-Za-z\\s\\-']+),\\s*([A-Z]{2})(?![A-Za-z])"); "([A-Za-z][A-Za-z\\s\\-']+),\\s*([A-Z]{2})(?![A-Za-z])");
java.util.regex.Matcher locMatcher = locPattern.matcher(context); java.util.regex.Matcher locMatcher = locPattern.matcher(context);
@@ -320,6 +319,31 @@ public class TroostwijkAuctionExtractor {
return location; return location;
} }
// Pattern 2: HTML format like "<span>City,<!-- --> </span>NL"
// Extract city and country code separately
java.util.regex.Pattern htmlPattern = java.util.regex.Pattern.compile(
"<span[^>]*>([A-Za-z][A-Za-z\\s\\-',]+?)(?:,)?\\s*(?:<!--.*?-->)?\\s*</span>\\s*([A-Z]{2})(?![A-Za-z])");
java.util.regex.Matcher htmlMatcher = htmlPattern.matcher(context);
if (htmlMatcher.find()) {
String city = htmlMatcher.group(1).trim().replaceAll(",$", ""); // Remove trailing comma
String country = htmlMatcher.group(2);
String location = city + ", " + country;
System.out.println(" Found location (HTML): " + location + " for auction " + href);
return location;
}
// Pattern 3: Fallback - just find country code after HTML tags
java.util.regex.Pattern countryPattern = java.util.regex.Pattern.compile(
"(?:-->|</span>|</div>)\\s*([A-Z]{2})(?![A-Za-z])");
java.util.regex.Matcher countryMatcher = countryPattern.matcher(context);
if (countryMatcher.find()) {
String country = countryMatcher.group(1);
System.out.println(" Found country code: " + country + " for auction " + href);
return "Unknown, " + country;
}
System.out.println(" ⚠️ No location found for auction " + href); System.out.println(" ⚠️ No location found for auction " + href);
return "Unknown"; return "Unknown";
} }

View File

@@ -198,7 +198,9 @@ public class TroostwijkScraper {
* location contains ", NL" (indicating the Netherlands). Each auction link * location contains ", NL" (indicating the Netherlands). Each auction link
* contains a unique sale ID in the format A1-xxxxx or A7-xxxxx. * contains a unique sale ID in the format A1-xxxxx or A7-xxxxx.
* *
* @return a list of sale identifiers for auctions located in NL * Auctions are saved to the database and can be retrieved with getDutchAuctions().
*
* @return a list of sale identifiers for auctions located in NL (legacy compatibility)
*/ */
public List<Integer> discoverDutchAuctions() { public List<Integer> discoverDutchAuctions() {
Set<Integer> saleIds = new HashSet<>(); Set<Integer> saleIds = new HashSet<>();
@@ -246,7 +248,7 @@ public class TroostwijkScraper {
} }
} }
// Parse auctions from HTML // Parse auctions from HTML (saves Dutch auctions to database)
int foundOnPage = parseAuctionsFromHtml(html, saleIds); int foundOnPage = parseAuctionsFromHtml(html, saleIds);
if (foundOnPage == 0) { if (foundOnPage == 0) {
@@ -262,6 +264,21 @@ public class TroostwijkScraper {
return new ArrayList<>(saleIds); return new ArrayList<>(saleIds);
} }
/**
* Gets all Dutch auctions from the database.
* Call discoverDutchAuctions() first to populate the database.
*
* @return List of Dutch auctions with full metadata
*/
public List<AuctionInfo> getDutchAuctions() {
    // Single exit point: the result is either what the database returned
    // or an empty list when the lookup failed.
    List<AuctionInfo> dutchAuctions;
    try {
        dutchAuctions = db.getAuctionsByCountry("NL");
    } catch (SQLException e) {
        // Best effort: report the failure on stderr and hand back an empty,
        // mutable list instead of propagating the exception to the caller.
        System.err.println("Failed to retrieve Dutch auctions from database: " + e.getMessage());
        dutchAuctions = new ArrayList<>();
    }
    return dutchAuctions;
}
/** /**
* Fetches a single page using Playwright * Fetches a single page using Playwright
*/ */
@@ -305,51 +322,152 @@ public class TroostwijkScraper {
} }
/** /**
* Parses auctions from HTML and adds Dutch auctions to the set * Parses auctions from HTML using JSoup and saves Dutch auctions to database.
* Uses proper HTML parsing instead of regex for more reliable extraction.
* @return number of Dutch auctions found on this page * @return number of Dutch auctions found on this page
*/ */
private int parseAuctionsFromHtml(String html, Set<Integer> saleIds) { private int parseAuctionsFromHtml(String html, Set<Integer> saleIds) {
int foundCount = 0; int foundCount = 0;
// Simple regex-based parsing for auction links try {
java.util.regex.Pattern linkPattern = java.util.regex.Pattern.compile( org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(html);
"href=\"(/a/[^\"]+A[17]-(\\d+)[^\"]*)\"");
java.util.regex.Matcher linkMatcher = linkPattern.matcher(html);
while (linkMatcher.find()) { // Find all auction links (format: /a/title-A1-12345 or /a/title-A7-12345)
String href = linkMatcher.group(1); org.jsoup.select.Elements auctionLinks = doc.select("a[href^='/a/']");
int auctionId = Integer.parseInt(linkMatcher.group(2));
// Avoid duplicates for (org.jsoup.nodes.Element link : auctionLinks) {
String href = link.attr("href");
// Extract auction ID from URL
java.util.regex.Pattern pattern = java.util.regex.Pattern.compile("/a/.*?-A([17])-(\\d+)");
java.util.regex.Matcher matcher = pattern.matcher(href);
if (!matcher.find()) {
continue;
}
String typeNum = matcher.group(1);
int auctionId = Integer.parseInt(matcher.group(2));
// Skip duplicates
if (saleIds.contains(auctionId)) { if (saleIds.contains(auctionId)) {
continue; continue;
} }
// Check if this auction is Dutch (location contains ", NL") // Extract auction info using JSoup
if (isDutchAuction(html, href)) { AuctionInfo auction = extractAuctionInfo(link, href, auctionId, "A" + typeNum);
// Only keep Dutch auctions
if (auction != null && "NL".equals(auction.country)) {
saleIds.add(auctionId); saleIds.add(auctionId);
foundCount++; foundCount++;
System.out.println(" Found Dutch auction: " + auctionId + " - " + href);
// Save to database
try {
db.upsertAuction(auction);
System.out.println(" Found Dutch auction: " + auctionId + " - " + auction.title + " (" + auction.location + ")");
} catch (SQLException e) {
System.err.println(" Failed to save auction: " + e.getMessage());
} }
} }
}
} catch (Exception e) {
System.err.println(" Error parsing HTML: " + e.getMessage());
}
return foundCount; return foundCount;
} }
/** /**
* Checks if an auction is located in the Netherlands * Extracts auction information from a link element using JSoup
* This method intelligently parses the HTML structure to extract:
* - Title
* - Location (city and country)
* - Lot count (if available)
*/ */
private boolean isDutchAuction(String html, String href) { private AuctionInfo extractAuctionInfo(org.jsoup.nodes.Element link, String href, int auctionId, String type) {
int hrefPos = html.indexOf(href); AuctionInfo auction = new AuctionInfo();
if (hrefPos == -1) return false; auction.auctionId = auctionId;
auction.type = type;
auction.url = "https://www.troostwijkauctions.com" + href;
// Look at 1000 characters before and after the href for location info // Extract title from href (convert kebab-case to title)
int startPos = Math.max(hrefPos - 500, 0); java.util.regex.Pattern titlePattern = java.util.regex.Pattern.compile("/a/(.+?)-A[17]-");
int endPos = Math.min(hrefPos + 1000, html.length()); java.util.regex.Matcher titleMatcher = titlePattern.matcher(href);
String context = html.substring(startPos, endPos); if (titleMatcher.find()) {
String slug = titleMatcher.group(1);
auction.title = slug.replace("-", " ");
// Capitalize first letter
if (!auction.title.isEmpty()) {
auction.title = auction.title.substring(0, 1).toUpperCase() + auction.title.substring(1);
}
} else {
auction.title = "Unknown Auction";
}
// Look for ", NL" pattern // Try to find title in link text (more accurate)
return context.contains(", NL"); String linkText = link.text();
if (!linkText.isEmpty() && !linkText.matches(".*\\d+.*")) {
// If link text doesn't contain numbers, it's likely the title
String[] parts = linkText.split(",|\\d+");
if (parts.length > 0 && parts[0].trim().length() > 5) {
auction.title = parts[0].trim();
}
}
// Extract location using JSoup selectors
// Look for <p> tags that contain location info
org.jsoup.select.Elements locationElements = link.select("p");
for (org.jsoup.nodes.Element p : locationElements) {
String text = p.text();
// Pattern: "City, Country" or "City, Region, Country"
if (text.matches(".*[A-Z]{2}$")) {
// Ends with 2-letter country code
String countryCode = text.substring(text.length() - 2);
String cityPart = text.substring(0, text.length() - 2).trim();
// Remove trailing comma or whitespace
cityPart = cityPart.replaceAll("[,\\s]+$", "");
auction.country = countryCode;
auction.city = cityPart;
auction.location = cityPart + ", " + countryCode;
break;
}
}
// Fallback: check HTML content directly
if (auction.country == null) {
String html = link.html();
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
"([A-Za-z][A-Za-z\\s,\\-']+?)\\s*(?:<!--.*?-->)?\\s*</span>\\s*([A-Z]{2})(?![A-Za-z])");
java.util.regex.Matcher locMatcher = locPattern.matcher(html);
if (locMatcher.find()) {
String city = locMatcher.group(1).trim().replaceAll(",$", "");
String country = locMatcher.group(2);
auction.city = city;
auction.country = country;
auction.location = city + ", " + country;
}
}
// Extract lot count if available (kavels/lots)
org.jsoup.select.Elements textElements = link.select("*");
for (org.jsoup.nodes.Element elem : textElements) {
String text = elem.ownText();
if (text.matches("\\d+\\s+(?:kavel|lot|item)s?.*")) {
java.util.regex.Pattern countPattern = java.util.regex.Pattern.compile("(\\d+)");
java.util.regex.Matcher countMatcher = countPattern.matcher(text);
if (countMatcher.find()) {
auction.lotCount = Integer.parseInt(countMatcher.group(1));
break;
}
}
}
return auction;
} }
/** /**
@@ -664,6 +782,27 @@ public class TroostwijkScraper {
// Domain classes and services // Domain classes and services
// ---------------------------------------------------------------------- // ----------------------------------------------------------------------
/**
* Represents auction metadata (veiling informatie)
*/
public static class AuctionInfo {
    // Plain mutable data holder: every field is public and assigned directly
    // by the code that builds or loads an auction.

    public int auctionId; // Unique auction ID (numeric part taken from the auction URL)
    public String title; // Auction title
    public String location; // Combined location text (e.g., "Amsterdam, NL")
    public String city; // City name
    public String country; // Two-letter country code (e.g., "NL")
    public String url; // Full auction URL
    public String type; // Auction type ("A1" or "A7")
    public int lotCount; // Number of lots/kavels; stays 0 (int default) when never set
    public LocalDateTime closingTime; // Closing time if available; null when unknown

    /**
     * Builds a one-line summary of this auction for logging/debugging.
     * Only id, type, title, location, lot count and url are included;
     * city, country and closingTime are not part of the output.
     *
     * @return formatted summary string
     */
    @Override
    public String toString() {
        return String.format("Auction{id=%d, type=%s, title='%s', location='%s', lots=%d, url='%s'}",
                auctionId, type, title, location, lotCount, url);
    }
}
/** /**
* Simple POJO representing a lot (kavel) in an auction. It keeps track * Simple POJO representing a lot (kavel) in an auction. It keeps track
* of the sale it belongs to, current bid and closing time. The method * of the sale it belongs to, current bid and closing time. The method
@@ -704,18 +843,33 @@ public class TroostwijkScraper {
} }
/** /**
* Creates tables if they do not already exist. The schema includes * Creates tables if they do not already exist. The schema includes
* tables for sales, lots, images, and object labels. This method is * tables for auctions, lots, images, and object labels. This method is
* idempotent; it can be called multiple times. * idempotent; it can be called multiple times.
*/ */
void ensureSchema() throws SQLException { void ensureSchema() throws SQLException {
try (Connection conn = DriverManager.getConnection(url); Statement stmt = conn.createStatement()) { try (Connection conn = DriverManager.getConnection(url); Statement stmt = conn.createStatement()) {
// Sales table // Auctions table (veilingen)
stmt.execute("CREATE TABLE IF NOT EXISTS auctions ("
+ "auction_id INTEGER PRIMARY KEY,"
+ "title TEXT NOT NULL,"
+ "location TEXT,"
+ "city TEXT,"
+ "country TEXT,"
+ "url TEXT NOT NULL,"
+ "type TEXT,"
+ "lot_count INTEGER DEFAULT 0,"
+ "closing_time TEXT,"
+ "discovered_at INTEGER" // Unix timestamp
+ ")");
// Sales table (legacy - keep for compatibility)
stmt.execute("CREATE TABLE IF NOT EXISTS sales (" stmt.execute("CREATE TABLE IF NOT EXISTS sales ("
+ "sale_id INTEGER PRIMARY KEY," + "sale_id INTEGER PRIMARY KEY,"
+ "title TEXT," + "title TEXT,"
+ "location TEXT," + "location TEXT,"
+ "closing_time TEXT" + "closing_time TEXT"
+ ")"); + ")");
// Lots table // Lots table
stmt.execute("CREATE TABLE IF NOT EXISTS lots (" stmt.execute("CREATE TABLE IF NOT EXISTS lots ("
+ "lot_id INTEGER PRIMARY KEY," + "lot_id INTEGER PRIMARY KEY,"
@@ -731,8 +885,9 @@ public class TroostwijkScraper {
+ "url TEXT," + "url TEXT,"
+ "closing_time TEXT," + "closing_time TEXT,"
+ "closing_notified INTEGER DEFAULT 0," + "closing_notified INTEGER DEFAULT 0,"
+ "FOREIGN KEY (sale_id) REFERENCES sales(sale_id)" + "FOREIGN KEY (sale_id) REFERENCES auctions(auction_id)"
+ ")"); + ")");
// Images table // Images table
stmt.execute("CREATE TABLE IF NOT EXISTS images (" stmt.execute("CREATE TABLE IF NOT EXISTS images ("
+ "id INTEGER PRIMARY KEY AUTOINCREMENT," + "id INTEGER PRIMARY KEY AUTOINCREMENT,"
@@ -742,9 +897,99 @@ public class TroostwijkScraper {
+ "labels TEXT," + "labels TEXT,"
+ "FOREIGN KEY (lot_id) REFERENCES lots(lot_id)" + "FOREIGN KEY (lot_id) REFERENCES lots(lot_id)"
+ ")"); + ")");
// Create indexes for better query performance
stmt.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country)");
stmt.execute("CREATE INDEX IF NOT EXISTS idx_lots_sale_id ON lots(sale_id)");
} }
} }
/**
* Inserts or updates an auction record
*/
synchronized void upsertAuction(AuctionInfo auction) throws SQLException {
    // Insert a new row, or refresh every mutable column of the existing row
    // with the same auction_id. discovered_at is only written on insert, so
    // the first-seen timestamp survives later updates.
    final String upsertSql =
            "INSERT INTO auctions (auction_id, title, location, city, country, url, type, lot_count, closing_time, discovered_at)"
            + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
            + " ON CONFLICT(auction_id) DO UPDATE SET "
            + "title = excluded.title, location = excluded.location, city = excluded.city, "
            + "country = excluded.country, url = excluded.url, type = excluded.type, "
            + "lot_count = excluded.lot_count, closing_time = excluded.closing_time";

    try (Connection connection = DriverManager.getConnection(url);
         PreparedStatement insert = connection.prepareStatement(upsertSql)) {
        // closing_time is stored as the LocalDateTime's text form, or NULL when unknown.
        String closingText;
        if (auction.closingTime == null) {
            closingText = null;
        } else {
            closingText = auction.closingTime.toString();
        }

        insert.setInt(1, auction.auctionId);
        insert.setString(2, auction.title);
        insert.setString(3, auction.location);
        insert.setString(4, auction.city);
        insert.setString(5, auction.country);
        insert.setString(6, auction.url);
        insert.setString(7, auction.type);
        insert.setInt(8, auction.lotCount);
        insert.setString(9, closingText);
        // discovered_at: current time as Unix epoch seconds.
        insert.setLong(10, Instant.now().getEpochSecond());
        insert.executeUpdate();
    }
}
/**
* Retrieves all auctions from the database
*/
synchronized List<AuctionInfo> getAllAuctions() throws SQLException {
    String sql = "SELECT auction_id, title, location, city, country, url, type, lot_count, closing_time FROM auctions";
    List<AuctionInfo> auctions = new ArrayList<>();
    // The ResultSet is declared as a resource so it is closed explicitly,
    // in reverse order, together with the statement and the connection.
    try (Connection conn = DriverManager.getConnection(url);
         Statement stmt = conn.createStatement();
         ResultSet rs = stmt.executeQuery(sql)) {
        while (rs.next()) {
            auctions.add(readAuctionRow(rs));
        }
    }
    return auctions;
}

/**
 * Retrieves auctions by country code.
 *
 * @param countryCode value compared for equality against the {@code country} column (e.g. "NL")
 * @return the matching auctions; empty when no row matches
 * @throws SQLException if the connection or the query fails
 */
synchronized List<AuctionInfo> getAuctionsByCountry(String countryCode) throws SQLException {
    String sql = "SELECT auction_id, title, location, city, country, url, type, lot_count, closing_time "
            + "FROM auctions WHERE country = ?";
    List<AuctionInfo> auctions = new ArrayList<>();
    try (Connection conn = DriverManager.getConnection(url);
         PreparedStatement ps = conn.prepareStatement(sql)) {
        ps.setString(1, countryCode);
        // Nested try: the parameter must be bound before the query is executed,
        // and the ResultSet is still closed explicitly.
        try (ResultSet rs = ps.executeQuery()) {
            while (rs.next()) {
                auctions.add(readAuctionRow(rs));
            }
        }
    }
    return auctions;
}

/**
 * Maps the current row of an auctions query onto a new AuctionInfo.
 * Shared by the auction queries so the column-to-field mapping lives in one place.
 *
 * @param rs result set positioned on a row that exposes the columns auction_id, title,
 *           location, city, country, url, type, lot_count and closing_time
 * @return a populated AuctionInfo; closingTime stays null when the column is NULL
 * @throws SQLException if a column cannot be read
 */
private AuctionInfo readAuctionRow(ResultSet rs) throws SQLException {
    AuctionInfo auction = new AuctionInfo();
    auction.auctionId = rs.getInt("auction_id");
    auction.title = rs.getString("title");
    auction.location = rs.getString("location");
    auction.city = rs.getString("city");
    auction.country = rs.getString("country");
    auction.url = rs.getString("url");
    auction.type = rs.getString("type");
    auction.lotCount = rs.getInt("lot_count");
    // closing_time is stored as text; parse it back only when present.
    String closing = rs.getString("closing_time");
    if (closing != null) {
        auction.closingTime = LocalDateTime.parse(closing);
    }
    return auction;
}
/** /**
* Inserts or updates a lot record. Uses INSERT OR REPLACE to * Inserts or updates a lot record. Uses INSERT OR REPLACE to
* implement upsert semantics so that existing rows are replaced. * implement upsert semantics so that existing rows are replaced.

File diff suppressed because one or more lines are too long