start
This commit is contained in:
1
.idea/data_source_mapping.xml
generated
1
.idea/data_source_mapping.xml
generated
@@ -2,5 +2,6 @@
|
||||
<project version="4">
|
||||
<component name="DataSourcePerFileMappings">
|
||||
<file url="file://$APPLICATION_CONFIG_DIR$/consoles/db/9ac1bd58-f7d6-4a86-a165-f0a24a2b7cb9/console.sql" value="9ac1bd58-f7d6-4a86-a165-f0a24a2b7cb9" />
|
||||
<file url="file://$APPLICATION_CONFIG_DIR$/consoles/db/9cb4e997-fbca-4426-9093-d308871c5d5e/console.sql" value="9cb4e997-fbca-4426-9093-d308871c5d5e" />
|
||||
</component>
|
||||
</project>
|
||||
BIN
cache/page_cache.db
vendored
BIN
cache/page_cache.db
vendored
Binary file not shown.
@@ -308,8 +308,7 @@ public class TroostwijkAuctionExtractor {
|
||||
int endPos = Math.min(hrefPos + 1000, html.length());
|
||||
String context = html.substring(startPos, endPos);
|
||||
|
||||
// Try to find location pattern like "City, NL" or "City, Country"
|
||||
// More flexible pattern to catch various location formats
|
||||
// Pattern 1: Classic format "City, NL"
|
||||
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
|
||||
"([A-Za-z][A-Za-z\\s\\-']+),\\s*([A-Z]{2})(?![A-Za-z])");
|
||||
java.util.regex.Matcher locMatcher = locPattern.matcher(context);
|
||||
@@ -320,6 +319,31 @@ public class TroostwijkAuctionExtractor {
|
||||
return location;
|
||||
}
|
||||
|
||||
// Pattern 2: HTML format like "<span>City,<!-- --> </span>NL"
|
||||
// Extract city and country code separately
|
||||
java.util.regex.Pattern htmlPattern = java.util.regex.Pattern.compile(
|
||||
"<span[^>]*>([A-Za-z][A-Za-z\\s\\-',]+?)(?:,)?\\s*(?:<!--.*?-->)?\\s*</span>\\s*([A-Z]{2})(?![A-Za-z])");
|
||||
java.util.regex.Matcher htmlMatcher = htmlPattern.matcher(context);
|
||||
|
||||
if (htmlMatcher.find()) {
|
||||
String city = htmlMatcher.group(1).trim().replaceAll(",$", ""); // Remove trailing comma
|
||||
String country = htmlMatcher.group(2);
|
||||
String location = city + ", " + country;
|
||||
System.out.println(" Found location (HTML): " + location + " for auction " + href);
|
||||
return location;
|
||||
}
|
||||
|
||||
// Pattern 3: Fallback - just find country code after HTML tags
|
||||
java.util.regex.Pattern countryPattern = java.util.regex.Pattern.compile(
|
||||
"(?:-->|</span>|</div>)\\s*([A-Z]{2})(?![A-Za-z])");
|
||||
java.util.regex.Matcher countryMatcher = countryPattern.matcher(context);
|
||||
|
||||
if (countryMatcher.find()) {
|
||||
String country = countryMatcher.group(1);
|
||||
System.out.println(" Found country code: " + country + " for auction " + href);
|
||||
return "Unknown, " + country;
|
||||
}
|
||||
|
||||
System.out.println(" ⚠️ No location found for auction " + href);
|
||||
return "Unknown";
|
||||
}
|
||||
|
||||
@@ -198,7 +198,9 @@ public class TroostwijkScraper {
|
||||
* location contains ", NL" (indicating the Netherlands). Each auction link
|
||||
* contains a unique sale ID in the format A1-xxxxx or A7-xxxxx.
|
||||
*
|
||||
* @return a list of sale identifiers for auctions located in NL
|
||||
* Auctions are saved to the database and can be retrieved with getDutchAuctions().
|
||||
*
|
||||
* @return a list of sale identifiers for auctions located in NL (legacy compatibility)
|
||||
*/
|
||||
public List<Integer> discoverDutchAuctions() {
|
||||
Set<Integer> saleIds = new HashSet<>();
|
||||
@@ -246,7 +248,7 @@ public class TroostwijkScraper {
|
||||
}
|
||||
}
|
||||
|
||||
// Parse auctions from HTML
|
||||
// Parse auctions from HTML (saves Dutch auctions to database)
|
||||
int foundOnPage = parseAuctionsFromHtml(html, saleIds);
|
||||
|
||||
if (foundOnPage == 0) {
|
||||
@@ -262,6 +264,21 @@ public class TroostwijkScraper {
|
||||
return new ArrayList<>(saleIds);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets all Dutch auctions from the database.
|
||||
* Call discoverDutchAuctions() first to populate the database.
|
||||
*
|
||||
* @return List of Dutch auctions with full metadata
|
||||
*/
|
||||
public List<AuctionInfo> getDutchAuctions() {
|
||||
try {
|
||||
return db.getAuctionsByCountry("NL");
|
||||
} catch (SQLException e) {
|
||||
System.err.println("Failed to retrieve Dutch auctions from database: " + e.getMessage());
|
||||
return new ArrayList<>();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches a single page using Playwright
|
||||
*/
|
||||
@@ -305,51 +322,152 @@ public class TroostwijkScraper {
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses auctions from HTML and adds Dutch auctions to the set
|
||||
* Parses auctions from HTML using JSoup and saves Dutch auctions to database.
|
||||
* Uses proper HTML parsing instead of regex for more reliable extraction.
|
||||
* @return number of Dutch auctions found on this page
|
||||
*/
|
||||
private int parseAuctionsFromHtml(String html, Set<Integer> saleIds) {
|
||||
int foundCount = 0;
|
||||
|
||||
// Simple regex-based parsing for auction links
|
||||
java.util.regex.Pattern linkPattern = java.util.regex.Pattern.compile(
|
||||
"href=\"(/a/[^\"]+A[17]-(\\d+)[^\"]*)\"");
|
||||
java.util.regex.Matcher linkMatcher = linkPattern.matcher(html);
|
||||
try {
|
||||
org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(html);
|
||||
|
||||
while (linkMatcher.find()) {
|
||||
String href = linkMatcher.group(1);
|
||||
int auctionId = Integer.parseInt(linkMatcher.group(2));
|
||||
// Find all auction links (format: /a/title-A1-12345 or /a/title-A7-12345)
|
||||
org.jsoup.select.Elements auctionLinks = doc.select("a[href^='/a/']");
|
||||
|
||||
// Avoid duplicates
|
||||
if (saleIds.contains(auctionId)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if this auction is Dutch (location contains ", NL")
|
||||
if (isDutchAuction(html, href)) {
|
||||
saleIds.add(auctionId);
|
||||
foundCount++;
|
||||
System.out.println(" Found Dutch auction: " + auctionId + " - " + href);
|
||||
for (org.jsoup.nodes.Element link : auctionLinks) {
|
||||
String href = link.attr("href");
|
||||
|
||||
// Extract auction ID from URL
|
||||
java.util.regex.Pattern pattern = java.util.regex.Pattern.compile("/a/.*?-A([17])-(\\d+)");
|
||||
java.util.regex.Matcher matcher = pattern.matcher(href);
|
||||
|
||||
if (!matcher.find()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
String typeNum = matcher.group(1);
|
||||
int auctionId = Integer.parseInt(matcher.group(2));
|
||||
|
||||
// Skip duplicates
|
||||
if (saleIds.contains(auctionId)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Extract auction info using JSoup
|
||||
AuctionInfo auction = extractAuctionInfo(link, href, auctionId, "A" + typeNum);
|
||||
|
||||
// Only keep Dutch auctions
|
||||
if (auction != null && "NL".equals(auction.country)) {
|
||||
saleIds.add(auctionId);
|
||||
foundCount++;
|
||||
|
||||
// Save to database
|
||||
try {
|
||||
db.upsertAuction(auction);
|
||||
System.out.println(" Found Dutch auction: " + auctionId + " - " + auction.title + " (" + auction.location + ")");
|
||||
} catch (SQLException e) {
|
||||
System.err.println(" Failed to save auction: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
System.err.println(" Error parsing HTML: " + e.getMessage());
|
||||
}
|
||||
|
||||
return foundCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if an auction is located in the Netherlands
|
||||
* Extracts auction information from a link element using JSoup
|
||||
* This method intelligently parses the HTML structure to extract:
|
||||
* - Title
|
||||
* - Location (city and country)
|
||||
* - Lot count (if available)
|
||||
*/
|
||||
private boolean isDutchAuction(String html, String href) {
|
||||
int hrefPos = html.indexOf(href);
|
||||
if (hrefPos == -1) return false;
|
||||
private AuctionInfo extractAuctionInfo(org.jsoup.nodes.Element link, String href, int auctionId, String type) {
|
||||
AuctionInfo auction = new AuctionInfo();
|
||||
auction.auctionId = auctionId;
|
||||
auction.type = type;
|
||||
auction.url = "https://www.troostwijkauctions.com" + href;
|
||||
|
||||
// Look at 1000 characters before and after the href for location info
|
||||
int startPos = Math.max(hrefPos - 500, 0);
|
||||
int endPos = Math.min(hrefPos + 1000, html.length());
|
||||
String context = html.substring(startPos, endPos);
|
||||
// Extract title from href (convert kebab-case to title)
|
||||
java.util.regex.Pattern titlePattern = java.util.regex.Pattern.compile("/a/(.+?)-A[17]-");
|
||||
java.util.regex.Matcher titleMatcher = titlePattern.matcher(href);
|
||||
if (titleMatcher.find()) {
|
||||
String slug = titleMatcher.group(1);
|
||||
auction.title = slug.replace("-", " ");
|
||||
// Capitalize first letter
|
||||
if (!auction.title.isEmpty()) {
|
||||
auction.title = auction.title.substring(0, 1).toUpperCase() + auction.title.substring(1);
|
||||
}
|
||||
} else {
|
||||
auction.title = "Unknown Auction";
|
||||
}
|
||||
|
||||
// Look for ", NL" pattern
|
||||
return context.contains(", NL");
|
||||
// Try to find title in link text (more accurate)
|
||||
String linkText = link.text();
|
||||
if (!linkText.isEmpty() && !linkText.matches(".*\\d+.*")) {
|
||||
// If link text doesn't contain numbers, it's likely the title
|
||||
String[] parts = linkText.split(",|\\d+");
|
||||
if (parts.length > 0 && parts[0].trim().length() > 5) {
|
||||
auction.title = parts[0].trim();
|
||||
}
|
||||
}
|
||||
|
||||
// Extract location using JSoup selectors
|
||||
// Look for <p> tags that contain location info
|
||||
org.jsoup.select.Elements locationElements = link.select("p");
|
||||
for (org.jsoup.nodes.Element p : locationElements) {
|
||||
String text = p.text();
|
||||
|
||||
// Pattern: "City, Country" or "City, Region, Country"
|
||||
if (text.matches(".*[A-Z]{2}$")) {
|
||||
// Ends with 2-letter country code
|
||||
String countryCode = text.substring(text.length() - 2);
|
||||
String cityPart = text.substring(0, text.length() - 2).trim();
|
||||
|
||||
// Remove trailing comma or whitespace
|
||||
cityPart = cityPart.replaceAll("[,\\s]+$", "");
|
||||
|
||||
auction.country = countryCode;
|
||||
auction.city = cityPart;
|
||||
auction.location = cityPart + ", " + countryCode;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: check HTML content directly
|
||||
if (auction.country == null) {
|
||||
String html = link.html();
|
||||
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
|
||||
"([A-Za-z][A-Za-z\\s,\\-']+?)\\s*(?:<!--.*?-->)?\\s*</span>\\s*([A-Z]{2})(?![A-Za-z])");
|
||||
java.util.regex.Matcher locMatcher = locPattern.matcher(html);
|
||||
|
||||
if (locMatcher.find()) {
|
||||
String city = locMatcher.group(1).trim().replaceAll(",$", "");
|
||||
String country = locMatcher.group(2);
|
||||
auction.city = city;
|
||||
auction.country = country;
|
||||
auction.location = city + ", " + country;
|
||||
}
|
||||
}
|
||||
|
||||
// Extract lot count if available (kavels/lots)
|
||||
org.jsoup.select.Elements textElements = link.select("*");
|
||||
for (org.jsoup.nodes.Element elem : textElements) {
|
||||
String text = elem.ownText();
|
||||
if (text.matches("\\d+\\s+(?:kavel|lot|item)s?.*")) {
|
||||
java.util.regex.Pattern countPattern = java.util.regex.Pattern.compile("(\\d+)");
|
||||
java.util.regex.Matcher countMatcher = countPattern.matcher(text);
|
||||
if (countMatcher.find()) {
|
||||
auction.lotCount = Integer.parseInt(countMatcher.group(1));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return auction;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -664,6 +782,27 @@ public class TroostwijkScraper {
|
||||
// Domain classes and services
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Represents auction metadata (veiling informatie)
|
||||
*/
|
||||
public static class AuctionInfo {
|
||||
public int auctionId; // Unique auction ID (from URL)
|
||||
public String title; // Auction title
|
||||
public String location; // Location (e.g., "Amsterdam, NL")
|
||||
public String city; // City name
|
||||
public String country; // Country code (e.g., "NL")
|
||||
public String url; // Full auction URL
|
||||
public String type; // Auction type (A1 or A7)
|
||||
public int lotCount; // Number of lots/kavels
|
||||
public LocalDateTime closingTime; // Closing time if available
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("Auction{id=%d, type=%s, title='%s', location='%s', lots=%d, url='%s'}",
|
||||
auctionId, type, title, location, lotCount, url);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple POJO representing a lot (kavel) in an auction. It keeps track
|
||||
* of the sale it belongs to, current bid and closing time. The method
|
||||
@@ -704,18 +843,33 @@ public class TroostwijkScraper {
|
||||
}
|
||||
/**
|
||||
* Creates tables if they do not already exist. The schema includes
|
||||
* tables for sales, lots, images, and object labels. This method is
|
||||
* tables for auctions, lots, images, and object labels. This method is
|
||||
* idempotent; it can be called multiple times.
|
||||
*/
|
||||
void ensureSchema() throws SQLException {
|
||||
try (Connection conn = DriverManager.getConnection(url); Statement stmt = conn.createStatement()) {
|
||||
// Sales table
|
||||
// Auctions table (veilingen)
|
||||
stmt.execute("CREATE TABLE IF NOT EXISTS auctions ("
|
||||
+ "auction_id INTEGER PRIMARY KEY,"
|
||||
+ "title TEXT NOT NULL,"
|
||||
+ "location TEXT,"
|
||||
+ "city TEXT,"
|
||||
+ "country TEXT,"
|
||||
+ "url TEXT NOT NULL,"
|
||||
+ "type TEXT,"
|
||||
+ "lot_count INTEGER DEFAULT 0,"
|
||||
+ "closing_time TEXT,"
|
||||
+ "discovered_at INTEGER" // Unix timestamp
|
||||
+ ")");
|
||||
|
||||
// Sales table (legacy - keep for compatibility)
|
||||
stmt.execute("CREATE TABLE IF NOT EXISTS sales ("
|
||||
+ "sale_id INTEGER PRIMARY KEY,"
|
||||
+ "title TEXT,"
|
||||
+ "location TEXT,"
|
||||
+ "closing_time TEXT"
|
||||
+ ")");
|
||||
|
||||
// Lots table
|
||||
stmt.execute("CREATE TABLE IF NOT EXISTS lots ("
|
||||
+ "lot_id INTEGER PRIMARY KEY,"
|
||||
@@ -731,8 +885,9 @@ public class TroostwijkScraper {
|
||||
+ "url TEXT,"
|
||||
+ "closing_time TEXT,"
|
||||
+ "closing_notified INTEGER DEFAULT 0,"
|
||||
+ "FOREIGN KEY (sale_id) REFERENCES sales(sale_id)"
|
||||
+ "FOREIGN KEY (sale_id) REFERENCES auctions(auction_id)"
|
||||
+ ")");
|
||||
|
||||
// Images table
|
||||
stmt.execute("CREATE TABLE IF NOT EXISTS images ("
|
||||
+ "id INTEGER PRIMARY KEY AUTOINCREMENT,"
|
||||
@@ -742,9 +897,99 @@ public class TroostwijkScraper {
|
||||
+ "labels TEXT,"
|
||||
+ "FOREIGN KEY (lot_id) REFERENCES lots(lot_id)"
|
||||
+ ")");
|
||||
|
||||
// Create indexes for better query performance
|
||||
stmt.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country)");
|
||||
stmt.execute("CREATE INDEX IF NOT EXISTS idx_lots_sale_id ON lots(sale_id)");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Inserts or updates an auction record
|
||||
*/
|
||||
synchronized void upsertAuction(AuctionInfo auction) throws SQLException {
|
||||
String sql = "INSERT INTO auctions (auction_id, title, location, city, country, url, type, lot_count, closing_time, discovered_at)"
|
||||
+ " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
|
||||
+ " ON CONFLICT(auction_id) DO UPDATE SET "
|
||||
+ "title = excluded.title, location = excluded.location, city = excluded.city, "
|
||||
+ "country = excluded.country, url = excluded.url, type = excluded.type, "
|
||||
+ "lot_count = excluded.lot_count, closing_time = excluded.closing_time";
|
||||
|
||||
try (Connection conn = DriverManager.getConnection(url); PreparedStatement ps = conn.prepareStatement(sql)) {
|
||||
ps.setInt(1, auction.auctionId);
|
||||
ps.setString(2, auction.title);
|
||||
ps.setString(3, auction.location);
|
||||
ps.setString(4, auction.city);
|
||||
ps.setString(5, auction.country);
|
||||
ps.setString(6, auction.url);
|
||||
ps.setString(7, auction.type);
|
||||
ps.setInt(8, auction.lotCount);
|
||||
ps.setString(9, auction.closingTime != null ? auction.closingTime.toString() : null);
|
||||
ps.setLong(10, Instant.now().getEpochSecond());
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves all auctions from the database
|
||||
*/
|
||||
synchronized List<AuctionInfo> getAllAuctions() throws SQLException {
|
||||
List<AuctionInfo> auctions = new ArrayList<>();
|
||||
String sql = "SELECT auction_id, title, location, city, country, url, type, lot_count, closing_time FROM auctions";
|
||||
|
||||
try (Connection conn = DriverManager.getConnection(url); Statement stmt = conn.createStatement()) {
|
||||
ResultSet rs = stmt.executeQuery(sql);
|
||||
while (rs.next()) {
|
||||
AuctionInfo auction = new AuctionInfo();
|
||||
auction.auctionId = rs.getInt("auction_id");
|
||||
auction.title = rs.getString("title");
|
||||
auction.location = rs.getString("location");
|
||||
auction.city = rs.getString("city");
|
||||
auction.country = rs.getString("country");
|
||||
auction.url = rs.getString("url");
|
||||
auction.type = rs.getString("type");
|
||||
auction.lotCount = rs.getInt("lot_count");
|
||||
String closing = rs.getString("closing_time");
|
||||
if (closing != null) {
|
||||
auction.closingTime = LocalDateTime.parse(closing);
|
||||
}
|
||||
auctions.add(auction);
|
||||
}
|
||||
}
|
||||
return auctions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves auctions by country code
|
||||
*/
|
||||
synchronized List<AuctionInfo> getAuctionsByCountry(String countryCode) throws SQLException {
|
||||
List<AuctionInfo> auctions = new ArrayList<>();
|
||||
String sql = "SELECT auction_id, title, location, city, country, url, type, lot_count, closing_time "
|
||||
+ "FROM auctions WHERE country = ?";
|
||||
|
||||
try (Connection conn = DriverManager.getConnection(url); PreparedStatement ps = conn.prepareStatement(sql)) {
|
||||
ps.setString(1, countryCode);
|
||||
ResultSet rs = ps.executeQuery();
|
||||
while (rs.next()) {
|
||||
AuctionInfo auction = new AuctionInfo();
|
||||
auction.auctionId = rs.getInt("auction_id");
|
||||
auction.title = rs.getString("title");
|
||||
auction.location = rs.getString("location");
|
||||
auction.city = rs.getString("city");
|
||||
auction.country = rs.getString("country");
|
||||
auction.url = rs.getString("url");
|
||||
auction.type = rs.getString("type");
|
||||
auction.lotCount = rs.getInt("lot_count");
|
||||
String closing = rs.getString("closing_time");
|
||||
if (closing != null) {
|
||||
auction.closingTime = LocalDateTime.parse(closing);
|
||||
}
|
||||
auctions.add(auction);
|
||||
}
|
||||
}
|
||||
return auctions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Inserts or updates a lot record. Uses INSERT OR REPLACE to
|
||||
* implement upsert semantics so that existing rows are replaced.
|
||||
|
||||
205
src/test/resources/test_auctions.html
Normal file
205
src/test/resources/test_auctions.html
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user