diff --git a/.idea/data_source_mapping.xml b/.idea/data_source_mapping.xml index 4e6c19e..68185f7 100644 --- a/.idea/data_source_mapping.xml +++ b/.idea/data_source_mapping.xml @@ -2,5 +2,6 @@ + \ No newline at end of file diff --git a/cache/page_cache.db b/cache/page_cache.db index 62a5c7e..1b423a3 100644 Binary files a/cache/page_cache.db and b/cache/page_cache.db differ diff --git a/src/main/java/com/auction/TroostwijkAuctionExtractor.java b/src/main/java/com/auction/TroostwijkAuctionExtractor.java index 7891615..46be1c2 100644 --- a/src/main/java/com/auction/TroostwijkAuctionExtractor.java +++ b/src/main/java/com/auction/TroostwijkAuctionExtractor.java @@ -308,8 +308,7 @@ public class TroostwijkAuctionExtractor { int endPos = Math.min(hrefPos + 1000, html.length()); String context = html.substring(startPos, endPos); - // Try to find location pattern like "City, NL" or "City, Country" - // More flexible pattern to catch various location formats + // Pattern 1: Classic format "City, NL" java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile( "([A-Za-z][A-Za-z\\s\\-']+),\\s*([A-Z]{2})(?![A-Za-z])"); java.util.regex.Matcher locMatcher = locPattern.matcher(context); @@ -320,6 +319,31 @@ public class TroostwijkAuctionExtractor { return location; } + // Pattern 2: HTML format like "City, NL" + // Extract city and country code separately + java.util.regex.Pattern htmlPattern = java.util.regex.Pattern.compile( + "]*>([A-Za-z][A-Za-z\\s\\-',]+?)(?:,)?\\s*(?:)?\\s*\\s*([A-Z]{2})(?![A-Za-z])"); + java.util.regex.Matcher htmlMatcher = htmlPattern.matcher(context); + + if (htmlMatcher.find()) { + String city = htmlMatcher.group(1).trim().replaceAll(",$", ""); // Remove trailing comma + String country = htmlMatcher.group(2); + String location = city + ", " + country; + System.out.println(" Found location (HTML): " + location + " for auction " + href); + return location; + } + + // Pattern 3: Fallback - just find country code after HTML tags + java.util.regex.Pattern countryPattern = java.util.regex.Pattern.compile( + "(?:-->||)\\s*([A-Z]{2})(?![A-Za-z])"); + java.util.regex.Matcher countryMatcher = countryPattern.matcher(context); + + if (countryMatcher.find()) { + String country = countryMatcher.group(1); + System.out.println(" Found country code: " + country + " for auction " + href); + return "Unknown, " + country; + } + System.out.println(" ⚠️ No location found for auction " + href); return "Unknown"; } diff --git a/src/main/java/com/auction/TroostwijkScraper.java b/src/main/java/com/auction/TroostwijkScraper.java index 59fa79c..33ca429 100644 --- a/src/main/java/com/auction/TroostwijkScraper.java +++ b/src/main/java/com/auction/TroostwijkScraper.java @@ -198,7 +198,9 @@ public class TroostwijkScraper { * location contains ", NL" (indicating the Netherlands). Each auction link * contains a unique sale ID in the format A1-xxxxx or A7-xxxxx. * - * @return a list of sale identifiers for auctions located in NL + * Auctions are saved to the database and can be retrieved with getDutchAuctions(). + * + * @return a list of sale identifiers for auctions located in NL (legacy compatibility) */ public List discoverDutchAuctions() { Set saleIds = new HashSet<>(); @@ -246,7 +248,7 @@ public class TroostwijkScraper { } } - // Parse auctions from HTML + // Parse auctions from HTML (saves Dutch auctions to database) int foundOnPage = parseAuctionsFromHtml(html, saleIds); if (foundOnPage == 0) { @@ -262,6 +264,21 @@ public class TroostwijkScraper { return new ArrayList<>(saleIds); } + /** + * Gets all Dutch auctions from the database. + * Call discoverDutchAuctions() first to populate the database. + * + * @return List of Dutch auctions with full metadata + */ + public List getDutchAuctions() { + try { + return db.getAuctionsByCountry("NL"); + } catch (SQLException e) { + System.err.println("Failed to retrieve Dutch auctions from database: " + e.getMessage()); + return new ArrayList<>(); + } + } + /** * Fetches a single page using Playwright */ @@ -305,51 +322,152 @@ public class TroostwijkScraper { } /** - * Parses auctions from HTML and adds Dutch auctions to the set + * Parses auctions from HTML using JSoup and saves Dutch auctions to database. + * Uses proper HTML parsing instead of regex for more reliable extraction. * @return number of Dutch auctions found on this page */ private int parseAuctionsFromHtml(String html, Set saleIds) { int foundCount = 0; - // Simple regex-based parsing for auction links - java.util.regex.Pattern linkPattern = java.util.regex.Pattern.compile( - "href=\"(/a/[^\"]+A[17]-(\\d+)[^\"]*)\""); - java.util.regex.Matcher linkMatcher = linkPattern.matcher(html); + try { + org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(html); - while (linkMatcher.find()) { - String href = linkMatcher.group(1); - int auctionId = Integer.parseInt(linkMatcher.group(2)); + // Find all auction links (format: /a/title-A1-12345 or /a/title-A7-12345) + org.jsoup.select.Elements auctionLinks = doc.select("a[href^='/a/']"); - // Avoid duplicates - if (saleIds.contains(auctionId)) { - continue; - } - - // Check if this auction is Dutch (location contains ", NL") - if (isDutchAuction(html, href)) { - saleIds.add(auctionId); - foundCount++; - System.out.println(" Found Dutch auction: " + auctionId + " - " + href); + for (org.jsoup.nodes.Element link : auctionLinks) { + String href = link.attr("href"); + + // Extract auction ID from URL + java.util.regex.Pattern pattern = java.util.regex.Pattern.compile("/a/.*?-A([17])-(\\d+)"); + java.util.regex.Matcher matcher = pattern.matcher(href); + + if (!matcher.find()) { + continue; + } + + String typeNum = matcher.group(1); + int auctionId = Integer.parseInt(matcher.group(2)); + + // Skip duplicates + if (saleIds.contains(auctionId)) { + continue; + } + + // Extract auction info using JSoup + AuctionInfo auction = extractAuctionInfo(link, href, auctionId, "A" + typeNum); + + // Only keep Dutch auctions + if (auction != null && "NL".equals(auction.country)) { + saleIds.add(auctionId); + foundCount++; + + // Save to database + try { + db.upsertAuction(auction); + System.out.println(" Found Dutch auction: " + auctionId + " - " + auction.title + " (" + auction.location + ")"); + } catch (SQLException e) { + System.err.println(" Failed to save auction: " + e.getMessage()); + } + } } + } catch (Exception e) { + System.err.println(" Error parsing HTML: " + e.getMessage()); } return foundCount; } /** - * Checks if an auction is located in the Netherlands + * Extracts auction information from a link element using JSoup + * This method intelligently parses the HTML structure to extract: + * - Title + * - Location (city and country) + * - Lot count (if available) */ - private boolean isDutchAuction(String html, String href) { - int hrefPos = html.indexOf(href); - if (hrefPos == -1) return false; + private AuctionInfo extractAuctionInfo(org.jsoup.nodes.Element link, String href, int auctionId, String type) { + AuctionInfo auction = new AuctionInfo(); + auction.auctionId = auctionId; + auction.type = type; + auction.url = "https://www.troostwijkauctions.com" + href; - // Look at 1000 characters before and after the href for location info - int startPos = Math.max(hrefPos - 500, 0); - int endPos = Math.min(hrefPos + 1000, html.length()); - String context = html.substring(startPos, endPos); + // Extract title from href (convert kebab-case to title) + java.util.regex.Pattern titlePattern = java.util.regex.Pattern.compile("/a/(.+?)-A[17]-"); + java.util.regex.Matcher titleMatcher = titlePattern.matcher(href); + if (titleMatcher.find()) { + String slug = titleMatcher.group(1); + auction.title = slug.replace("-", " "); + // Capitalize first letter + if (!auction.title.isEmpty()) { + auction.title = auction.title.substring(0, 1).toUpperCase() + auction.title.substring(1); + } + } else { + auction.title = "Unknown Auction"; + } - // Look for ", NL" pattern - return context.contains(", NL"); + // Try to find title in link text (more accurate) + String linkText = link.text(); + if (!linkText.isEmpty() && !linkText.matches(".*\\d+.*")) { + // If link text doesn't contain numbers, it's likely the title + String[] parts = linkText.split(",|\\d+"); + if (parts.length > 0 && parts[0].trim().length() > 5) { + auction.title = parts[0].trim(); + } + } + + // Extract location using JSoup selectors + // Look for

tags that contain location info + org.jsoup.select.Elements locationElements = link.select("p"); + for (org.jsoup.nodes.Element p : locationElements) { + String text = p.text(); + + // Pattern: "City, Country" or "City, Region, Country" + if (text.matches(".*[A-Z]{2}$")) { + // Ends with 2-letter country code + String countryCode = text.substring(text.length() - 2); + String cityPart = text.substring(0, text.length() - 2).trim(); + + // Remove trailing comma or whitespace + cityPart = cityPart.replaceAll("[,\\s]+$", ""); + + auction.country = countryCode; + auction.city = cityPart; + auction.location = cityPart + ", " + countryCode; + break; + } + } + + // Fallback: check HTML content directly + if (auction.country == null) { + String html = link.html(); + java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile( + "([A-Za-z][A-Za-z\\s,\\-']+?)\\s*(?:)?\\s*\\s*([A-Z]{2})(?![A-Za-z])"); + java.util.regex.Matcher locMatcher = locPattern.matcher(html); + + if (locMatcher.find()) { + String city = locMatcher.group(1).trim().replaceAll(",$", ""); + String country = locMatcher.group(2); + auction.city = city; + auction.country = country; + auction.location = city + ", " + country; + } + } + + // Extract lot count if available (kavels/lots) + org.jsoup.select.Elements textElements = link.select("*"); + for (org.jsoup.nodes.Element elem : textElements) { + String text = elem.ownText(); + if (text.matches("\\d+\\s+(?:kavel|lot|item)s?.*")) { + java.util.regex.Pattern countPattern = java.util.regex.Pattern.compile("(\\d+)"); + java.util.regex.Matcher countMatcher = countPattern.matcher(text); + if (countMatcher.find()) { + auction.lotCount = Integer.parseInt(countMatcher.group(1)); + break; + } + } + } + + return auction; } /** @@ -664,13 +782,34 @@ public class TroostwijkScraper { // Domain classes and services // ---------------------------------------------------------------------- + /** + * Represents auction metadata (veiling informatie) + */ + public static class AuctionInfo { + public int auctionId; // Unique auction ID (from URL) + public String title; // Auction title + public String location; // Location (e.g., "Amsterdam, NL") + public String city; // City name + public String country; // Country code (e.g., "NL") + public String url; // Full auction URL + public String type; // Auction type (A1 or A7) + public int lotCount; // Number of lots/kavels + public LocalDateTime closingTime; // Closing time if available + + @Override + public String toString() { + return String.format("Auction{id=%d, type=%s, title='%s', location='%s', lots=%d, url='%s'}", + auctionId, type, title, location, lotCount, url); + } + } + /** * Simple POJO representing a lot (kavel) in an auction. It keeps track * of the sale it belongs to, current bid and closing time. The method * minutesUntilClose computes how many minutes remain until the lot closes. */ static class Lot { - + int saleId; int lotId; String title; @@ -684,7 +823,7 @@ public class TroostwijkScraper { String url; LocalDateTime closingTime; // null if unknown boolean closingNotified; - + long minutesUntilClose() { if (closingTime == null) return Long.MAX_VALUE; return java.time.Duration.between(LocalDateTime.now(), closingTime).toMinutes(); @@ -704,18 +843,33 @@ public class TroostwijkScraper { } /** * Creates tables if they do not already exist. The schema includes - * tables for sales, lots, images, and object labels. This method is + * tables for auctions, lots, images, and object labels. This method is * idempotent; it can be called multiple times. */ void ensureSchema() throws SQLException { try (Connection conn = DriverManager.getConnection(url); Statement stmt = conn.createStatement()) { - // Sales table + // Auctions table (veilingen) + stmt.execute("CREATE TABLE IF NOT EXISTS auctions (" + + "auction_id INTEGER PRIMARY KEY," + + "title TEXT NOT NULL," + + "location TEXT," + + "city TEXT," + + "country TEXT," + + "url TEXT NOT NULL," + + "type TEXT," + + "lot_count INTEGER DEFAULT 0," + + "closing_time TEXT," + + "discovered_at INTEGER" // Unix timestamp + + ")"); + + // Sales table (legacy - keep for compatibility) stmt.execute("CREATE TABLE IF NOT EXISTS sales (" + "sale_id INTEGER PRIMARY KEY," + "title TEXT," + "location TEXT," + "closing_time TEXT" + ")"); + // Lots table stmt.execute("CREATE TABLE IF NOT EXISTS lots (" + "lot_id INTEGER PRIMARY KEY," @@ -731,8 +885,9 @@ public class TroostwijkScraper { + "url TEXT," + "closing_time TEXT," + "closing_notified INTEGER DEFAULT 0," - + "FOREIGN KEY (sale_id) REFERENCES sales(sale_id)" + + "FOREIGN KEY (sale_id) REFERENCES auctions(auction_id)" + ")"); + // Images table stmt.execute("CREATE TABLE IF NOT EXISTS images (" + "id INTEGER PRIMARY KEY AUTOINCREMENT," @@ -742,8 +897,98 @@ public class TroostwijkScraper { + "labels TEXT," + "FOREIGN KEY (lot_id) REFERENCES lots(lot_id)" + ")"); + + // Create indexes for better query performance + stmt.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country)"); + stmt.execute("CREATE INDEX IF NOT EXISTS idx_lots_sale_id ON lots(sale_id)"); } } + + /** + * Inserts or updates an auction record + */ + synchronized void upsertAuction(AuctionInfo auction) throws SQLException { + String sql = "INSERT INTO auctions (auction_id, title, location, city, country, url, type, lot_count, closing_time, discovered_at)" + + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" + + " ON CONFLICT(auction_id) DO UPDATE SET " + + "title = excluded.title, location = excluded.location, city = excluded.city, " + + "country = excluded.country, url = excluded.url, type = excluded.type, " + + "lot_count = excluded.lot_count, closing_time = excluded.closing_time"; + + try (Connection conn = DriverManager.getConnection(url); PreparedStatement ps = conn.prepareStatement(sql)) { + ps.setInt(1, auction.auctionId); + ps.setString(2, auction.title); + ps.setString(3, auction.location); + ps.setString(4, auction.city); + ps.setString(5, auction.country); + ps.setString(6, auction.url); + ps.setString(7, auction.type); + ps.setInt(8, auction.lotCount); + ps.setString(9, auction.closingTime != null ? auction.closingTime.toString() : null); + ps.setLong(10, Instant.now().getEpochSecond()); + ps.executeUpdate(); + } + } + + /** + * Retrieves all auctions from the database + */ + synchronized List getAllAuctions() throws SQLException { + List auctions = new ArrayList<>(); + String sql = "SELECT auction_id, title, location, city, country, url, type, lot_count, closing_time FROM auctions"; + + try (Connection conn = DriverManager.getConnection(url); Statement stmt = conn.createStatement()) { + ResultSet rs = stmt.executeQuery(sql); + while (rs.next()) { + AuctionInfo auction = new AuctionInfo(); + auction.auctionId = rs.getInt("auction_id"); + auction.title = rs.getString("title"); + auction.location = rs.getString("location"); + auction.city = rs.getString("city"); + auction.country = rs.getString("country"); + auction.url = rs.getString("url"); + auction.type = rs.getString("type"); + auction.lotCount = rs.getInt("lot_count"); + String closing = rs.getString("closing_time"); + if (closing != null) { + auction.closingTime = LocalDateTime.parse(closing); + } + auctions.add(auction); + } + } + return auctions; + } + + /** + * Retrieves auctions by country code + */ + synchronized List getAuctionsByCountry(String countryCode) throws SQLException { + List auctions = new ArrayList<>(); + String sql = "SELECT auction_id, title, location, city, country, url, type, lot_count, closing_time " + + "FROM auctions WHERE country = ?"; + + try (Connection conn = DriverManager.getConnection(url); PreparedStatement ps = conn.prepareStatement(sql)) { + ps.setString(1, countryCode); + ResultSet rs = ps.executeQuery(); + while (rs.next()) { + AuctionInfo auction = new AuctionInfo(); + auction.auctionId = rs.getInt("auction_id"); + auction.title = rs.getString("title"); + auction.location = rs.getString("location"); + auction.city = rs.getString("city"); + auction.country = rs.getString("country"); + auction.url = rs.getString("url"); + auction.type = rs.getString("type"); + auction.lotCount = rs.getInt("lot_count"); + String closing = rs.getString("closing_time"); + if (closing != null) { + auction.closingTime = LocalDateTime.parse(closing); + } + auctions.add(auction); + } + } + return auctions; + } /** * Inserts or updates a lot record. Uses INSERT OR REPLACE to diff --git a/src/main/resources/test.html b/src/test/resources/test.html similarity index 100% rename from src/main/resources/test.html rename to src/test/resources/test.html diff --git a/src/test/resources/test_auctions.html b/src/test/resources/test_auctions.html new file mode 100644 index 0000000..78af290 --- /dev/null +++ b/src/test/resources/test_auctions.html @@ -0,0 +1,205 @@ +Alle veilingen | Troostwijk Auctions

Alle veilingen

522 resultaten

Vandaag 28 nov 25

Download nu de Troostwijk Auctions app!

App storeGoogle play
\ No newline at end of file