/*
 * Recovered from patch 836ce3527f051569bdbc5e20acb389c06a448a7b
 * (author: michael1986, date: Fri, 28 Nov 2025 06:37:04 +0100, subject: "start"),
 * which creates src/test/java/com/auction/AuctionParsingTest.java.
 */
package com.auction;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import static org.junit.jupiter.api.Assertions.*;

/**
 * Tests the auction parsing logic against the saved HTML in
 * {@code src/test/resources/test.html}.
 *
 * <p>For every auction link found in that page the test extracts the id, type, title,
 * location and lot count, prints the first few results for manual inspection, and asserts
 * that the basic fields are populated.
 */
public class AuctionParsingTest {

    /** Prefix prepended to the relative {@code href} of an auction link. */
    private static final String BASE_URL = "https://www.troostwijkauctions.com";

    /** How many parsed auctions are printed in full for manual verification. */
    private static final int MAX_PRINTED_AUCTIONS = 10;

    /** Auction link: group 1 is the type digit (1 or 7), group 2 the numeric auction id. */
    private static final Pattern AUCTION_HREF = Pattern.compile("/a/.*?-A([17])-(\\d+)");

    /** Auction link: group 1 is the kebab-case slug that precedes the type marker. */
    private static final Pattern TITLE_SLUG = Pattern.compile("/a/(.+?)-A[17]-");

    /** Whole-string match for text that contains at least one digit. */
    private static final Pattern CONTAINS_DIGIT = Pattern.compile(".*\\d+.*");

    /** Whole-string match for text that ends with two upper-case letters (a country code). */
    private static final Pattern COUNTRY_SUFFIX = Pattern.compile(".*[A-Z]{2}$");

    /** Trailing commas/whitespace left on the city once the country code is cut off. */
    private static final Pattern TRAILING_SEPARATORS = Pattern.compile("[,\\s]+$");

    /**
     * Location fallback applied to the raw inner HTML of a link.
     * NOTE(review): the empty optional group {@code (?:)?} suggests markup was lost from this
     * pattern; it is reproduced exactly as found - confirm against TroostwijkScraper.
     */
    private static final Pattern HTML_LOCATION = Pattern.compile(
            "([A-Za-z][A-Za-z\\s,\\-']+?)\\s*(?:)?\\s*\\s*([A-Z]{2})(?![A-Za-z])");

    /** Whole-string match for own-text such as "12 kavels ..."; group 1 is the count. */
    private static final Pattern OWN_TEXT_LOT_COUNT =
            Pattern.compile("(\\d+)\\s+(?:kavel|lot|item)s?.*");

    /** Whole-string match for text containing a time or date fragment such as 12:30 or 1/12. */
    private static final Pattern TIME_OR_DATE = Pattern.compile(".*\\d{1,2}[:/]\\d{2}.*");

    /** Whole-string match for text that ends in ", CC" (i.e. looks like a location). */
    private static final Pattern CITY_COUNTRY_SUFFIX = Pattern.compile(".*,\\s*[A-Z]{2}$");

    /** "City, CC" anywhere in visible text: group 1 is the city, group 2 the country code. */
    private static final Pattern TEXT_LOCATION = Pattern.compile(
            "([A-Za-z][A-Za-z\\s\\-']+?),\\s*([A-Z]{2})(?![A-Za-z])");

    /** "X kavels" / "X lots" / "X items" anywhere in visible text; group 1 is the count. */
    private static final Pattern TEXT_LOT_COUNT = Pattern.compile("(\\d+)\\s+(?:kavel|lot|item)s?");

    private static String testHtml;

    /**
     * Loads the saved test page once for all tests.
     *
     * @throws IOException if {@code src/test/resources/test.html} cannot be read
     */
    @BeforeAll
    public static void loadTestHtml() throws IOException {
        testHtml = Files.readString(Paths.get("src/test/resources/test.html"));
        System.out.println("Loaded test HTML (" + testHtml.length() + " characters)");
    }

    /**
     * Parses every auction link in the saved page, prints the first
     * {@link #MAX_PRINTED_AUCTIONS} results plus a per-country summary, and asserts that each
     * parsed auction has a non-empty title, a URL and a positive id.
     */
    @Test
    public void testParseAuctionsFromTestHtml() {
        var doc = Jsoup.parse(testHtml);
        var auctionLinks = doc.select("a[href^='/a/']");

        System.out.println("\n=== Auction Parsing Test ===");
        System.out.println("Found " + auctionLinks.size() + " auction links");

        List<TroostwijkScraper.AuctionInfo> auctions = new ArrayList<>();

        for (var link : auctionLinks) {
            var href = link.attr("href");

            // Links under /a/ that do not carry an "-A1-<id>" / "-A7-<id>" marker are skipped.
            var matcher = AUCTION_HREF.matcher(href);
            if (!matcher.find()) {
                continue;
            }

            var typeNum = matcher.group(1);
            var auctionId = Integer.parseInt(matcher.group(2));

            // Extract auction info using the text-based method
            var auction = extractAuctionInfoFromText(link, href, auctionId, "A" + typeNum);
            auctions.add(auction);

            if (auctions.size() <= MAX_PRINTED_AUCTIONS) {
                printAuction(auctions.size(), auction, link);
            }
        }

        System.out.println("\n=== Summary ===");
        System.out.println("Total auctions parsed: " + auctions.size());
        System.out.println("Dutch (NL) auctions: " + countByCountry(auctions, "NL"));
        System.out.println("Bulgarian (BG) auctions: " + countByCountry(auctions, "BG"));
        System.out.println("German (DE) auctions: " + countByCountry(auctions, "DE"));
        System.out.println("Belgian (BE) auctions: " + countByCountry(auctions, "BE"));
        System.out.println("Unknown location: "
                + auctions.stream().filter(a -> a.country == null).count());

        assertFalse(auctions.isEmpty(), "Should find at least one auction");

        // Verify all auctions have basic info
        for (var auction : auctions) {
            assertNotNull(auction.title, "Title should not be null for auction " + auction.auctionId);
            assertFalse(auction.title.isEmpty(),
                    "Title should not be empty for auction " + auction.auctionId);
            assertNotNull(auction.url, "URL should not be null for auction " + auction.auctionId);
            assertTrue(auction.auctionId > 0, "Auction ID should be positive");
        }
    }

    /**
     * Extracts auction info from a link element using element selectors, falling back to a
     * regex over the raw inner HTML for the location.
     *
     * <p>Per the original note this is a copy of the logic from TroostwijkScraper kept for
     * testing purposes; no test in this class currently calls it.
     *
     * @param link      the auction anchor element
     * @param href      the link's relative {@code href}
     * @param auctionId numeric id parsed from {@code href}
     * @param type      auction type marker, e.g. {@code "A1"}
     * @return the populated auction info; location and lot count stay unset when not found
     */
    private TroostwijkScraper.AuctionInfo extractAuctionInfo(Element link, String href, int auctionId, String type) {
        var auction = newAuction(href, auctionId, type);

        // Start from the URL slug (kebab-case converted to a capitalized title)
        auction.title = titleFromSlug(href);

        // Prefer the link text when it contains no digits: it is then likely just the title
        var linkText = link.text();
        if (!linkText.isEmpty() && !CONTAINS_DIGIT.matcher(linkText).matches()) {
            var parts = linkText.split(",|\\d+");
            if (parts.length > 0 && parts[0].trim().length() > 5) {
                auction.title = parts[0].trim();
            }
        }

        // Look for <p> tags whose text is "City, CC" or "City, Region, CC"
        for (var p : link.select("p")) {
            var location = splitTrailingCountry(p.text());
            if (location != null) {
                applyLocation(auction, location[0], location[1]);
                break;
            }
        }

        // Fallback: check the HTML content directly
        if (auction.country == null) {
            var locMatcher = HTML_LOCATION.matcher(link.html());
            if (locMatcher.find()) {
                var city = locMatcher.group(1).trim().replaceAll(",$", "");
                applyLocation(auction, city, locMatcher.group(2));
            }
        }

        // Lot count (kavels/lots/items): first element whose own text starts with a number
        for (var elem : link.select("*")) {
            var countMatcher = OWN_TEXT_LOT_COUNT.matcher(elem.ownText());
            if (countMatcher.matches()) {
                auction.lotCount = Integer.parseInt(countMatcher.group(1));
                break;
            }
        }

        return auction;
    }

    /**
     * Extracts auction info from the visible text of a link ({@link Element#text()}) instead
     * of its markup.
     *
     * @param link      the auction anchor element
     * @param href      the link's relative {@code href}
     * @param auctionId numeric id parsed from {@code href}
     * @param type      auction type marker, e.g. {@code "A1"}
     * @return the populated auction info; the title is never null, location and lot count
     *         stay unset when not found
     */
    private TroostwijkScraper.AuctionInfo extractAuctionInfoFromText(Element link, String href, int auctionId, String type) {
        var auction = newAuction(href, auctionId, type);

        // All visible text of the link, with the HTML tags removed
        var allText = link.text();

        // Title: first long chunk that looks like neither a time/date nor a "City, CC" location.
        // NOTE(review): jsoup's Element.text() normalizes whitespace, so splitting on 2+ spaces
        // probably yields the whole text as a single part - confirm against test.html.
        for (var rawPart : allText.split("\\s{2,}")) {
            var part = rawPart.trim();
            if (part.length() > 15
                    && !TIME_OR_DATE.matcher(part).matches()
                    && !CITY_COUNTRY_SUFFIX.matcher(part).matches()) {
                auction.title = part;
                break;
            }
        }

        // Fallback: use the URL slug for the title
        if (auction.title == null || auction.title.isEmpty()) {
            auction.title = titleFromSlug(href);
        }

        // Location: "City, CC" where CC is a 2-letter country code
        var locMatcher = TEXT_LOCATION.matcher(allText);
        if (locMatcher.find()) {
            applyLocation(auction, locMatcher.group(1).trim(), locMatcher.group(2));
        }

        // Lot count: "X kavels" / "X lots" / "X items"
        var lotMatcher = TEXT_LOT_COUNT.matcher(allText);
        if (lotMatcher.find()) {
            auction.lotCount = Integer.parseInt(lotMatcher.group(1));
        }

        // TODO: closing time. The text only exposes an "HH:mm" fragment; a full date is needed
        // before it can be converted to a LocalDateTime, so nothing is extracted yet.

        return auction;
    }

    /**
     * Checks that a trailing two-letter country code is split from the city for several
     * location formats, with and without a space after the comma.
     */
    @Test
    public void testLocationPatternMatching() {
        System.out.println("\n=== Location Pattern Tests ===");

        // Each case: {markup, expected city, expected country}.
        // NOTE(review): the element markup of these fixtures was not recoverable from the patch;
        // the <p>/<span> wrappers are reconstructed to match the "p, span" selector below - confirm.
        var testCases = new String[][]{
            {"<p>Amsterdam, NL</p>", "Amsterdam", "NL"},
            {"<p>Sofia, BG</p>", "Sofia", "BG"},
            {"<p>Berlin, DE</p>", "Berlin", "DE"},
            {"<span>Brussels,BE</span>", "Brussels", "BE"}
        };

        for (var testCase : testCases) {
            var html = testCase[0];
            var elem = Jsoup.parse(html).select("p, span").first();
            assertNotNull(elem, "No <p>/<span> element parsed from: " + html);

            var text = elem.text();
            System.out.println("\nTest: " + html);
            System.out.println("Text: " + text);

            var location = splitTrailingCountry(text);
            assertNotNull(location, "Location pattern should match: " + text);
            System.out.println("→ Extracted: " + location[0] + ", " + location[1]);

            assertEquals(testCase[1], location[0], "City extracted from: " + text);
            assertEquals(testCase[2], location[1], "Country extracted from: " + text);
        }
    }

    /** Creates an auction info carrying the id, the type and the absolute URL for {@code href}. */
    private static TroostwijkScraper.AuctionInfo newAuction(String href, int auctionId, String type) {
        var auction = new TroostwijkScraper.AuctionInfo();
        auction.auctionId = auctionId;
        auction.type = type;
        auction.url = BASE_URL + href;
        return auction;
    }

    /** Stores city and country on the auction and derives the combined "City, CC" location. */
    private static void applyLocation(TroostwijkScraper.AuctionInfo auction, String city, String country) {
        auction.city = city;
        auction.country = country;
        auction.location = city + ", " + country;
    }

    /**
     * Derives a title from the URL slug: dashes become spaces and the first letter is
     * upper-cased. Returns {@code "Unknown Auction"} when {@code href} has no slug.
     */
    private static String titleFromSlug(String href) {
        var slugMatcher = TITLE_SLUG.matcher(href);
        if (!slugMatcher.find()) {
            return "Unknown Auction";
        }
        var title = slugMatcher.group(1).replace("-", " ");
        if (title.isEmpty()) {
            return title;
        }
        // Locale.ROOT keeps the capitalization independent of the machine's default locale.
        return title.substring(0, 1).toUpperCase(Locale.ROOT) + title.substring(1);
    }

    /**
     * Splits text that ends in a two-letter upper-case country code.
     *
     * @return {@code {city, countryCode}} with trailing commas/whitespace removed from the
     *         city, or {@code null} when the text does not end in two upper-case letters
     */
    private static String[] splitTrailingCountry(String text) {
        if (!COUNTRY_SUFFIX.matcher(text).matches()) {
            return null;
        }
        var countryCode = text.substring(text.length() - 2);
        var cityPart = text.substring(0, text.length() - 2).trim();
        cityPart = TRAILING_SEPARATORS.matcher(cityPart).replaceAll("");
        return new String[]{cityPart, countryCode};
    }

    /** Counts the auctions whose country equals {@code countryCode}. */
    private static long countByCountry(List<TroostwijkScraper.AuctionInfo> auctions, String countryCode) {
        return auctions.stream().filter(a -> countryCode.equals(a.country)).count();
    }

    /** Prints one parsed auction plus the raw visible link text for manual verification. */
    private static void printAuction(int ordinal, TroostwijkScraper.AuctionInfo auction, Element link) {
        System.out.println("\n--- Auction #" + ordinal + " ---");
        System.out.println("ID: " + auction.auctionId);
        System.out.println("Type: " + auction.type);
        System.out.println("Title: " + auction.title);
        System.out.println("Location: " + auction.location);
        System.out.println("City: " + auction.city);
        System.out.println("Country: " + auction.country);
        System.out.println("Lot Count: " + auction.lotCount);
        System.out.println("URL: " + auction.url);

        // Print ALL visible text for debugging
        System.out.println("\nAll visible text from link:");
        System.out.println("\"" + link.text() + "\"");
    }
}