package com.auction; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; import static org.junit.jupiter.api.Assertions.*; /** * Test auction parsing logic using saved HTML from test.html * Tests the markup data extraction for each auction found */ public class AuctionParsingTest { private static String testHtml; @BeforeAll public static void loadTestHtml() throws IOException { // Load the test HTML file testHtml = Files.readString(Paths.get("src/test/resources/test_auctions.html")); System.out.println("Loaded test HTML (" + testHtml.length() + " characters)"); } @Test public void testParseAuctionsFromTestHtml() { // Parse the HTML with JSoup Document doc = Jsoup.parse(testHtml); // Find all auction links Elements auctionLinks = doc.select("a[href^='/a/']"); System.out.println("\n=== Auction Parsing Test ==="); System.out.println("Found " + auctionLinks.size() + " auction links"); List auctions = new ArrayList<>(); int count = 0; for (Element link : auctionLinks) { String href = link.attr("href"); // Extract auction ID from URL java.util.regex.Pattern pattern = java.util.regex.Pattern.compile("/a/.*?-A([17])-(\\d+)"); java.util.regex.Matcher matcher = pattern.matcher(href); if (!matcher.find()) { continue; } String typeNum = matcher.group(1); int auctionId = Integer.parseInt(matcher.group(2)); // Extract auction info using IMPROVED text-based method AuctionInfo auction = extractAuctionInfoFromText(link, href, auctionId, "A" + typeNum); auctions.add(auction); // Print the first 10 auctions for verification if (count < 10) { System.out.println("\n--- Auction #" + (count + 1) + " ---"); System.out.println("ID: " + auction.auctionId); System.out.println("Type: " + auction.type); System.out.println("Title: " + auction.title); System.out.println("Location: " + auction.location); System.out.println("City: " + auction.city); System.out.println("Country: " + auction.country); System.out.println("Lot Count: " + auction.lotCount); System.out.println("URL: " + auction.url); // Print ALL visible text for debugging System.out.println("\nAll visible text from link:"); System.out.println("\"" + link.text() + "\""); } count++; } System.out.println("\n=== Summary ==="); System.out.println("Total auctions parsed: " + auctions.size()); // Count by country long nlCount = auctions.stream().filter(a -> "NL".equals(a.country)).count(); long bgCount = auctions.stream().filter(a -> "BG".equals(a.country)).count(); long deCount = auctions.stream().filter(a -> "DE".equals(a.country)).count(); long beCount = auctions.stream().filter(a -> "BE".equals(a.country)).count(); System.out.println("Dutch (NL) auctions: " + nlCount); System.out.println("Bulgarian (BG) auctions: " + bgCount); System.out.println("German (DE) auctions: " + deCount); System.out.println("Belgian (BE) auctions: " + beCount); System.out.println("Unknown location: " + auctions.stream().filter(a -> a.country == null).count()); // Assertions assertTrue(auctions.size() > 0, "Should find at least one auction"); // Verify all auctions have basic info for (AuctionInfo auction : auctions) { assertNotNull(auction.title, "Title should not be null for auction " + auction.auctionId); assertTrue(auction.title.length() > 0, "Title should not be empty for auction " + auction.auctionId); assertNotNull(auction.url, "URL should not be null for auction " + auction.auctionId); assertTrue(auction.auctionId > 0, "Auction ID should be positive"); assertNotNull(auction.location, "Location should not be null for auction " + auction.auctionId); assertNotNull(auction.country, "Country should not be null for auction " + auction.auctionId); assertTrue(auction.lotCount > 0, "Lot count should be positive for auction " + auction.auctionId); } } /** * IMPROVED: Extract auction info using .text() method * This parses the human-readable text instead of HTML markup * * Expected format: "[day] om [time] [lot_count] [title] [city], [CC]" * Example: "woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE" */ private AuctionInfo extractAuctionInfoFromText(Element link, String href, int auctionId, String type) { AuctionInfo auction = new AuctionInfo(); auction.auctionId = auctionId; auction.type = type; auction.url = "https://www.troostwijkauctions.com" + href; // Get ALL visible text from the link (this removes all HTML tags) String allText = link.text().trim(); // Pattern: "[day] om [time] [lot_count] [title] [city], [CC]" // Example: "woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE" // Step 1: Extract closing time (day + time) java.util.regex.Pattern timePattern = java.util.regex.Pattern.compile( "(\\w+)\\s+om\\s+(\\d{1,2}:\\d{2})" ); java.util.regex.Matcher timeMatcher = timePattern.matcher(allText); String remainingText = allText; if (timeMatcher.find()) { String day = timeMatcher.group(1); // e.g., "woensdag" String time = timeMatcher.group(2); // e.g., "18:00" // Store closing time info (could be parsed to LocalDateTime with proper date) System.out.println(" Closing time: " + day + " om " + time); // Remove the time part from text remainingText = allText.substring(timeMatcher.end()).trim(); } // Step 2: Extract location from the END (always ends with ", CC") java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile( "([A-ZÀ-ÿa-z][A-ZÀ-ÿa-z\\s\\-'öäüßàèéêëïôùûç]+?),\\s*([A-Z]{2})\\s*$" ); java.util.regex.Matcher locMatcher = locPattern.matcher(remainingText); if (locMatcher.find()) { auction.city = locMatcher.group(1).trim(); auction.country = locMatcher.group(2); auction.location = auction.city + ", " + auction.country; // Remove location from end remainingText = remainingText.substring(0, locMatcher.start()).trim(); } // Step 3: Extract lot count (first number after time) java.util.regex.Pattern lotPattern = java.util.regex.Pattern.compile( "^(\\d+)\\s+" ); java.util.regex.Matcher lotMatcher = lotPattern.matcher(remainingText); if (lotMatcher.find()) { auction.lotCount = Integer.parseInt(lotMatcher.group(1)); // Remove lot count from beginning remainingText = remainingText.substring(lotMatcher.end()).trim(); } // Step 4: What remains is the title if (!remainingText.isEmpty()) { auction.title = remainingText; } else { // Fallback: use URL slug for title java.util.regex.Pattern titlePattern = java.util.regex.Pattern.compile("/a/(.+?)-A[17]-"); java.util.regex.Matcher titleMatcher = titlePattern.matcher(href); if (titleMatcher.find()) { String slug = titleMatcher.group(1).replace("-", " ").replace("%7C", "|"); auction.title = slug.substring(0, 1).toUpperCase() + slug.substring(1); } else { auction.title = "Unknown Auction"; } } return auction; } @Test public void testLocationPatternMatching() { System.out.println("\n=== Location Pattern Tests ==="); // Test different location formats String[] testCases = { "

Amsterdam, NL

", "

Sofia, BG

", "

Berlin, DE

", "Brussels,BE" }; for (String testHtml : testCases) { Document doc = Jsoup.parse(testHtml); Element elem = doc.select("p, span").first(); if (elem != null) { String text = elem.text(); System.out.println("\nTest: " + testHtml); System.out.println("Text: " + text); // Test regex pattern if (text.matches(".*[A-Z]{2}$")) { String countryCode = text.substring(text.length() - 2); String cityPart = text.substring(0, text.length() - 2).trim().replaceAll("[,\\s]+$", ""); System.out.println("→ Extracted: " + cityPart + ", " + countryCode); } else { System.out.println("→ No match"); } } } } @Test public void testFullTextPatternMatching() { System.out.println("\n=== Full Text Pattern Tests ==="); // Test the complete auction text format String[] testCases = { "woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE", "maandag om 14:30 5 Industriële machines Amsterdam, NL", "vrijdag om 10:00 12 Landbouwmachines Antwerpen, BE" }; for (String testText : testCases) { System.out.println("\nParsing: \"" + testText + "\""); // Simulated extraction String remaining = testText; // Extract time java.util.regex.Pattern timePattern = java.util.regex.Pattern.compile("(\\w+)\\s+om\\s+(\\d{1,2}:\\d{2})"); java.util.regex.Matcher timeMatcher = timePattern.matcher(remaining); if (timeMatcher.find()) { System.out.println(" Time: " + timeMatcher.group(1) + " om " + timeMatcher.group(2)); remaining = remaining.substring(timeMatcher.end()).trim(); } // Extract location java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile( "([A-ZÀ-ÿa-z][A-ZÀ-ÿa-z\\s\\-'öäüßàèéêëïôùûç]+?),\\s*([A-Z]{2})\\s*$" ); java.util.regex.Matcher locMatcher = locPattern.matcher(remaining); if (locMatcher.find()) { System.out.println(" Location: " + locMatcher.group(1) + ", " + locMatcher.group(2)); remaining = remaining.substring(0, locMatcher.start()).trim(); } // Extract lot count java.util.regex.Pattern lotPattern = java.util.regex.Pattern.compile("^(\\d+)\\s+"); java.util.regex.Matcher lotMatcher = lotPattern.matcher(remaining); if (lotMatcher.find()) { System.out.println(" Lot count: " + lotMatcher.group(1)); remaining = remaining.substring(lotMatcher.end()).trim(); } // What remains is title System.out.println(" Title: " + remaining); } } }