This commit is contained in:
Tour
2025-12-03 15:32:34 +01:00
parent 815d6a9a4a
commit aef7a3aa30
10 changed files with 533 additions and 1350 deletions

View File

@@ -20,259 +20,94 @@ import static org.junit.jupiter.api.Assertions.*;
* Tests the markup data extraction for each auction found
*/
public class AuctionParsingTest {
private static String testHtml;
/**
 * Reads the auction listing fixture from {@code src/test/resources/test_auctions.html}
 * into the static {@code testHtml} field and prints its length.
 *
 * @throws IOException if the fixture file cannot be read
 */
@BeforeAll
public static void loadTestHtml() throws IOException {
    // Read the whole fixture in one go; the tests parse it from memory.
    final String fixture = Files.readString(Paths.get("src/test/resources/test_auctions.html"));
    testHtml = fixture;
    System.out.println("Loaded test HTML (" + fixture.length() + " characters)");
}
/**
 * Parses every auction link in the loaded fixture and checks the extracted fields.
 *
 * Links are selected with {@code a[href^='/a/']}; only hrefs that contain
 * {@code -A1-<digits>} or {@code -A7-<digits>} are turned into an {@code AuctionInfo}.
 * The first ten parsed auctions are printed in full, followed by a per-country summary,
 * and finally every parsed auction is checked for title, URL, id, location, country
 * and lot count.
 */
@Test
public void testParseAuctionsFromTestHtml() {
    // Parse the HTML with JSoup
    Document doc = Jsoup.parse(testHtml);
    // Find all auction links
    Elements auctionLinks = doc.select("a[href^='/a/']");
    System.out.println("\n=== Auction Parsing Test ===");
    System.out.println("Found " + auctionLinks.size() + " auction links");

    // The id pattern is the same for every link, so compile it once outside the loop.
    java.util.regex.Pattern auctionIdPattern = java.util.regex.Pattern.compile("/a/.*?-A([17])-(\\d+)");
    List<AuctionInfo> auctions = new ArrayList<>();
    for (Element link : auctionLinks) {
        String href = link.attr("href");
        // Extract auction type digit and auction ID from the URL; skip links without them
        java.util.regex.Matcher matcher = auctionIdPattern.matcher(href);
        if (!matcher.find()) {
            continue;
        }
        String typeNum = matcher.group(1);
        int auctionId = Integer.parseInt(matcher.group(2));
        // Extract auction info using the text-based method
        AuctionInfo auction = extractAuctionInfoFromText(link, href, auctionId, "A" + typeNum);
        auctions.add(auction);

        // Print the first 10 auctions for verification; the list size doubles as
        // the 1-based index of the auction that was just added.
        int position = auctions.size();
        if (position <= 10) {
            System.out.println("\n--- Auction #" + position + " ---");
            System.out.println("ID: " + auction.auctionId);
            System.out.println("Type: " + auction.type);
            System.out.println("Title: " + auction.title);
            System.out.println("Location: " + auction.location);
            System.out.println("City: " + auction.city);
            System.out.println("Country: " + auction.country);
            System.out.println("Lot Count: " + auction.lotCount);
            System.out.println("URL: " + auction.url);
            // Print ALL visible text for debugging
            System.out.println("\nAll visible text from link:");
            System.out.println("\"" + link.text() + "\"");
        }
    }

    System.out.println("\n=== Summary ===");
    System.out.println("Total auctions parsed: " + auctions.size());
    // Count by country
    long nlCount = auctions.stream().filter(a -> "NL".equals(a.country)).count();
    long bgCount = auctions.stream().filter(a -> "BG".equals(a.country)).count();
    long deCount = auctions.stream().filter(a -> "DE".equals(a.country)).count();
    long beCount = auctions.stream().filter(a -> "BE".equals(a.country)).count();
    System.out.println("Dutch (NL) auctions: " + nlCount);
    System.out.println("Bulgarian (BG) auctions: " + bgCount);
    System.out.println("German (DE) auctions: " + deCount);
    System.out.println("Belgian (BE) auctions: " + beCount);
    System.out.println("Unknown location: " + auctions.stream().filter(a -> a.country == null).count());

    // Assertions
    assertTrue(auctions.size() > 0, "Should find at least one auction");
    // Verify all auctions have basic info
    for (AuctionInfo auction : auctions) {
        assertNotNull(auction.title, "Title should not be null for auction " + auction.auctionId);
        assertTrue(auction.title.length() > 0, "Title should not be empty for auction " + auction.auctionId);
        assertNotNull(auction.url, "URL should not be null for auction " + auction.auctionId);
        assertTrue(auction.auctionId > 0, "Auction ID should be positive");
        assertNotNull(auction.location, "Location should not be null for auction " + auction.auctionId);
        assertNotNull(auction.country, "Country should not be null for auction " + auction.auctionId);
        assertTrue(auction.lotCount > 0, "Lot count should be positive for auction " + auction.auctionId);
    }
}
/**
* IMPROVED: Extract auction info using .text() method
* This parses the human-readable text instead of HTML markup
*
* Expected format: "[day] om [time] [lot_count] [title] [city], [CC]"
* Example: "woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE"
*/
private AuctionInfo extractAuctionInfoFromText(Element link, String href, int auctionId, String type) {
AuctionInfo auction = new AuctionInfo();
auction.auctionId = auctionId;
auction.type = type;
auction.url = "https://www.troostwijkauctions.com" + href;
// Get ALL visible text from the link (this removes all HTML tags)
String allText = link.text().trim();
// Pattern: "[day] om [time] [lot_count] [title] [city], [CC]"
// Example: "woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE"
// Step 1: Extract closing time (day + time)
java.util.regex.Pattern timePattern = java.util.regex.Pattern.compile(
"(\\w+)\\s+om\\s+(\\d{1,2}:\\d{2})"
);
java.util.regex.Matcher timeMatcher = timePattern.matcher(allText);
String remainingText = allText;
if (timeMatcher.find()) {
String day = timeMatcher.group(1); // e.g., "woensdag"
String time = timeMatcher.group(2); // e.g., "18:00"
// Store closing time info (could be parsed to LocalDateTime with proper date)
System.out.println(" Closing time: " + day + " om " + time);
// Remove the time part from text
remainingText = allText.substring(timeMatcher.end()).trim();
}
// Step 2: Extract location from the END (always ends with ", CC")
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
"([A-ZÀ-ÿa-z][A-ZÀ-ÿa-z\\s\\-'öäüßàèéêëïôùûç]+?),\\s*([A-Z]{2})\\s*$"
);
java.util.regex.Matcher locMatcher = locPattern.matcher(remainingText);
if (locMatcher.find()) {
auction.city = locMatcher.group(1).trim();
auction.country = locMatcher.group(2);
auction.location = auction.city + ", " + auction.country;
// Remove location from end
remainingText = remainingText.substring(0, locMatcher.start()).trim();
}
// Step 3: Extract lot count (first number after time)
java.util.regex.Pattern lotPattern = java.util.regex.Pattern.compile(
"^(\\d+)\\s+"
);
java.util.regex.Matcher lotMatcher = lotPattern.matcher(remainingText);
if (lotMatcher.find()) {
auction.lotCount = Integer.parseInt(lotMatcher.group(1));
// Remove lot count from beginning
remainingText = remainingText.substring(lotMatcher.end()).trim();
}
// Step 4: What remains is the title
if (!remainingText.isEmpty()) {
auction.title = remainingText;
} else {
// Fallback: use URL slug for title
java.util.regex.Pattern titlePattern = java.util.regex.Pattern.compile("/a/(.+?)-A[17]-");
java.util.regex.Matcher titleMatcher = titlePattern.matcher(href);
if (titleMatcher.find()) {
String slug = titleMatcher.group(1).replace("-", " ").replace("%7C", "|");
auction.title = slug.substring(0, 1).toUpperCase() + slug.substring(1);
private static String testHtml;
/**
 * Reads the auction listing fixture from {@code src/test/resources/test_auctions.html}
 * into the static {@code testHtml} field and prints its length.
 *
 * @throws IOException if the fixture file cannot be read
 */
@BeforeAll
public static void loadTestHtml() throws IOException {
    // Read the whole fixture in one go; the tests parse it from memory.
    final String fixture = Files.readString(Paths.get("src/test/resources/test_auctions.html"));
    testHtml = fixture;
    System.out.println("Loaded test HTML (" + fixture.length() + " characters)");
}
@Test
public void testLocationPatternMatching() {
System.out.println("\n=== Location Pattern Tests ===");
// Test different location formats
String[] testCases = {
"<p>Amsterdam, NL</p>",
"<p class=\"flex truncate\"><span class=\"w-full truncate\">Sofia,<!-- --> </span>BG</p>",
"<p>Berlin, DE</p>",
"<span>Brussels,</span>BE"
};
for (String testHtml : testCases) {
Document doc = Jsoup.parse(testHtml);
Element elem = doc.select("p, span").first();
if (elem != null) {
String text = elem.text();
System.out.println("\nTest: " + testHtml);
System.out.println("Text: " + text);
// Test regex pattern
if (text.matches(".*[A-Z]{2}$")) {
String countryCode = text.substring(text.length() - 2);
String cityPart = text.substring(0, text.length() - 2).trim().replaceAll("[,\\s]+$", "");
System.out.println("→ Extracted: " + cityPart + ", " + countryCode);
} else {
auction.title = "Unknown Auction";
System.out.println("→ No match");
}
}
return auction;
}
/**
 * Prints how a trailing two-letter country code is split from the city part
 * for a handful of HTML snippets.
 *
 * For each snippet the first {@code p} or {@code span} element is selected and its
 * text is checked for two trailing uppercase letters. Nothing is asserted here; the
 * results are printed for manual inspection.
 */
@Test
public void testLocationPatternMatching() {
    System.out.println("\n=== Location Pattern Tests ===");
    // Test different location formats
    String[] testCases = {
        "<p>Amsterdam, NL</p>",
        "<p class=\"flex truncate\"><span class=\"w-full truncate\">Sofia,<!-- --> </span>BG</p>",
        "<p>Berlin, DE</p>",
        "<span>Brussels,</span>BE"
    };

    // Compiled once up front; String.matches / String.replaceAll would recompile
    // the same regex on every iteration.
    java.util.regex.Pattern trailingCountryCode = java.util.regex.Pattern.compile(".*[A-Z]{2}$");
    java.util.regex.Pattern trailingSeparators = java.util.regex.Pattern.compile("[,\\s]+$");

    // The loop variable is called "snippet" so it does not shadow the static testHtml field.
    for (String snippet : testCases) {
        Document doc = Jsoup.parse(snippet);
        Element elem = doc.select("p, span").first();
        if (elem == null) {
            continue;
        }
        String text = elem.text();
        System.out.println("\nTest: " + snippet);
        System.out.println("Text: " + text);
        // Test regex pattern
        if (trailingCountryCode.matcher(text).matches()) {
            // The last two characters are the country code; everything before them,
            // minus trailing commas and whitespace, is the city.
            String countryCode = text.substring(text.length() - 2);
            String cityWithSeparator = text.substring(0, text.length() - 2).trim();
            String cityPart = trailingSeparators.matcher(cityWithSeparator).replaceAll("");
            System.out.println("→ Extracted: " + cityPart + ", " + countryCode);
        } else {
            System.out.println("→ No match");
        }
    }
}
/**
 * Verifies the step-by-step parsing of a flattened auction summary line.
 *
 * Expected input format: "[day] om [time] [lot_count] [title] [city], [CC]".
 * Each sample is parsed in three steps (closing time, location at the end, lot count
 * at the start); whatever is left is the title. The extracted location, lot count and
 * title are printed and compared against the expected values.
 *
 * The city character class deliberately contains no whitespace. With whitespace
 * allowed, the leftmost match of the location pattern starts at the first letter of
 * the title, so the whole title is reported as the city, only the lot number remains,
 * and the lot pattern (which requires trailing whitespace) no longer matches it.
 * Limitation: a city is therefore a single whitespace-free token; the leading words
 * of a multi-word city name end up in the title.
 */
@Test
public void testFullTextPatternMatching() {
    System.out.println("\n=== Full Text Pattern Tests ===");
    // Test the complete auction text format
    String[] testCases = {
        "woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE",
        "maandag om 14:30 5 Industriële machines Amsterdam, NL",
        "vrijdag om 10:00 12 Landbouwmachines Antwerpen, BE"
    };
    // Expected {location, lot count, title} for each entry of testCases, in the same order.
    String[][] expected = {
        {"Loßburg, DE", "1", "Vrachtwagens voor bedrijfsvoertuigen"},
        {"Amsterdam, NL", "5", "Industriële machines"},
        {"Antwerpen, BE", "12", "Landbouwmachines"}
    };

    // The patterns do not depend on the sample, so compile them once.
    java.util.regex.Pattern timePattern = java.util.regex.Pattern.compile("(\\w+)\\s+om\\s+(\\d{1,2}:\\d{2})");
    java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
        "([A-ZÀ-ÿa-z][A-ZÀ-ÿa-z\\-'öäüßàèéêëïôùûç]+?),\\s*([A-Z]{2})\\s*$"
    );
    java.util.regex.Pattern lotPattern = java.util.regex.Pattern.compile("^(\\d+)\\s+");

    for (int i = 0; i < testCases.length; i++) {
        String testText = testCases[i];
        System.out.println("\nParsing: \"" + testText + "\"");
        // Simulated extraction
        String remaining = testText;
        String location = null;
        String lotCount = null;

        // Extract time
        java.util.regex.Matcher timeMatcher = timePattern.matcher(remaining);
        if (timeMatcher.find()) {
            System.out.println(" Time: " + timeMatcher.group(1) + " om " + timeMatcher.group(2));
            remaining = remaining.substring(timeMatcher.end()).trim();
        }

        // Extract location from the end of the text
        java.util.regex.Matcher locMatcher = locPattern.matcher(remaining);
        if (locMatcher.find()) {
            location = locMatcher.group(1) + ", " + locMatcher.group(2);
            System.out.println(" Location: " + location);
            remaining = remaining.substring(0, locMatcher.start()).trim();
        }

        // Extract lot count from the start of what is left
        java.util.regex.Matcher lotMatcher = lotPattern.matcher(remaining);
        if (lotMatcher.find()) {
            lotCount = lotMatcher.group(1);
            System.out.println(" Lot count: " + lotCount);
            remaining = remaining.substring(lotMatcher.end()).trim();
        }

        // What remains is title
        System.out.println(" Title: " + remaining);

        assertEquals(expected[i][0], location, "Location mismatch for: " + testText);
        assertEquals(expected[i][1], lotCount, "Lot count mismatch for: " + testText);
        assertEquals(expected[i][2], remaining, "Title mismatch for: " + testText);
    }
}
}
}
}
/**
 * Verifies the step-by-step parsing of a flattened auction summary line.
 *
 * Expected input format: "[day] om [time] [lot_count] [title] [city], [CC]".
 * Each sample is parsed in three steps (closing time, location at the end, lot count
 * at the start); whatever is left is the title. The extracted location, lot count and
 * title are printed and compared against the expected values.
 *
 * The city character class deliberately contains no whitespace. With whitespace
 * allowed, the leftmost match of the location pattern starts at the first letter of
 * the title, so the whole title is reported as the city, only the lot number remains,
 * and the lot pattern (which requires trailing whitespace) no longer matches it.
 * Limitation: a city is therefore a single whitespace-free token; the leading words
 * of a multi-word city name end up in the title.
 */
@Test
public void testFullTextPatternMatching() {
    System.out.println("\n=== Full Text Pattern Tests ===");
    // Test the complete auction text format
    String[] testCases = {
        "woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE",
        "maandag om 14:30 5 Industriële machines Amsterdam, NL",
        "vrijdag om 10:00 12 Landbouwmachines Antwerpen, BE"
    };
    // Expected {location, lot count, title} for each entry of testCases, in the same order.
    String[][] expected = {
        {"Loßburg, DE", "1", "Vrachtwagens voor bedrijfsvoertuigen"},
        {"Amsterdam, NL", "5", "Industriële machines"},
        {"Antwerpen, BE", "12", "Landbouwmachines"}
    };

    // The patterns do not depend on the sample, so compile them once.
    java.util.regex.Pattern timePattern = java.util.regex.Pattern.compile("(\\w+)\\s+om\\s+(\\d{1,2}:\\d{2})");
    java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
        "([A-ZÀ-ÿa-z][A-ZÀ-ÿa-z\\-'öäüßàèéêëïôùûç]+?),\\s*([A-Z]{2})\\s*$"
    );
    java.util.regex.Pattern lotPattern = java.util.regex.Pattern.compile("^(\\d+)\\s+");

    for (int i = 0; i < testCases.length; i++) {
        String testText = testCases[i];
        System.out.println("\nParsing: \"" + testText + "\"");
        // Simulated extraction
        String remaining = testText;
        String location = null;
        String lotCount = null;

        // Extract time
        java.util.regex.Matcher timeMatcher = timePattern.matcher(remaining);
        if (timeMatcher.find()) {
            System.out.println(" Time: " + timeMatcher.group(1) + " om " + timeMatcher.group(2));
            remaining = remaining.substring(timeMatcher.end()).trim();
        }

        // Extract location from the end of the text
        java.util.regex.Matcher locMatcher = locPattern.matcher(remaining);
        if (locMatcher.find()) {
            location = locMatcher.group(1) + ", " + locMatcher.group(2);
            System.out.println(" Location: " + location);
            remaining = remaining.substring(0, locMatcher.start()).trim();
        }

        // Extract lot count from the start of what is left
        java.util.regex.Matcher lotMatcher = lotPattern.matcher(remaining);
        if (lotMatcher.find()) {
            lotCount = lotMatcher.group(1);
            System.out.println(" Lot count: " + lotCount);
            remaining = remaining.substring(lotMatcher.end()).trim();
        }

        // What remains is title
        System.out.println(" Title: " + remaining);

        assertEquals(expected[i][0], location, "Location mismatch for: " + testText);
        assertEquals(expected[i][1], lotCount, "Lot count mismatch for: " + testText);
        assertEquals(expected[i][2], remaining, "Title mismatch for: " + testText);
    }
}
}