This commit is contained in:
2025-11-28 06:37:04 +01:00
parent b174f77f6c
commit 836ce3527f

View File

@@ -0,0 +1,307 @@
package com.auction;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import static org.junit.jupiter.api.Assertions.*;
/**
 * Tests the auction parsing logic against the saved listing page in
 * {@code src/test/resources/test.html}.
 *
 * <p>Every anchor whose {@code href} starts with {@code /a/} is treated as one auction card.
 * The tests check that id, type, title and URL can be recovered from each card, and report
 * the location and lot count where the card carries them.
 */
public class AuctionParsingTest {

    /** Site root that the relative auction hrefs are appended to. */
    private static final String BASE_URL = "https://www.troostwijkauctions.com";

    /** How many parsed auctions are echoed to stdout for manual inspection. */
    private static final int PREVIEW_LIMIT = 10;

    /** Auction href: group 1 is the type digit (1 or 7), group 2 the numeric auction id. */
    private static final Pattern AUCTION_HREF = Pattern.compile("/a/.*?-A([17])-(\\d+)");

    /** Auction href: group 1 is the kebab-case slug in front of the type marker. */
    private static final Pattern TITLE_SLUG = Pattern.compile("/a/(.+?)-A[17]-");

    /** Whole-string match: the text contains at least one digit. */
    private static final Pattern CONTAINS_DIGIT = Pattern.compile(".*\\d+.*");

    /** Whole-string match: the text ends with two upper-case ASCII letters. */
    private static final Pattern ENDS_WITH_COUNTRY_CODE = Pattern.compile(".*[A-Z]{2}$");

    /** Commas and whitespace left dangling once the country code has been cut off. */
    private static final Pattern TRAILING_SEPARATORS = Pattern.compile("[,\\s]+$");

    /** Raw-markup location: city text, optional comment, closing span, then the country code. */
    private static final Pattern MARKUP_LOCATION = Pattern.compile(
            "([A-Za-z][A-Za-z\\s,\\-']+?)\\s*(?:<!--.*?-->)?\\s*</span>\\s*([A-Z]{2})(?![A-Za-z])");

    /** Whole-string match: text that starts with a lot count; group 1 is the number. */
    private static final Pattern LEADING_LOT_COUNT =
            Pattern.compile("(\\d+)\\s+(?:kavel|lot|item)s?.*");

    /** Lot count anywhere in a text ("12 kavels", "3 lots", "1 item"); group 1 is the number. */
    private static final Pattern LOT_COUNT = Pattern.compile("(\\d+)\\s+(?:kavel|lot|item)s?");

    /** Whole-string match: the text contains something shaped like a time or a date. */
    private static final Pattern CLOCK_OR_DATE = Pattern.compile(".*\\d{1,2}[:/]\\d{2}.*");

    /** Whole-string match: the text ends in ", CC", i.e. it looks like a location line. */
    private static final Pattern ENDS_WITH_CITY_COUNTRY = Pattern.compile(".*,\\s*[A-Z]{2}$");

    /**
     * "City, CC" in visible text. Uses {@code \p{L}} rather than {@code [A-Za-z]} so that
     * city names with non-ASCII letters are captured whole instead of being cut at the
     * first accented character.
     */
    private static final Pattern TEXT_LOCATION =
            Pattern.compile("(\\p{L}[\\p{L}\\s\\-']+?),\\s*([A-Z]{2})(?!\\p{L})");

    private static String testHtml;

    /**
     * Reads the saved listing page once for all tests.
     *
     * @throws IOException if the fixture file cannot be read
     */
    @BeforeAll
    public static void loadTestHtml() throws IOException {
        testHtml = Files.readString(Paths.get("src/test/resources/test.html"));
        System.out.println("Loaded test HTML (" + testHtml.length() + " characters)");
    }

    /**
     * Parses every auction link in the fixture with the text-based extractor, prints a
     * preview plus a per-country summary, and asserts that each auction has a title,
     * a URL and a positive id.
     */
    @Test
    public void testParseAuctionsFromTestHtml() {
        var doc = Jsoup.parse(testHtml);
        var auctionLinks = doc.select("a[href^='/a/']");
        System.out.println("\n=== Auction Parsing Test ===");
        System.out.println("Found " + auctionLinks.size() + " auction links");

        List<TroostwijkScraper.AuctionInfo> auctions = new ArrayList<>();
        for (var link : auctionLinks) {
            var href = link.attr("href");
            var idMatcher = AUCTION_HREF.matcher(href);
            if (!idMatcher.find()) {
                // Link under /a/ that does not carry an "-A1-<id>" / "-A7-<id>" marker.
                continue;
            }
            var auctionId = Integer.parseInt(idMatcher.group(2));
            var auction = extractAuctionInfoFromText(link, href, auctionId, "A" + idMatcher.group(1));
            auctions.add(auction);
            if (auctions.size() <= PREVIEW_LIMIT) {
                printAuction(auctions.size(), auction, link);
            }
        }

        System.out.println("\n=== Summary ===");
        System.out.println("Total auctions parsed: " + auctions.size());
        System.out.println("Dutch (NL) auctions: " + countByCountry(auctions, "NL"));
        System.out.println("Bulgarian (BG) auctions: " + countByCountry(auctions, "BG"));
        System.out.println("German (DE) auctions: " + countByCountry(auctions, "DE"));
        System.out.println("Belgian (BE) auctions: " + countByCountry(auctions, "BE"));
        System.out.println("Unknown location: " + auctions.stream().filter(a -> a.country == null).count());

        assertFalse(auctions.isEmpty(), "Should find at least one auction");
        for (var auction : auctions) {
            assertNotNull(auction.title, "Title should not be null for auction " + auction.auctionId);
            assertFalse(auction.title.isEmpty(), "Title should not be empty for auction " + auction.auctionId);
            assertNotNull(auction.url, "URL should not be null for auction " + auction.auctionId);
            assertTrue(auction.auctionId > 0, "Auction ID should be positive");
        }
    }

    /**
     * Checks the "trailing country code" rule on the location markup variants seen on the
     * site. The rule is applied to the text of the whole fragment, so a country code that
     * sits outside the {@code <span>} holding the city is still part of the input.
     */
    @Test
    public void testLocationPatternMatching() {
        System.out.println("\n=== Location Pattern Tests ===");
        // Each row: HTML fragment, expected city, expected country code.
        var cases = new String[][] {
            {"<p>Amsterdam, NL</p>", "Amsterdam", "NL"},
            {"<p class=\"flex truncate\"><span class=\"w-full truncate\">Sofia,<!-- --> </span>BG</p>",
                "Sofia", "BG"},
            {"<p>Berlin, DE</p>", "Berlin", "DE"},
            {"<span>Brussels,</span>BE", "Brussels", "BE"}
        };
        for (var row : cases) {
            var fragment = row[0];
            var text = Jsoup.parse(fragment).body().text();
            System.out.println("\nTest: " + fragment);
            System.out.println("Text: " + text);

            var parts = splitTrailingCountryCode(text);
            assertNotNull(parts, "No location recognised in: " + fragment);
            System.out.println("→ Extracted: " + parts[0] + ", " + parts[1]);
            assertEquals(row[1], parts[0], "City for: " + fragment);
            assertEquals(row[2], parts[1], "Country for: " + fragment);
        }
    }

    /**
     * Pins the per-element behaviour of the text-based extractor: the title must be the
     * heading text only, and the city must not absorb the words that precede it in the card.
     */
    @Test
    public void testTextBasedExtractionReadsPerElementText() {
        var html = "<a href=\"/a/industrial-machines-A1-12345\">"
                + "<h2>Industrial machines and tools</h2>"
                + "<p><span>Sofia,<!-- --> </span>BG</p>"
                + "<span>25 kavels</span>"
                + "</a>";
        var link = Jsoup.parse(html).select("a").first();
        assertNotNull(link, "Fixture anchor should be parsed");

        var auction = extractAuctionInfoFromText(link, link.attr("href"), 12345, "A1");

        assertEquals("Industrial machines and tools", auction.title);
        assertEquals("Sofia", auction.city);
        assertEquals("BG", auction.country);
        assertEquals("Sofia, BG", auction.location);
    }

    /**
     * Markup-based extraction, described by its original comment as a copy of the logic in
     * {@code TroostwijkScraper}. No test in this class calls it at present; it is kept so
     * the two approaches can be compared.
     *
     * <p>Title: URL slug, replaced by the first comma-separated part of the link text when
     * that text contains no digits and the part is longer than five characters.
     * Location: first {@code <p>} whose text ends in a two-letter code, else a regex over
     * the raw inner HTML. Lot count: first element whose own text starts with a count.
     *
     * @param link      the auction anchor element
     * @param href      the anchor's relative href
     * @param auctionId numeric id parsed from the href
     * @param type      auction type label, e.g. {@code "A1"}
     * @return the populated auction info; location fields and lot count stay unset when
     *         nothing matches
     */
    private TroostwijkScraper.AuctionInfo extractAuctionInfo(Element link, String href, int auctionId, String type) {
        var auction = newAuction(href, auctionId, type);
        auction.title = titleFromSlug(href);

        var linkText = link.text();
        if (!linkText.isEmpty() && !CONTAINS_DIGIT.matcher(linkText).matches()) {
            var parts = linkText.split(",|\\d+");
            if (parts.length > 0 && parts[0].trim().length() > 5) {
                auction.title = parts[0].trim();
            }
        }

        for (var paragraph : link.select("p")) {
            var parts = splitTrailingCountryCode(paragraph.text());
            if (parts != null) {
                setLocation(auction, parts[0], parts[1]);
                break;
            }
        }

        if (auction.country == null) {
            var markup = MARKUP_LOCATION.matcher(link.html());
            if (markup.find()) {
                var city = markup.group(1).trim().replaceAll(",$", "");
                setLocation(auction, city, markup.group(2));
            }
        }

        for (var element : link.select("*")) {
            var leading = LEADING_LOT_COUNT.matcher(element.ownText());
            if (leading.matches()) {
                auction.lotCount = Integer.parseInt(leading.group(1));
                break;
            }
        }
        return auction;
    }

    /**
     * Text-based extraction: works on what a reader sees rather than on the markup.
     *
     * <p>{@code Element.text()} returns whitespace-normalised text, so the flattened card
     * text cannot be split back into its parts. Title candidates are therefore taken from
     * each element's own text, and the location is looked up per element before falling
     * back to a search over the whole card text.
     *
     * <p>The closing time is not extracted: the original code only located a time of day
     * and noted that a full date would be needed to build a timestamp.
     *
     * @param link      the auction anchor element
     * @param href      the anchor's relative href
     * @param auctionId numeric id parsed from the href
     * @param type      auction type label, e.g. {@code "A1"}
     * @return the populated auction info; the title is never null, location fields and lot
     *         count stay unset when nothing matches
     */
    private TroostwijkScraper.AuctionInfo extractAuctionInfoFromText(Element link, String href, int auctionId, String type) {
        var auction = newAuction(href, auctionId, type);
        var allText = link.text();

        auction.title = firstTitleLikeText(link);
        if (auction.title == null || auction.title.isEmpty()) {
            auction.title = titleFromSlug(href);
        }

        fillLocationFromText(auction, link, allText);

        var lots = LOT_COUNT.matcher(allText);
        if (lots.find()) {
            auction.lotCount = Integer.parseInt(lots.group(1));
        }
        return auction;
    }

    /** Creates an auction info carrying the id, the type and the absolute URL for {@code href}. */
    private static TroostwijkScraper.AuctionInfo newAuction(String href, int auctionId, String type) {
        var auction = new TroostwijkScraper.AuctionInfo();
        auction.auctionId = auctionId;
        auction.type = type;
        auction.url = BASE_URL + href;
        return auction;
    }

    /**
     * Turns the kebab-case slug of an auction href into a title with a capitalised first
     * letter, or returns {@code "Unknown Auction"} when the href has no slug.
     */
    private static String titleFromSlug(String href) {
        var slugMatcher = TITLE_SLUG.matcher(href);
        if (!slugMatcher.find()) {
            return "Unknown Auction";
        }
        var words = slugMatcher.group(1).replace("-", " ");
        if (words.isEmpty()) {
            return words;
        }
        // Character.toUpperCase is locale-independent, unlike String.toUpperCase().
        return Character.toUpperCase(words.charAt(0)) + words.substring(1);
    }

    /**
     * Returns the first own-text segment in the card that is longer than 15 characters and
     * looks like neither a time/date nor a "City, CC" line, or {@code null} if there is none.
     */
    private static String firstTitleLikeText(Element link) {
        for (var element : link.select("*")) {
            var segment = element.ownText().trim();
            if (segment.length() > 15
                    && !CLOCK_OR_DATE.matcher(segment).matches()
                    && !ENDS_WITH_CITY_COUNTRY.matcher(segment).matches()) {
                return segment;
            }
        }
        return null;
    }

    /**
     * Sets the location from visible text. An element whose complete text is "City, CC"
     * wins, because searching the flattened card text lets the city capture the words in
     * front of it; that search is only the fallback.
     */
    private static void fillLocationFromText(TroostwijkScraper.AuctionInfo auction, Element link, String allText) {
        for (var element : link.select("*")) {
            var exact = TEXT_LOCATION.matcher(element.text());
            if (exact.matches()) {
                setLocation(auction, exact.group(1).trim(), exact.group(2));
                return;
            }
        }
        var anywhere = TEXT_LOCATION.matcher(allText);
        if (anywhere.find()) {
            setLocation(auction, anywhere.group(1).trim(), anywhere.group(2));
        }
    }

    /**
     * Splits a text that ends in a two-letter upper-case code into city and country.
     *
     * @return {@code {city, countryCode}}, or {@code null} when the text does not end in
     *         two upper-case ASCII letters
     */
    private static String[] splitTrailingCountryCode(String text) {
        if (!ENDS_WITH_COUNTRY_CODE.matcher(text).matches()) {
            return null;
        }
        var cut = text.length() - 2;
        var city = TRAILING_SEPARATORS.matcher(text.substring(0, cut).trim()).replaceAll("");
        return new String[] {city, text.substring(cut)};
    }

    /** Stores city and country and derives the combined "City, CC" location string. */
    private static void setLocation(TroostwijkScraper.AuctionInfo auction, String city, String country) {
        auction.city = city;
        auction.country = country;
        auction.location = city + ", " + country;
    }

    /** Counts the auctions whose country code equals {@code code}. */
    private static long countByCountry(List<TroostwijkScraper.AuctionInfo> auctions, String code) {
        return auctions.stream().filter(a -> code.equals(a.country)).count();
    }

    /** Prints one parsed auction together with the raw visible text of its link. */
    private static void printAuction(int ordinal, TroostwijkScraper.AuctionInfo auction, Element link) {
        System.out.println("\n--- Auction #" + ordinal + " ---");
        System.out.println("ID: " + auction.auctionId);
        System.out.println("Type: " + auction.type);
        System.out.println("Title: " + auction.title);
        System.out.println("Location: " + auction.location);
        System.out.println("City: " + auction.city);
        System.out.println("Country: " + auction.country);
        System.out.println("Lot Count: " + auction.lotCount);
        System.out.println("URL: " + auction.url);
        System.out.println("\nAll visible text from link:");
        System.out.println("\"" + link.text() + "\"");
    }
}