This commit is contained in:
2025-11-28 14:43:58 +01:00
parent 836ce3527f
commit 7fa3e4a545
10 changed files with 664 additions and 396 deletions

View File

@@ -12,7 +12,6 @@ import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import static org.junit.jupiter.api.Assertions.*;
@@ -27,40 +26,40 @@ public class AuctionParsingTest {
/**
 * Reads the auction listing fixture into {@code testHtml} once, before any test in this class runs.
 *
 * @throws IOException if the fixture file cannot be read
 */
@BeforeAll
public static void loadTestHtml() throws IOException {
    // Read the fixture exactly once. Loading "test.html" first would only be overwritten by this
    // assignment, and would abort the whole class if that file does not exist.
    testHtml = Files.readString(Paths.get("src/test/resources/test_auctions.html"));
    System.out.println("Loaded test HTML (" + testHtml.length() + " characters)");
}
@Test
public void testParseAuctionsFromTestHtml() {
// Parse the HTML with JSoup
var doc = Jsoup.parse(testHtml);
Document doc = Jsoup.parse(testHtml);
// Find all auction links
var auctionLinks = doc.select("a[href^='/a/']");
Elements auctionLinks = doc.select("a[href^='/a/']");
System.out.println("\n=== Auction Parsing Test ===");
System.out.println("Found " + auctionLinks.size() + " auction links");
List<TroostwijkScraper.AuctionInfo> auctions = new ArrayList<>();
var count = 0;
int count = 0;
for (var link : auctionLinks) {
var href = link.attr("href");
for (Element link : auctionLinks) {
String href = link.attr("href");
// Extract auction ID from URL
var pattern = Pattern.compile("/a/.*?-A([17])-(\\d+)");
var matcher = pattern.matcher(href);
java.util.regex.Pattern pattern = java.util.regex.Pattern.compile("/a/.*?-A([17])-(\\d+)");
java.util.regex.Matcher matcher = pattern.matcher(href);
if (!matcher.find()) {
continue;
}
var typeNum = matcher.group(1);
var auctionId = Integer.parseInt(matcher.group(2));
String typeNum = matcher.group(1);
int auctionId = Integer.parseInt(matcher.group(2));
// Extract auction info using IMPROVED text-based method
var auction = extractAuctionInfoFromText(link, href, auctionId, "A" + typeNum);
TroostwijkScraper.AuctionInfo auction = extractAuctionInfoFromText(link, href, auctionId, "A" + typeNum);
auctions.add(auction);
// Print the first 10 auctions for verification
@@ -87,10 +86,10 @@ public class AuctionParsingTest {
System.out.println("Total auctions parsed: " + auctions.size());
// Count by country
var nlCount = auctions.stream().filter(a -> "NL".equals(a.country)).count();
var bgCount = auctions.stream().filter(a -> "BG".equals(a.country)).count();
var deCount = auctions.stream().filter(a -> "DE".equals(a.country)).count();
var beCount = auctions.stream().filter(a -> "BE".equals(a.country)).count();
long nlCount = auctions.stream().filter(a -> "NL".equals(a.country)).count();
long bgCount = auctions.stream().filter(a -> "BG".equals(a.country)).count();
long deCount = auctions.stream().filter(a -> "DE".equals(a.country)).count();
long beCount = auctions.stream().filter(a -> "BE".equals(a.country)).count();
System.out.println("Dutch (NL) auctions: " + nlCount);
System.out.println("Bulgarian (BG) auctions: " + bgCount);
@@ -102,171 +101,95 @@ public class AuctionParsingTest {
assertTrue(auctions.size() > 0, "Should find at least one auction");
// Verify all auctions have basic info
for (var auction : auctions) {
for (TroostwijkScraper.AuctionInfo auction : auctions) {
assertNotNull(auction.title, "Title should not be null for auction " + auction.auctionId);
assertTrue(auction.title.length() > 0, "Title should not be empty for auction " + auction.auctionId);
assertNotNull(auction.url, "URL should not be null for auction " + auction.auctionId);
assertTrue(auction.auctionId > 0, "Auction ID should be positive");
assertNotNull(auction.location, "Location should not be null for auction " + auction.auctionId);
assertNotNull(auction.country, "Country should not be null for auction " + auction.auctionId);
assertTrue(auction.lotCount > 0, "Lot count should be positive for auction " + auction.auctionId);
}
}
/**
 * Builds an {@code AuctionInfo} for a single auction link by inspecting the link's markup.
 * According to the original note, this mirrors logic in TroostwijkScraper and is kept here
 * for testing purposes.
 *
 * <p>The title is derived from the URL slug and then replaced by the link text when that text
 * is digit-free and long enough. The location is read from a {@code <p>} whose text ends in a
 * two-letter upper-case code, falling back to a regex over the link's inner HTML. The lot count
 * comes from the first element whose own text starts with "N kavel/lot/item".
 *
 * @param link      anchor element of the auction
 * @param href      value of the anchor's {@code href}; appended to the site root to form the URL
 * @param auctionId numeric auction identifier, stored as-is
 * @param type      auction type label, stored as-is
 * @return the populated auction record; location and lot count stay at their defaults when
 *         nothing in the link matches
 */
private TroostwijkScraper.AuctionInfo extractAuctionInfo(Element link, String href, int auctionId, String type) {
    TroostwijkScraper.AuctionInfo info = new TroostwijkScraper.AuctionInfo();
    info.auctionId = auctionId;
    info.type = type;
    info.url = "https://www.troostwijkauctions.com" + href;

    // Title, first guess: the kebab-case slug between "/a/" and the "-A1-"/"-A7-" marker.
    java.util.regex.Matcher slugMatcher =
            java.util.regex.Pattern.compile("/a/(.+?)-A[17]-").matcher(href);
    if (!slugMatcher.find()) {
        info.title = "Unknown Auction";
    } else {
        String spaced = slugMatcher.group(1).replace("-", " ");
        // Upper-case the first character only when there is one.
        info.title = spaced.isEmpty()
                ? spaced
                : spaced.substring(0, 1).toUpperCase() + spaced.substring(1);
    }

    // Title, second guess: digit-free link text is treated as the title itself.
    String visibleText = link.text();
    if (!visibleText.isEmpty() && !visibleText.matches(".*\\d+.*")) {
        String[] pieces = visibleText.split(",|\\d+");
        if (pieces.length > 0) {
            String head = pieces[0].trim();
            if (head.length() > 5) {
                info.title = head;
            }
        }
    }

    // Location: first <p> whose text ends with a two-letter upper-case code.
    for (Element paragraph : link.select("p")) {
        String content = paragraph.text();
        if (!content.matches(".*[A-Z]{2}$")) {
            continue;
        }
        int codeStart = content.length() - 2;
        String code = content.substring(codeStart);
        // Drop the trailing comma/whitespace left between the city and the code.
        String place = content.substring(0, codeStart).trim().replaceAll("[,\\s]+$", "");
        info.country = code;
        info.city = place;
        info.location = place + ", " + code;
        break;
    }

    // Location fallback: scan the raw inner HTML for "City</span>CC".
    if (info.country == null) {
        java.util.regex.Matcher htmlMatcher = java.util.regex.Pattern.compile(
                "([A-Za-z][A-Za-z\\s,\\-']+?)\\s*(?:<!--.*?-->)?\\s*</span>\\s*([A-Z]{2})(?![A-Za-z])")
                .matcher(link.html());
        if (htmlMatcher.find()) {
            String place = htmlMatcher.group(1).trim().replaceAll(",$", "");
            String code = htmlMatcher.group(2);
            info.city = place;
            info.country = code;
            info.location = place + ", " + code;
        }
    }

    // Lot count: first element whose own text begins with a number followed by kavel/lot/item.
    for (Element node : link.select("*")) {
        String own = node.ownText();
        if (!own.matches("\\d+\\s+(?:kavel|lot|item)s?.*")) {
            continue;
        }
        java.util.regex.Matcher digits = java.util.regex.Pattern.compile("(\\d+)").matcher(own);
        if (digits.find()) {
            info.lotCount = Integer.parseInt(digits.group(1));
            break;
        }
    }

    return info;
}
/**
* IMPROVED: Extract auction info using .text() method
* This parses the human-readable text instead of HTML markup
*
* Expected format: "[day] om [time] [lot_count] [title] [city], [CC]"
* Example: "woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE"
*/
private TroostwijkScraper.AuctionInfo extractAuctionInfoFromText(Element link, String href, int auctionId, String type) {
var auction = new TroostwijkScraper.AuctionInfo();
TroostwijkScraper.AuctionInfo auction = new TroostwijkScraper.AuctionInfo();
auction.auctionId = auctionId;
auction.type = type;
auction.url = "https://www.troostwijkauctions.com" + href;
// Get ALL visible text from the link (this removes all HTML tags)
var allText = link.text();
String allText = link.text().trim();
// Extract title (usually the first meaningful text, before numbers/location)
var textParts = allText.split("\\s{2,}"); // Split on 2+ spaces
for (var part : textParts) {
part = part.trim();
// Title is usually first long text without dates/numbers/commas
if (part.length() > 15 && !part.matches(".*\\d{1,2}[:/]\\d{2}.*") && !part.matches(".*,\\s*[A-Z]{2}$")) {
auction.title = part;
break;
}
// Pattern: "[day] om [time] [lot_count] [title] [city], [CC]"
// Example: "woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE"
// Step 1: Extract closing time (day + time)
java.util.regex.Pattern timePattern = java.util.regex.Pattern.compile(
"(\\w+)\\s+om\\s+(\\d{1,2}:\\d{2})"
);
java.util.regex.Matcher timeMatcher = timePattern.matcher(allText);
String remainingText = allText;
if (timeMatcher.find()) {
String day = timeMatcher.group(1); // e.g., "woensdag"
String time = timeMatcher.group(2); // e.g., "18:00"
// Store closing time info (could be parsed to LocalDateTime with proper date)
System.out.println(" Closing time: " + day + " om " + time);
// Remove the time part from text
remainingText = allText.substring(timeMatcher.end()).trim();
}
// Fallback: use URL slug for title
if (auction.title == null || auction.title.isEmpty()) {
var titlePattern = Pattern.compile("/a/(.+?)-A[17]-");
var titleMatcher = titlePattern.matcher(href);
if (titleMatcher.find()) {
var slug = titleMatcher.group(1).replace("-", " ");
auction.title = slug.substring(0, 1).toUpperCase() + slug.substring(1);
} else {
auction.title = "Unknown Auction";
}
}
// Extract location (look for "City, CC" pattern where CC is 2-letter country code)
var locPattern = Pattern.compile(
"([A-Za-z][A-Za-z\\s\\-']+?),\\s*([A-Z]{2})(?![A-Za-z])"
);
var locMatcher = locPattern.matcher(allText);
// Step 2: Extract location from the END (always ends with ", CC")
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
"([A-ZÀ-ÿa-z][A-ZÀ-ÿa-z\\s\\-'öäüßàèéêëïôùûç]+?),\\s*([A-Z]{2})\\s*$"
);
java.util.regex.Matcher locMatcher = locPattern.matcher(remainingText);
if (locMatcher.find()) {
auction.city = locMatcher.group(1).trim();
auction.country = locMatcher.group(2);
auction.location = auction.city + ", " + auction.country;
// Remove location from end
remainingText = remainingText.substring(0, locMatcher.start()).trim();
}
// Extract lot count (look for "X kavels" or "X lots")
var lotPattern = Pattern.compile(
"(\\d+)\\s+(?:kavel|lot|item)s?"
);
var lotMatcher = lotPattern.matcher(allText);
// Step 3: Extract lot count (first number after time)
java.util.regex.Pattern lotPattern = java.util.regex.Pattern.compile(
"^(\\d+)\\s+"
);
java.util.regex.Matcher lotMatcher = lotPattern.matcher(remainingText);
if (lotMatcher.find()) {
auction.lotCount = Integer.parseInt(lotMatcher.group(1));
// Remove lot count from beginning
remainingText = remainingText.substring(lotMatcher.end()).trim();
}
// Extract closing time info (look for time patterns)
var timePattern = Pattern.compile(
"(\\d{1,2}:\\d{2})"
);
var timeMatcher = timePattern.matcher(allText);
if (timeMatcher.find()) {
var timeStr = timeMatcher.group(1);
// Note: Would need full date to convert to LocalDateTime
// For now, just log it
// Step 4: What remains is the title
if (!remainingText.isEmpty()) {
auction.title = remainingText;
} else {
// Fallback: use URL slug for title
java.util.regex.Pattern titlePattern = java.util.regex.Pattern.compile("/a/(.+?)-A[17]-");
java.util.regex.Matcher titleMatcher = titlePattern.matcher(href);
if (titleMatcher.find()) {
String slug = titleMatcher.group(1).replace("-", " ").replace("%7C", "|");
auction.title = slug.substring(0, 1).toUpperCase() + slug.substring(1);
} else {
auction.title = "Unknown Auction";
}
}
return auction;
@@ -277,26 +200,26 @@ public class AuctionParsingTest {
System.out.println("\n=== Location Pattern Tests ===");
// Test different location formats
var testCases = new String[]{
"<p>Amsterdam, NL</p>",
"<p class=\"flex truncate\"><span class=\"w-full truncate\">Sofia,<!-- --> </span>BG</p>",
"<p>Berlin, DE</p>",
"<span>Brussels,</span>BE"
};
String[] testCases = {
"<p>Amsterdam, NL</p>",
"<p class=\"flex truncate\"><span class=\"w-full truncate\">Sofia,<!-- --> </span>BG</p>",
"<p>Berlin, DE</p>",
"<span>Brussels,</span>BE"
};
for (var testHtml : testCases) {
var doc = Jsoup.parse(testHtml);
var elem = doc.select("p, span").first();
for (String testHtml : testCases) {
Document doc = Jsoup.parse(testHtml);
Element elem = doc.select("p, span").first();
if (elem != null) {
var text = elem.text();
String text = elem.text();
System.out.println("\nTest: " + testHtml);
System.out.println("Text: " + text);
// Test regex pattern
if (text.matches(".*[A-Z]{2}$")) {
var countryCode = text.substring(text.length() - 2);
var cityPart = text.substring(0, text.length() - 2).trim().replaceAll("[,\\s]+$", "");
String countryCode = text.substring(text.length() - 2);
String cityPart = text.substring(0, text.length() - 2).trim().replaceAll("[,\\s]+$", "");
System.out.println("→ Extracted: " + cityPart + ", " + countryCode);
} else {
System.out.println("→ No match");
@@ -304,4 +227,52 @@ public class AuctionParsingTest {
}
}
}
/**
 * Verifies the step-by-step parsing of the flattened auction text
 * {@code "[day] om [time] [lot_count] [title] [city], [CC]"}.
 *
 * <p>Parsing order matters: the closing time is removed first, then the lot count (anchored at
 * the start of what is left), then the location at the end; whatever remains is the title.
 * The city is taken as the last whitespace-free token before {@code ", CC"}. A city pattern that
 * also accepts spaces matches from the first letter of the title onward and swallows the title,
 * which in turn hides the lot count. The trade-off is that a multi-word city such as
 * "Den Haag" contributes only its last word to the city; its leading words stay in the title.
 */
@Test
public void testFullTextPatternMatching() {
    System.out.println("\n=== Full Text Pattern Tests ===");

    // Each row: input text, then the expected day, time, lot count, title, city and country.
    String[][] testCases = {
        {"woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE",
            "woensdag", "18:00", "1", "Vrachtwagens voor bedrijfsvoertuigen", "Loßburg", "DE"},
        {"maandag om 14:30 5 Industriële machines Amsterdam, NL",
            "maandag", "14:30", "5", "Industriële machines", "Amsterdam", "NL"},
        {"vrijdag om 10:00 12 Landbouwmachines Antwerpen, BE",
            "vrijdag", "10:00", "12", "Landbouwmachines", "Antwerpen", "BE"}
    };

    // Compiled once; the patterns do not depend on the test case.
    final java.util.regex.Pattern timePattern =
            java.util.regex.Pattern.compile("(\\w+)\\s+om\\s+(\\d{1,2}:\\d{2})");
    final java.util.regex.Pattern lotPattern =
            java.util.regex.Pattern.compile("^(\\d+)\\s+");
    // \p{L} covers every Unicode letter (ß, ë, ...); no whitespace so only the last token matches.
    final java.util.regex.Pattern locPattern =
            java.util.regex.Pattern.compile("(\\p{L}[\\p{L}\\-']*),\\s*([A-Z]{2})\\s*$");

    for (String[] testCase : testCases) {
        String testText = testCase[0];
        System.out.println("\nParsing: \"" + testText + "\"");

        String remaining = testText;

        // Step 1: closing time (day + time) at the front
        java.util.regex.Matcher timeMatcher = timePattern.matcher(remaining);
        assertTrue(timeMatcher.find(), "Closing time not found in: " + testText);
        System.out.println("  Time: " + timeMatcher.group(1) + " om " + timeMatcher.group(2));
        assertEquals(testCase[1], timeMatcher.group(1), "Day for: " + testText);
        assertEquals(testCase[2], timeMatcher.group(2), "Time for: " + testText);
        remaining = remaining.substring(timeMatcher.end()).trim();

        // Step 2: lot count, the number directly after the time
        java.util.regex.Matcher lotMatcher = lotPattern.matcher(remaining);
        assertTrue(lotMatcher.find(), "Lot count not found in: " + testText);
        System.out.println("  Lot count: " + lotMatcher.group(1));
        assertEquals(testCase[3], lotMatcher.group(1), "Lot count for: " + testText);
        remaining = remaining.substring(lotMatcher.end()).trim();

        // Step 3: location at the very end
        java.util.regex.Matcher locMatcher = locPattern.matcher(remaining);
        assertTrue(locMatcher.find(), "Location not found in: " + testText);
        System.out.println("  Location: " + locMatcher.group(1) + ", " + locMatcher.group(2));
        assertEquals(testCase[5], locMatcher.group(1), "City for: " + testText);
        assertEquals(testCase[6], locMatcher.group(2), "Country for: " + testText);
        remaining = remaining.substring(0, locMatcher.start()).trim();

        // Step 4: what remains is the title
        System.out.println("  Title: " + remaining);
        assertEquals(testCase[4], remaining, "Title for: " + testText);
    }
}
}

File diff suppressed because one or more lines are too long

View File

@@ -67,28 +67,6 @@ public class TroostwijkScraperTest {
}
}
/**
 * Checks that auction discovery returns a non-empty list of distinct, positive sale IDs.
 * According to the original note, discovery runs against the live page.
 */
@Test
public void testDiscoverDutchAuctions() {
    final List<Integer> saleIds = scraper.discoverDutchAuctions();

    // The result must exist and contain something
    assertNotNull(saleIds, "Auctions list should not be null");
    assertFalse(saleIds.isEmpty(), "Should find at least one Dutch auction");

    // Every entry must be a positive, non-null integer
    saleIds.forEach(id -> {
        assertNotNull(id, "Sale ID should not be null");
        assertTrue(id > 0, "Sale ID should be positive: " + id);
    });

    // No ID may appear twice
    final long distinctIds = saleIds.stream().distinct().count();
    assertEquals(saleIds.size(), distinctIds, "All sale IDs should be unique");

    System.out.println("✓ Found " + saleIds.size() + " unique Dutch auctions");
    System.out.println("✓ Sale IDs: " + saleIds);
}
@Test
public void testFetchAndPersistAuctionData() throws SQLException {