start
This commit is contained in:
@@ -20,259 +20,94 @@ import static org.junit.jupiter.api.Assertions.*;
|
||||
* Tests the markup data extraction for each auction found
|
||||
*/
|
||||
public class AuctionParsingTest {
|
||||
|
||||
private static String testHtml;
|
||||
|
||||
@BeforeAll
|
||||
public static void loadTestHtml() throws IOException {
|
||||
// Load the test HTML file
|
||||
testHtml = Files.readString(Paths.get("src/test/resources/test_auctions.html"));
|
||||
System.out.println("Loaded test HTML (" + testHtml.length() + " characters)");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseAuctionsFromTestHtml() {
|
||||
// Parse the HTML with JSoup
|
||||
Document doc = Jsoup.parse(testHtml);
|
||||
|
||||
// Find all auction links
|
||||
Elements auctionLinks = doc.select("a[href^='/a/']");
|
||||
|
||||
System.out.println("\n=== Auction Parsing Test ===");
|
||||
System.out.println("Found " + auctionLinks.size() + " auction links");
|
||||
|
||||
List<AuctionInfo> auctions = new ArrayList<>();
|
||||
int count = 0;
|
||||
|
||||
for (Element link : auctionLinks) {
|
||||
String href = link.attr("href");
|
||||
|
||||
// Extract auction ID from URL
|
||||
java.util.regex.Pattern pattern = java.util.regex.Pattern.compile("/a/.*?-A([17])-(\\d+)");
|
||||
java.util.regex.Matcher matcher = pattern.matcher(href);
|
||||
|
||||
if (!matcher.find()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
String typeNum = matcher.group(1);
|
||||
int auctionId = Integer.parseInt(matcher.group(2));
|
||||
|
||||
// Extract auction info using IMPROVED text-based method
|
||||
AuctionInfo auction = extractAuctionInfoFromText(link, href, auctionId, "A" + typeNum);
|
||||
auctions.add(auction);
|
||||
|
||||
// Print the first 10 auctions for verification
|
||||
if (count < 10) {
|
||||
System.out.println("\n--- Auction #" + (count + 1) + " ---");
|
||||
System.out.println("ID: " + auction.auctionId);
|
||||
System.out.println("Type: " + auction.type);
|
||||
System.out.println("Title: " + auction.title);
|
||||
System.out.println("Location: " + auction.location);
|
||||
System.out.println("City: " + auction.city);
|
||||
System.out.println("Country: " + auction.country);
|
||||
System.out.println("Lot Count: " + auction.lotCount);
|
||||
System.out.println("URL: " + auction.url);
|
||||
|
||||
// Print ALL visible text for debugging
|
||||
System.out.println("\nAll visible text from link:");
|
||||
System.out.println("\"" + link.text() + "\"");
|
||||
}
|
||||
|
||||
count++;
|
||||
}
|
||||
|
||||
System.out.println("\n=== Summary ===");
|
||||
System.out.println("Total auctions parsed: " + auctions.size());
|
||||
|
||||
// Count by country
|
||||
long nlCount = auctions.stream().filter(a -> "NL".equals(a.country)).count();
|
||||
long bgCount = auctions.stream().filter(a -> "BG".equals(a.country)).count();
|
||||
long deCount = auctions.stream().filter(a -> "DE".equals(a.country)).count();
|
||||
long beCount = auctions.stream().filter(a -> "BE".equals(a.country)).count();
|
||||
|
||||
System.out.println("Dutch (NL) auctions: " + nlCount);
|
||||
System.out.println("Bulgarian (BG) auctions: " + bgCount);
|
||||
System.out.println("German (DE) auctions: " + deCount);
|
||||
System.out.println("Belgian (BE) auctions: " + beCount);
|
||||
System.out.println("Unknown location: " + auctions.stream().filter(a -> a.country == null).count());
|
||||
|
||||
// Assertions
|
||||
assertTrue(auctions.size() > 0, "Should find at least one auction");
|
||||
|
||||
// Verify all auctions have basic info
|
||||
for (AuctionInfo auction : auctions) {
|
||||
assertNotNull(auction.title, "Title should not be null for auction " + auction.auctionId);
|
||||
assertTrue(auction.title.length() > 0, "Title should not be empty for auction " + auction.auctionId);
|
||||
assertNotNull(auction.url, "URL should not be null for auction " + auction.auctionId);
|
||||
assertTrue(auction.auctionId > 0, "Auction ID should be positive");
|
||||
assertNotNull(auction.location, "Location should not be null for auction " + auction.auctionId);
|
||||
assertNotNull(auction.country, "Country should not be null for auction " + auction.auctionId);
|
||||
assertTrue(auction.lotCount > 0, "Lot count should be positive for auction " + auction.auctionId);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* IMPROVED: Extract auction info using .text() method
|
||||
* This parses the human-readable text instead of HTML markup
|
||||
*
|
||||
* Expected format: "[day] om [time] [lot_count] [title] [city], [CC]"
|
||||
* Example: "woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE"
|
||||
*/
|
||||
private AuctionInfo extractAuctionInfoFromText(Element link, String href, int auctionId, String type) {
|
||||
AuctionInfo auction = new AuctionInfo();
|
||||
auction.auctionId = auctionId;
|
||||
auction.type = type;
|
||||
auction.url = "https://www.troostwijkauctions.com" + href;
|
||||
|
||||
// Get ALL visible text from the link (this removes all HTML tags)
|
||||
String allText = link.text().trim();
|
||||
|
||||
// Pattern: "[day] om [time] [lot_count] [title] [city], [CC]"
|
||||
// Example: "woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE"
|
||||
|
||||
// Step 1: Extract closing time (day + time)
|
||||
java.util.regex.Pattern timePattern = java.util.regex.Pattern.compile(
|
||||
"(\\w+)\\s+om\\s+(\\d{1,2}:\\d{2})"
|
||||
);
|
||||
java.util.regex.Matcher timeMatcher = timePattern.matcher(allText);
|
||||
|
||||
String remainingText = allText;
|
||||
if (timeMatcher.find()) {
|
||||
String day = timeMatcher.group(1); // e.g., "woensdag"
|
||||
String time = timeMatcher.group(2); // e.g., "18:00"
|
||||
|
||||
// Store closing time info (could be parsed to LocalDateTime with proper date)
|
||||
System.out.println(" Closing time: " + day + " om " + time);
|
||||
|
||||
// Remove the time part from text
|
||||
remainingText = allText.substring(timeMatcher.end()).trim();
|
||||
}
|
||||
|
||||
// Step 2: Extract location from the END (always ends with ", CC")
|
||||
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
|
||||
"([A-ZÀ-ÿa-z][A-ZÀ-ÿa-z\\s\\-'öäüßàèéêëïôùûç]+?),\\s*([A-Z]{2})\\s*$"
|
||||
);
|
||||
java.util.regex.Matcher locMatcher = locPattern.matcher(remainingText);
|
||||
|
||||
if (locMatcher.find()) {
|
||||
auction.city = locMatcher.group(1).trim();
|
||||
auction.country = locMatcher.group(2);
|
||||
auction.location = auction.city + ", " + auction.country;
|
||||
|
||||
// Remove location from end
|
||||
remainingText = remainingText.substring(0, locMatcher.start()).trim();
|
||||
}
|
||||
|
||||
// Step 3: Extract lot count (first number after time)
|
||||
java.util.regex.Pattern lotPattern = java.util.regex.Pattern.compile(
|
||||
"^(\\d+)\\s+"
|
||||
);
|
||||
java.util.regex.Matcher lotMatcher = lotPattern.matcher(remainingText);
|
||||
|
||||
if (lotMatcher.find()) {
|
||||
auction.lotCount = Integer.parseInt(lotMatcher.group(1));
|
||||
|
||||
// Remove lot count from beginning
|
||||
remainingText = remainingText.substring(lotMatcher.end()).trim();
|
||||
}
|
||||
|
||||
// Step 4: What remains is the title
|
||||
if (!remainingText.isEmpty()) {
|
||||
auction.title = remainingText;
|
||||
} else {
|
||||
// Fallback: use URL slug for title
|
||||
java.util.regex.Pattern titlePattern = java.util.regex.Pattern.compile("/a/(.+?)-A[17]-");
|
||||
java.util.regex.Matcher titleMatcher = titlePattern.matcher(href);
|
||||
if (titleMatcher.find()) {
|
||||
String slug = titleMatcher.group(1).replace("-", " ").replace("%7C", "|");
|
||||
auction.title = slug.substring(0, 1).toUpperCase() + slug.substring(1);
|
||||
|
||||
private static String testHtml;
|
||||
|
||||
@BeforeAll
|
||||
public static void loadTestHtml() throws IOException {
|
||||
// Load the test HTML file
|
||||
testHtml = Files.readString(Paths.get("src/test/resources/test_auctions.html"));
|
||||
System.out.println("Loaded test HTML (" + testHtml.length() + " characters)");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLocationPatternMatching() {
|
||||
System.out.println("\n=== Location Pattern Tests ===");
|
||||
|
||||
// Test different location formats
|
||||
String[] testCases = {
|
||||
"<p>Amsterdam, NL</p>",
|
||||
"<p class=\"flex truncate\"><span class=\"w-full truncate\">Sofia,<!-- --> </span>BG</p>",
|
||||
"<p>Berlin, DE</p>",
|
||||
"<span>Brussels,</span>BE"
|
||||
};
|
||||
|
||||
for (String testHtml : testCases) {
|
||||
Document doc = Jsoup.parse(testHtml);
|
||||
Element elem = doc.select("p, span").first();
|
||||
|
||||
if (elem != null) {
|
||||
String text = elem.text();
|
||||
System.out.println("\nTest: " + testHtml);
|
||||
System.out.println("Text: " + text);
|
||||
|
||||
// Test regex pattern
|
||||
if (text.matches(".*[A-Z]{2}$")) {
|
||||
String countryCode = text.substring(text.length() - 2);
|
||||
String cityPart = text.substring(0, text.length() - 2).trim().replaceAll("[,\\s]+$", "");
|
||||
System.out.println("→ Extracted: " + cityPart + ", " + countryCode);
|
||||
} else {
|
||||
auction.title = "Unknown Auction";
|
||||
System.out.println("→ No match");
|
||||
}
|
||||
}
|
||||
|
||||
return auction;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLocationPatternMatching() {
|
||||
System.out.println("\n=== Location Pattern Tests ===");
|
||||
|
||||
// Test different location formats
|
||||
String[] testCases = {
|
||||
"<p>Amsterdam, NL</p>",
|
||||
"<p class=\"flex truncate\"><span class=\"w-full truncate\">Sofia,<!-- --> </span>BG</p>",
|
||||
"<p>Berlin, DE</p>",
|
||||
"<span>Brussels,</span>BE"
|
||||
};
|
||||
|
||||
for (String testHtml : testCases) {
|
||||
Document doc = Jsoup.parse(testHtml);
|
||||
Element elem = doc.select("p, span").first();
|
||||
|
||||
if (elem != null) {
|
||||
String text = elem.text();
|
||||
System.out.println("\nTest: " + testHtml);
|
||||
System.out.println("Text: " + text);
|
||||
|
||||
// Test regex pattern
|
||||
if (text.matches(".*[A-Z]{2}$")) {
|
||||
String countryCode = text.substring(text.length() - 2);
|
||||
String cityPart = text.substring(0, text.length() - 2).trim().replaceAll("[,\\s]+$", "");
|
||||
System.out.println("→ Extracted: " + cityPart + ", " + countryCode);
|
||||
} else {
|
||||
System.out.println("→ No match");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFullTextPatternMatching() {
|
||||
System.out.println("\n=== Full Text Pattern Tests ===");
|
||||
|
||||
// Test the complete auction text format
|
||||
String[] testCases = {
|
||||
"woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE",
|
||||
"maandag om 14:30 5 Industriële machines Amsterdam, NL",
|
||||
"vrijdag om 10:00 12 Landbouwmachines Antwerpen, BE"
|
||||
};
|
||||
|
||||
for (String testText : testCases) {
|
||||
System.out.println("\nParsing: \"" + testText + "\"");
|
||||
|
||||
// Simulated extraction
|
||||
String remaining = testText;
|
||||
|
||||
// Extract time
|
||||
java.util.regex.Pattern timePattern = java.util.regex.Pattern.compile("(\\w+)\\s+om\\s+(\\d{1,2}:\\d{2})");
|
||||
java.util.regex.Matcher timeMatcher = timePattern.matcher(remaining);
|
||||
if (timeMatcher.find()) {
|
||||
System.out.println(" Time: " + timeMatcher.group(1) + " om " + timeMatcher.group(2));
|
||||
remaining = remaining.substring(timeMatcher.end()).trim();
|
||||
}
|
||||
|
||||
// Extract location
|
||||
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
|
||||
"([A-ZÀ-ÿa-z][A-ZÀ-ÿa-z\\s\\-'öäüßàèéêëïôùûç]+?),\\s*([A-Z]{2})\\s*$"
|
||||
);
|
||||
java.util.regex.Matcher locMatcher = locPattern.matcher(remaining);
|
||||
if (locMatcher.find()) {
|
||||
System.out.println(" Location: " + locMatcher.group(1) + ", " + locMatcher.group(2));
|
||||
remaining = remaining.substring(0, locMatcher.start()).trim();
|
||||
}
|
||||
|
||||
// Extract lot count
|
||||
java.util.regex.Pattern lotPattern = java.util.regex.Pattern.compile("^(\\d+)\\s+");
|
||||
java.util.regex.Matcher lotMatcher = lotPattern.matcher(remaining);
|
||||
if (lotMatcher.find()) {
|
||||
System.out.println(" Lot count: " + lotMatcher.group(1));
|
||||
remaining = remaining.substring(lotMatcher.end()).trim();
|
||||
}
|
||||
|
||||
// What remains is title
|
||||
System.out.println(" Title: " + remaining);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFullTextPatternMatching() {
|
||||
System.out.println("\n=== Full Text Pattern Tests ===");
|
||||
|
||||
// Test the complete auction text format
|
||||
String[] testCases = {
|
||||
"woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE",
|
||||
"maandag om 14:30 5 Industriële machines Amsterdam, NL",
|
||||
"vrijdag om 10:00 12 Landbouwmachines Antwerpen, BE"
|
||||
};
|
||||
|
||||
for (String testText : testCases) {
|
||||
System.out.println("\nParsing: \"" + testText + "\"");
|
||||
|
||||
// Simulated extraction
|
||||
String remaining = testText;
|
||||
|
||||
// Extract time
|
||||
java.util.regex.Pattern timePattern = java.util.regex.Pattern.compile("(\\w+)\\s+om\\s+(\\d{1,2}:\\d{2})");
|
||||
java.util.regex.Matcher timeMatcher = timePattern.matcher(remaining);
|
||||
if (timeMatcher.find()) {
|
||||
System.out.println(" Time: " + timeMatcher.group(1) + " om " + timeMatcher.group(2));
|
||||
remaining = remaining.substring(timeMatcher.end()).trim();
|
||||
}
|
||||
|
||||
// Extract location
|
||||
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
|
||||
"([A-ZÀ-ÿa-z][A-ZÀ-ÿa-z\\s\\-'öäüßàèéêëïôùûç]+?),\\s*([A-Z]{2})\\s*$"
|
||||
);
|
||||
java.util.regex.Matcher locMatcher = locPattern.matcher(remaining);
|
||||
if (locMatcher.find()) {
|
||||
System.out.println(" Location: " + locMatcher.group(1) + ", " + locMatcher.group(2));
|
||||
remaining = remaining.substring(0, locMatcher.start()).trim();
|
||||
}
|
||||
|
||||
// Extract lot count
|
||||
java.util.regex.Pattern lotPattern = java.util.regex.Pattern.compile("^(\\d+)\\s+");
|
||||
java.util.regex.Matcher lotMatcher = lotPattern.matcher(remaining);
|
||||
if (lotMatcher.find()) {
|
||||
System.out.println(" Lot count: " + lotMatcher.group(1));
|
||||
remaining = remaining.substring(lotMatcher.end()).trim();
|
||||
}
|
||||
|
||||
// What remains is title
|
||||
System.out.println(" Title: " + remaining);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,12 +1,8 @@
|
||||
package com.auction;
|
||||
|
||||
import com.vladsch.flexmark.html2md.converter.FlexmarkHtmlConverter;
|
||||
import com.vladsch.flexmark.util.data.DataHolder;
|
||||
import net.bytebuddy.build.Plugin.Engine.Source.Element;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import org.junit.jupiter.api.extension.Extensions;
|
||||
public class Parser {
|
||||
|
||||
public record AuctionItem(
|
||||
|
||||
Reference in New Issue
Block a user