start
This commit is contained in:
7
.idea/data_source_mapping.xml
generated
7
.idea/data_source_mapping.xml
generated
@@ -1,7 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project version="4">
|
|
||||||
<component name="DataSourcePerFileMappings">
|
|
||||||
<file url="file://$APPLICATION_CONFIG_DIR$/consoles/db/9ac1bd58-f7d6-4a86-a165-f0a24a2b7cb9/console.sql" value="9ac1bd58-f7d6-4a86-a165-f0a24a2b7cb9" />
|
|
||||||
<file url="file://$APPLICATION_CONFIG_DIR$/consoles/db/9cb4e997-fbca-4426-9093-d308871c5d5e/console.sql" value="9cb4e997-fbca-4426-9093-d308871c5d5e" />
|
|
||||||
</component>
|
|
||||||
</project>
|
|
||||||
1
.idea/vcs.xml
generated
1
.idea/vcs.xml
generated
@@ -2,5 +2,6 @@
|
|||||||
<project version="4">
|
<project version="4">
|
||||||
<component name="VcsDirectoryMappings">
|
<component name="VcsDirectoryMappings">
|
||||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||||
|
<mapping directory="$PROJECT_DIR$/Auction" vcs="Git" />
|
||||||
</component>
|
</component>
|
||||||
</project>
|
</project>
|
||||||
BIN
cache/page_cache.db
vendored
BIN
cache/page_cache.db
vendored
Binary file not shown.
5
pom.xml
5
pom.xml
@@ -78,6 +78,11 @@
|
|||||||
<version>5.10.1</version>
|
<version>5.10.1</version>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.vladsch.flexmark</groupId>
|
||||||
|
<artifactId>flexmark-all</artifactId>
|
||||||
|
<version>0.64.8</version>
|
||||||
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
<build>
|
<build>
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ import java.nio.file.Files;
|
|||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
@@ -27,40 +26,40 @@ public class AuctionParsingTest {
|
|||||||
@BeforeAll
|
@BeforeAll
|
||||||
public static void loadTestHtml() throws IOException {
|
public static void loadTestHtml() throws IOException {
|
||||||
// Load the test HTML file
|
// Load the test HTML file
|
||||||
testHtml = Files.readString(Paths.get("src/test/resources/test.html"));
|
testHtml = Files.readString(Paths.get("src/test/resources/test_auctions.html"));
|
||||||
System.out.println("Loaded test HTML (" + testHtml.length() + " characters)");
|
System.out.println("Loaded test HTML (" + testHtml.length() + " characters)");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testParseAuctionsFromTestHtml() {
|
public void testParseAuctionsFromTestHtml() {
|
||||||
// Parse the HTML with JSoup
|
// Parse the HTML with JSoup
|
||||||
var doc = Jsoup.parse(testHtml);
|
Document doc = Jsoup.parse(testHtml);
|
||||||
|
|
||||||
// Find all auction links
|
// Find all auction links
|
||||||
var auctionLinks = doc.select("a[href^='/a/']");
|
Elements auctionLinks = doc.select("a[href^='/a/']");
|
||||||
|
|
||||||
System.out.println("\n=== Auction Parsing Test ===");
|
System.out.println("\n=== Auction Parsing Test ===");
|
||||||
System.out.println("Found " + auctionLinks.size() + " auction links");
|
System.out.println("Found " + auctionLinks.size() + " auction links");
|
||||||
|
|
||||||
List<TroostwijkScraper.AuctionInfo> auctions = new ArrayList<>();
|
List<TroostwijkScraper.AuctionInfo> auctions = new ArrayList<>();
|
||||||
var count = 0;
|
int count = 0;
|
||||||
|
|
||||||
for (var link : auctionLinks) {
|
for (Element link : auctionLinks) {
|
||||||
var href = link.attr("href");
|
String href = link.attr("href");
|
||||||
|
|
||||||
// Extract auction ID from URL
|
// Extract auction ID from URL
|
||||||
var pattern = Pattern.compile("/a/.*?-A([17])-(\\d+)");
|
java.util.regex.Pattern pattern = java.util.regex.Pattern.compile("/a/.*?-A([17])-(\\d+)");
|
||||||
var matcher = pattern.matcher(href);
|
java.util.regex.Matcher matcher = pattern.matcher(href);
|
||||||
|
|
||||||
if (!matcher.find()) {
|
if (!matcher.find()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
var typeNum = matcher.group(1);
|
String typeNum = matcher.group(1);
|
||||||
var auctionId = Integer.parseInt(matcher.group(2));
|
int auctionId = Integer.parseInt(matcher.group(2));
|
||||||
|
|
||||||
// Extract auction info using IMPROVED text-based method
|
// Extract auction info using IMPROVED text-based method
|
||||||
var auction = extractAuctionInfoFromText(link, href, auctionId, "A" + typeNum);
|
TroostwijkScraper.AuctionInfo auction = extractAuctionInfoFromText(link, href, auctionId, "A" + typeNum);
|
||||||
auctions.add(auction);
|
auctions.add(auction);
|
||||||
|
|
||||||
// Print the first 10 auctions for verification
|
// Print the first 10 auctions for verification
|
||||||
@@ -87,10 +86,10 @@ public class AuctionParsingTest {
|
|||||||
System.out.println("Total auctions parsed: " + auctions.size());
|
System.out.println("Total auctions parsed: " + auctions.size());
|
||||||
|
|
||||||
// Count by country
|
// Count by country
|
||||||
var nlCount = auctions.stream().filter(a -> "NL".equals(a.country)).count();
|
long nlCount = auctions.stream().filter(a -> "NL".equals(a.country)).count();
|
||||||
var bgCount = auctions.stream().filter(a -> "BG".equals(a.country)).count();
|
long bgCount = auctions.stream().filter(a -> "BG".equals(a.country)).count();
|
||||||
var deCount = auctions.stream().filter(a -> "DE".equals(a.country)).count();
|
long deCount = auctions.stream().filter(a -> "DE".equals(a.country)).count();
|
||||||
var beCount = auctions.stream().filter(a -> "BE".equals(a.country)).count();
|
long beCount = auctions.stream().filter(a -> "BE".equals(a.country)).count();
|
||||||
|
|
||||||
System.out.println("Dutch (NL) auctions: " + nlCount);
|
System.out.println("Dutch (NL) auctions: " + nlCount);
|
||||||
System.out.println("Bulgarian (BG) auctions: " + bgCount);
|
System.out.println("Bulgarian (BG) auctions: " + bgCount);
|
||||||
@@ -102,171 +101,95 @@ public class AuctionParsingTest {
|
|||||||
assertTrue(auctions.size() > 0, "Should find at least one auction");
|
assertTrue(auctions.size() > 0, "Should find at least one auction");
|
||||||
|
|
||||||
// Verify all auctions have basic info
|
// Verify all auctions have basic info
|
||||||
for (var auction : auctions) {
|
for (TroostwijkScraper.AuctionInfo auction : auctions) {
|
||||||
assertNotNull(auction.title, "Title should not be null for auction " + auction.auctionId);
|
assertNotNull(auction.title, "Title should not be null for auction " + auction.auctionId);
|
||||||
assertTrue(auction.title.length() > 0, "Title should not be empty for auction " + auction.auctionId);
|
assertTrue(auction.title.length() > 0, "Title should not be empty for auction " + auction.auctionId);
|
||||||
assertNotNull(auction.url, "URL should not be null for auction " + auction.auctionId);
|
assertNotNull(auction.url, "URL should not be null for auction " + auction.auctionId);
|
||||||
assertTrue(auction.auctionId > 0, "Auction ID should be positive");
|
assertTrue(auction.auctionId > 0, "Auction ID should be positive");
|
||||||
|
assertNotNull(auction.location, "Location should not be null for auction " + auction.auctionId);
|
||||||
|
assertNotNull(auction.country, "Country should not be null for auction " + auction.auctionId);
|
||||||
|
assertTrue(auction.lotCount > 0, "Lot count should be positive for auction " + auction.auctionId);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Helper method to extract auction info from a link element
|
|
||||||
* This is a copy of the logic from TroostwijkScraper for testing purposes
|
|
||||||
*/
|
|
||||||
private TroostwijkScraper.AuctionInfo extractAuctionInfo(Element link, String href, int auctionId, String type) {
|
|
||||||
var auction = new TroostwijkScraper.AuctionInfo();
|
|
||||||
auction.auctionId = auctionId;
|
|
||||||
auction.type = type;
|
|
||||||
auction.url = "https://www.troostwijkauctions.com" + href;
|
|
||||||
|
|
||||||
// Extract title from href (convert kebab-case to title)
|
|
||||||
var titlePattern = Pattern.compile("/a/(.+?)-A[17]-");
|
|
||||||
var titleMatcher = titlePattern.matcher(href);
|
|
||||||
if (titleMatcher.find()) {
|
|
||||||
var slug = titleMatcher.group(1);
|
|
||||||
auction.title = slug.replace("-", " ");
|
|
||||||
// Capitalize first letter
|
|
||||||
if (!auction.title.isEmpty()) {
|
|
||||||
auction.title = auction.title.substring(0, 1).toUpperCase() + auction.title.substring(1);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
auction.title = "Unknown Auction";
|
|
||||||
}
|
|
||||||
|
|
||||||
// Try to find title in link text (more accurate)
|
|
||||||
var linkText = link.text();
|
|
||||||
if (!linkText.isEmpty() && !linkText.matches(".*\\d+.*")) {
|
|
||||||
// If link text doesn't contain numbers, it's likely the title
|
|
||||||
var parts = linkText.split(",|\\d+");
|
|
||||||
if (parts.length > 0 && parts[0].trim().length() > 5) {
|
|
||||||
auction.title = parts[0].trim();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract location using JSoup selectors
|
|
||||||
// Look for <p> tags that contain location info
|
|
||||||
var locationElements = link.select("p");
|
|
||||||
for (var p : locationElements) {
|
|
||||||
var text = p.text();
|
|
||||||
|
|
||||||
// Pattern: "City, Country" or "City, Region, Country"
|
|
||||||
if (text.matches(".*[A-Z]{2}$")) {
|
|
||||||
// Ends with 2-letter country code
|
|
||||||
var countryCode = text.substring(text.length() - 2);
|
|
||||||
var cityPart = text.substring(0, text.length() - 2).trim();
|
|
||||||
|
|
||||||
// Remove trailing comma or whitespace
|
|
||||||
cityPart = cityPart.replaceAll("[,\\s]+$", "");
|
|
||||||
|
|
||||||
auction.country = countryCode;
|
|
||||||
auction.city = cityPart;
|
|
||||||
auction.location = cityPart + ", " + countryCode;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fallback: check HTML content directly
|
|
||||||
if (auction.country == null) {
|
|
||||||
var html = link.html();
|
|
||||||
var locPattern = Pattern.compile(
|
|
||||||
"([A-Za-z][A-Za-z\\s,\\-']+?)\\s*(?:<!--.*?-->)?\\s*</span>\\s*([A-Z]{2})(?![A-Za-z])");
|
|
||||||
var locMatcher = locPattern.matcher(html);
|
|
||||||
|
|
||||||
if (locMatcher.find()) {
|
|
||||||
var city = locMatcher.group(1).trim().replaceAll(",$", "");
|
|
||||||
var country = locMatcher.group(2);
|
|
||||||
auction.city = city;
|
|
||||||
auction.country = country;
|
|
||||||
auction.location = city + ", " + country;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract lot count if available (kavels/lots)
|
|
||||||
var textElements = link.select("*");
|
|
||||||
for (var elem : textElements) {
|
|
||||||
var text = elem.ownText();
|
|
||||||
if (text.matches("\\d+\\s+(?:kavel|lot|item)s?.*")) {
|
|
||||||
var countPattern = Pattern.compile("(\\d+)");
|
|
||||||
var countMatcher = countPattern.matcher(text);
|
|
||||||
if (countMatcher.find()) {
|
|
||||||
auction.lotCount = Integer.parseInt(countMatcher.group(1));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return auction;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* IMPROVED: Extract auction info using .text() method
|
* IMPROVED: Extract auction info using .text() method
|
||||||
* This parses the human-readable text instead of HTML markup
|
* This parses the human-readable text instead of HTML markup
|
||||||
|
*
|
||||||
|
* Expected format: "[day] om [time] [lot_count] [title] [city], [CC]"
|
||||||
|
* Example: "woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE"
|
||||||
*/
|
*/
|
||||||
private TroostwijkScraper.AuctionInfo extractAuctionInfoFromText(Element link, String href, int auctionId, String type) {
|
private TroostwijkScraper.AuctionInfo extractAuctionInfoFromText(Element link, String href, int auctionId, String type) {
|
||||||
var auction = new TroostwijkScraper.AuctionInfo();
|
TroostwijkScraper.AuctionInfo auction = new TroostwijkScraper.AuctionInfo();
|
||||||
auction.auctionId = auctionId;
|
auction.auctionId = auctionId;
|
||||||
auction.type = type;
|
auction.type = type;
|
||||||
auction.url = "https://www.troostwijkauctions.com" + href;
|
auction.url = "https://www.troostwijkauctions.com" + href;
|
||||||
|
|
||||||
// Get ALL visible text from the link (this removes all HTML tags)
|
// Get ALL visible text from the link (this removes all HTML tags)
|
||||||
var allText = link.text();
|
String allText = link.text().trim();
|
||||||
|
|
||||||
// Extract title (usually the first meaningful text, before numbers/location)
|
// Pattern: "[day] om [time] [lot_count] [title] [city], [CC]"
|
||||||
var textParts = allText.split("\\s{2,}"); // Split on 2+ spaces
|
// Example: "woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE"
|
||||||
for (var part : textParts) {
|
|
||||||
part = part.trim();
|
|
||||||
// Title is usually first long text without dates/numbers/commas
|
|
||||||
if (part.length() > 15 && !part.matches(".*\\d{1,2}[:/]\\d{2}.*") && !part.matches(".*,\\s*[A-Z]{2}$")) {
|
|
||||||
auction.title = part;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fallback: use URL slug for title
|
// Step 1: Extract closing time (day + time)
|
||||||
if (auction.title == null || auction.title.isEmpty()) {
|
java.util.regex.Pattern timePattern = java.util.regex.Pattern.compile(
|
||||||
var titlePattern = Pattern.compile("/a/(.+?)-A[17]-");
|
"(\\w+)\\s+om\\s+(\\d{1,2}:\\d{2})"
|
||||||
var titleMatcher = titlePattern.matcher(href);
|
|
||||||
if (titleMatcher.find()) {
|
|
||||||
var slug = titleMatcher.group(1).replace("-", " ");
|
|
||||||
auction.title = slug.substring(0, 1).toUpperCase() + slug.substring(1);
|
|
||||||
} else {
|
|
||||||
auction.title = "Unknown Auction";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract location (look for "City, CC" pattern where CC is 2-letter country code)
|
|
||||||
var locPattern = Pattern.compile(
|
|
||||||
"([A-Za-z][A-Za-z\\s\\-']+?),\\s*([A-Z]{2})(?![A-Za-z])"
|
|
||||||
);
|
);
|
||||||
var locMatcher = locPattern.matcher(allText);
|
java.util.regex.Matcher timeMatcher = timePattern.matcher(allText);
|
||||||
|
|
||||||
|
String remainingText = allText;
|
||||||
|
if (timeMatcher.find()) {
|
||||||
|
String day = timeMatcher.group(1); // e.g., "woensdag"
|
||||||
|
String time = timeMatcher.group(2); // e.g., "18:00"
|
||||||
|
|
||||||
|
// Store closing time info (could be parsed to LocalDateTime with proper date)
|
||||||
|
System.out.println(" Closing time: " + day + " om " + time);
|
||||||
|
|
||||||
|
// Remove the time part from text
|
||||||
|
remainingText = allText.substring(timeMatcher.end()).trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 2: Extract location from the END (always ends with ", CC")
|
||||||
|
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
|
||||||
|
"([A-ZÀ-ÿa-z][A-ZÀ-ÿa-z\\s\\-'öäüßàèéêëïôùûç]+?),\\s*([A-Z]{2})\\s*$"
|
||||||
|
);
|
||||||
|
java.util.regex.Matcher locMatcher = locPattern.matcher(remainingText);
|
||||||
|
|
||||||
if (locMatcher.find()) {
|
if (locMatcher.find()) {
|
||||||
auction.city = locMatcher.group(1).trim();
|
auction.city = locMatcher.group(1).trim();
|
||||||
auction.country = locMatcher.group(2);
|
auction.country = locMatcher.group(2);
|
||||||
auction.location = auction.city + ", " + auction.country;
|
auction.location = auction.city + ", " + auction.country;
|
||||||
|
|
||||||
|
// Remove location from end
|
||||||
|
remainingText = remainingText.substring(0, locMatcher.start()).trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract lot count (look for "X kavels" or "X lots")
|
// Step 3: Extract lot count (first number after time)
|
||||||
var lotPattern = Pattern.compile(
|
java.util.regex.Pattern lotPattern = java.util.regex.Pattern.compile(
|
||||||
"(\\d+)\\s+(?:kavel|lot|item)s?"
|
"^(\\d+)\\s+"
|
||||||
);
|
);
|
||||||
var lotMatcher = lotPattern.matcher(allText);
|
java.util.regex.Matcher lotMatcher = lotPattern.matcher(remainingText);
|
||||||
|
|
||||||
if (lotMatcher.find()) {
|
if (lotMatcher.find()) {
|
||||||
auction.lotCount = Integer.parseInt(lotMatcher.group(1));
|
auction.lotCount = Integer.parseInt(lotMatcher.group(1));
|
||||||
|
|
||||||
|
// Remove lot count from beginning
|
||||||
|
remainingText = remainingText.substring(lotMatcher.end()).trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract closing time info (look for time patterns)
|
// Step 4: What remains is the title
|
||||||
var timePattern = Pattern.compile(
|
if (!remainingText.isEmpty()) {
|
||||||
"(\\d{1,2}:\\d{2})"
|
auction.title = remainingText;
|
||||||
);
|
} else {
|
||||||
var timeMatcher = timePattern.matcher(allText);
|
// Fallback: use URL slug for title
|
||||||
|
java.util.regex.Pattern titlePattern = java.util.regex.Pattern.compile("/a/(.+?)-A[17]-");
|
||||||
if (timeMatcher.find()) {
|
java.util.regex.Matcher titleMatcher = titlePattern.matcher(href);
|
||||||
var timeStr = timeMatcher.group(1);
|
if (titleMatcher.find()) {
|
||||||
// Note: Would need full date to convert to LocalDateTime
|
String slug = titleMatcher.group(1).replace("-", " ").replace("%7C", "|");
|
||||||
// For now, just log it
|
auction.title = slug.substring(0, 1).toUpperCase() + slug.substring(1);
|
||||||
|
} else {
|
||||||
|
auction.title = "Unknown Auction";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return auction;
|
return auction;
|
||||||
@@ -277,26 +200,26 @@ public class AuctionParsingTest {
|
|||||||
System.out.println("\n=== Location Pattern Tests ===");
|
System.out.println("\n=== Location Pattern Tests ===");
|
||||||
|
|
||||||
// Test different location formats
|
// Test different location formats
|
||||||
var testCases = new String[]{
|
String[] testCases = {
|
||||||
"<p>Amsterdam, NL</p>",
|
"<p>Amsterdam, NL</p>",
|
||||||
"<p class=\"flex truncate\"><span class=\"w-full truncate\">Sofia,<!-- --> </span>BG</p>",
|
"<p class=\"flex truncate\"><span class=\"w-full truncate\">Sofia,<!-- --> </span>BG</p>",
|
||||||
"<p>Berlin, DE</p>",
|
"<p>Berlin, DE</p>",
|
||||||
"<span>Brussels,</span>BE"
|
"<span>Brussels,</span>BE"
|
||||||
};
|
};
|
||||||
|
|
||||||
for (var testHtml : testCases) {
|
for (String testHtml : testCases) {
|
||||||
var doc = Jsoup.parse(testHtml);
|
Document doc = Jsoup.parse(testHtml);
|
||||||
var elem = doc.select("p, span").first();
|
Element elem = doc.select("p, span").first();
|
||||||
|
|
||||||
if (elem != null) {
|
if (elem != null) {
|
||||||
var text = elem.text();
|
String text = elem.text();
|
||||||
System.out.println("\nTest: " + testHtml);
|
System.out.println("\nTest: " + testHtml);
|
||||||
System.out.println("Text: " + text);
|
System.out.println("Text: " + text);
|
||||||
|
|
||||||
// Test regex pattern
|
// Test regex pattern
|
||||||
if (text.matches(".*[A-Z]{2}$")) {
|
if (text.matches(".*[A-Z]{2}$")) {
|
||||||
var countryCode = text.substring(text.length() - 2);
|
String countryCode = text.substring(text.length() - 2);
|
||||||
var cityPart = text.substring(0, text.length() - 2).trim().replaceAll("[,\\s]+$", "");
|
String cityPart = text.substring(0, text.length() - 2).trim().replaceAll("[,\\s]+$", "");
|
||||||
System.out.println("→ Extracted: " + cityPart + ", " + countryCode);
|
System.out.println("→ Extracted: " + cityPart + ", " + countryCode);
|
||||||
} else {
|
} else {
|
||||||
System.out.println("→ No match");
|
System.out.println("→ No match");
|
||||||
@@ -304,4 +227,52 @@ public class AuctionParsingTest {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFullTextPatternMatching() {
|
||||||
|
System.out.println("\n=== Full Text Pattern Tests ===");
|
||||||
|
|
||||||
|
// Test the complete auction text format
|
||||||
|
String[] testCases = {
|
||||||
|
"woensdag om 18:00 1 Vrachtwagens voor bedrijfsvoertuigen Loßburg, DE",
|
||||||
|
"maandag om 14:30 5 Industriële machines Amsterdam, NL",
|
||||||
|
"vrijdag om 10:00 12 Landbouwmachines Antwerpen, BE"
|
||||||
|
};
|
||||||
|
|
||||||
|
for (String testText : testCases) {
|
||||||
|
System.out.println("\nParsing: \"" + testText + "\"");
|
||||||
|
|
||||||
|
// Simulated extraction
|
||||||
|
String remaining = testText;
|
||||||
|
|
||||||
|
// Extract time
|
||||||
|
java.util.regex.Pattern timePattern = java.util.regex.Pattern.compile("(\\w+)\\s+om\\s+(\\d{1,2}:\\d{2})");
|
||||||
|
java.util.regex.Matcher timeMatcher = timePattern.matcher(remaining);
|
||||||
|
if (timeMatcher.find()) {
|
||||||
|
System.out.println(" Time: " + timeMatcher.group(1) + " om " + timeMatcher.group(2));
|
||||||
|
remaining = remaining.substring(timeMatcher.end()).trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract location
|
||||||
|
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
|
||||||
|
"([A-ZÀ-ÿa-z][A-ZÀ-ÿa-z\\s\\-'öäüßàèéêëïôùûç]+?),\\s*([A-Z]{2})\\s*$"
|
||||||
|
);
|
||||||
|
java.util.regex.Matcher locMatcher = locPattern.matcher(remaining);
|
||||||
|
if (locMatcher.find()) {
|
||||||
|
System.out.println(" Location: " + locMatcher.group(1) + ", " + locMatcher.group(2));
|
||||||
|
remaining = remaining.substring(0, locMatcher.start()).trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract lot count
|
||||||
|
java.util.regex.Pattern lotPattern = java.util.regex.Pattern.compile("^(\\d+)\\s+");
|
||||||
|
java.util.regex.Matcher lotMatcher = lotPattern.matcher(remaining);
|
||||||
|
if (lotMatcher.find()) {
|
||||||
|
System.out.println(" Lot count: " + lotMatcher.group(1));
|
||||||
|
remaining = remaining.substring(lotMatcher.end()).trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
// What remains is title
|
||||||
|
System.out.println(" Title: " + remaining);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
64
src/test/java/com/auction/Parser.java
Normal file
64
src/test/java/com/auction/Parser.java
Normal file
File diff suppressed because one or more lines are too long
@@ -67,28 +67,6 @@ public class TroostwijkScraperTest {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testDiscoverDutchAuctions() {
|
|
||||||
// Discover auctions from the live page
|
|
||||||
List<Integer> auctions = scraper.discoverDutchAuctions();
|
|
||||||
|
|
||||||
// Verify that auctions were found
|
|
||||||
assertNotNull(auctions, "Auctions list should not be null");
|
|
||||||
assertFalse(auctions.isEmpty(), "Should find at least one Dutch auction");
|
|
||||||
|
|
||||||
// Verify that all sale IDs are positive integers
|
|
||||||
for (Integer saleId : auctions) {
|
|
||||||
assertNotNull(saleId, "Sale ID should not be null");
|
|
||||||
assertTrue(saleId > 0, "Sale ID should be positive: " + saleId);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Verify no duplicate sale IDs
|
|
||||||
long uniqueCount = auctions.stream().distinct().count();
|
|
||||||
assertEquals(auctions.size(), uniqueCount, "All sale IDs should be unique");
|
|
||||||
|
|
||||||
System.out.println("✓ Found " + auctions.size() + " unique Dutch auctions");
|
|
||||||
System.out.println("✓ Sale IDs: " + auctions);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testFetchAndPersistAuctionData() throws SQLException {
|
public void testFetchAndPersistAuctionData() throws SQLException {
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
456
src/test/resources/test.md
Normal file
456
src/test/resources/test.md
Normal file
@@ -0,0 +1,456 @@
|
|||||||
|
## Woensdag 3 dec 25
|
||||||
|
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 16:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
145
|
||||||
|
|
||||||
|
Industrie & machines
|
||||||
|
|
||||||
|
Meerdere locaties (45)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/industrie-machines-A3-37358)
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 16:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
38
|
||||||
|
|
||||||
|
D | Raceautotransporters, kraan-polypengrepen en containers uit voorraadaanpassing
|
||||||
|
|
||||||
|
Nieheim, DE
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/d-%7C-raceautotransporters-kraan-polypengrepen-en-containers-uit-voorraadaanpassing-A1-39772)
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 16:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
61
|
||||||
|
|
||||||
|
Voedselverwerkende apparatuur en verpakkingsmachines
|
||||||
|
|
||||||
|
CHOMERAC, FR
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/voedselverwerkende-apparatuur-en-verpakkingsmachines-A1-39319)
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 16:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
117
|
||||||
|
|
||||||
|
Landbouw- & grondverzetmachines
|
||||||
|
|
||||||
|
Meerdere locaties (49)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/landbouw-grondverzetmachines-A3-37375)
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 17:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
261
|
||||||
|
|
||||||
|
Gereedschappen & uitrusting
|
||||||
|
|
||||||
|
Meerdere locaties (36), BE
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/gereedschappen-uitrusting-A3-37367)
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 18:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
1
|
||||||
|
|
||||||
|
Vrachtwagens voor bedrijfsvoertuigen
|
||||||
|
|
||||||
|
Loßburg, DE
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/vrachtwagens-voor-bedrijfsvoertuigen-A7-39531)
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 19:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
61
|
||||||
|
|
||||||
|
Witgoed en accessoires
|
||||||
|
|
||||||
|
Etten-Leur, NL
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/witgoed-en-accessoires-A1-27241)
|
||||||
|
* [
|
||||||
|
|
||||||
|
Opent 28 nov 17:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
54
|
||||||
|
|
||||||
|
Collectie Rolex en Cartier horloges
|
||||||
|
|
||||||
|
Dordrecht, NL
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/collectie-rolex-en-cartier-horloges-A1-39398)
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 19:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
254
|
||||||
|
|
||||||
|
SHOWROOMKEUKENS en INBOUWAPPARATUUR
|
||||||
|
|
||||||
|
Tilburg, NL
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/showroomkeukens-en-inbouwapparatuur-A1-39480)
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 19:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
499
|
||||||
|
|
||||||
|
Machines, retourgoederen en restpartijen
|
||||||
|
|
||||||
|
Harlingen, NL
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/machines-retourgoederen-en-restpartijen-A1-39642)
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 19:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
120
|
||||||
|
|
||||||
|
Partijen gereedschap, kantoorinventaris, detailhandelgoederen, decoratie en olijfbomen
|
||||||
|
|
||||||
|
Meerdere locaties (3), NL
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/partijen-gereedschap-kantoorinventaris-detailhandelgoederen-decoratie-en-olijfbomen-A1-27016)
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 19:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
16
|
||||||
|
|
||||||
|
Faillissementsvoertuigen
|
||||||
|
|
||||||
|
Meerdere locaties (3), NL
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/faillissementsvoertuigen-A1-38368)
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 19:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
78
|
||||||
|
|
||||||
|
Personenauto’s, oldtimers, campers en brommobielen
|
||||||
|
|
||||||
|
Buitenpost, NL
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/personenauto%E2%80%99s-oldtimers-campers-en-brommobielen-A1-39508)
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 19:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
391
|
||||||
|
|
||||||
|
Bezorgveiling Faillissement Dvize B.V. – Hyundai Power Products gereedschappen
|
||||||
|
|
||||||
|
Meerdere locaties (2)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/bezorgveiling-faillissement-dvize-b-v-%E2%80%93-hyundai-power-products-gereedschappen-A1-39409)
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 19:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
208
|
||||||
|
|
||||||
|
Kunstplanten en bomen, composiet gevel- en vloerbekleding en akoestische materialen
|
||||||
|
|
||||||
|
De Lier, NL
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/kunstplanten-en-bomen-composiet-gevel-en-vloerbekleding-en-akoestische-materialen-A1-28707)
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 19:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
181
|
||||||
|
|
||||||
|
Metaalbewerkingsmachines, gereedschap en voorraad in verband met bedrijfsverhuizing
|
||||||
|
|
||||||
|
Cuijk, NL
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/metaalbewerkingsmachines-gereedschap-en-voorraad-in-verband-met-bedrijfsverhuizing-A1-39360)
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 19:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
238
|
||||||
|
|
||||||
|
Overstock en magazijnopruiming
|
||||||
|
|
||||||
|
Heesch, NL
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/overstock-en-magazijnopruiming-A1-39538)
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 19:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
47
|
||||||
|
|
||||||
|
Verzamelveiling Scooters en Motoren
|
||||||
|
|
||||||
|
Meerdere locaties (2), NL
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/verzamelveiling-scooters-en-motoren-A1-28428)
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 19:00
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
338
|
||||||
|
|
||||||
|
Auto's & transport
|
||||||
|
|
||||||
|
Meerdere locaties (109)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/auto%27s-transport-A3-37349)
|
||||||
|
* [
|
||||||
|
|
||||||
|
woensdag om 19:30
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
74
|
||||||
|
|
||||||
|
Gouden juwelen en diamanten
|
||||||
|
|
||||||
|
Meerdere locaties (28)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
](/a/gouden-juwelen-en-diamanten-A1-29562)
|
||||||
BIN
troostwijk.db
BIN
troostwijk.db
Binary file not shown.
Reference in New Issue
Block a user