This commit is contained in:
2025-11-28 06:23:30 +01:00
parent 0f5800441a
commit b174f77f6c
6 changed files with 512 additions and 37 deletions

View File

@@ -308,8 +308,7 @@ public class TroostwijkAuctionExtractor {
int endPos = Math.min(hrefPos + 1000, html.length());
String context = html.substring(startPos, endPos);
// Try to find location pattern like "City, NL" or "City, Country"
// More flexible pattern to catch various location formats
// Pattern 1: Classic format "City, NL"
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
"([A-Za-z][A-Za-z\\s\\-']+),\\s*([A-Z]{2})(?![A-Za-z])");
java.util.regex.Matcher locMatcher = locPattern.matcher(context);
@@ -320,6 +319,31 @@ public class TroostwijkAuctionExtractor {
return location;
}
// Pattern 2: HTML format like "<span>City,<!-- --> </span>NL"
// Extract city and country code separately
java.util.regex.Pattern htmlPattern = java.util.regex.Pattern.compile(
"<span[^>]*>([A-Za-z][A-Za-z\\s\\-',]+?)(?:,)?\\s*(?:<!--.*?-->)?\\s*</span>\\s*([A-Z]{2})(?![A-Za-z])");
java.util.regex.Matcher htmlMatcher = htmlPattern.matcher(context);
if (htmlMatcher.find()) {
String city = htmlMatcher.group(1).trim().replaceAll(",$", ""); // Remove trailing comma
String country = htmlMatcher.group(2);
String location = city + ", " + country;
System.out.println(" Found location (HTML): " + location + " for auction " + href);
return location;
}
// Pattern 3: Fallback - just find country code after HTML tags
java.util.regex.Pattern countryPattern = java.util.regex.Pattern.compile(
"(?:-->|</span>|</div>)\\s*([A-Z]{2})(?![A-Za-z])");
java.util.regex.Matcher countryMatcher = countryPattern.matcher(context);
if (countryMatcher.find()) {
String country = countryMatcher.group(1);
System.out.println(" Found country code: " + country + " for auction " + href);
return "Unknown, " + country;
}
System.out.println(" ⚠️ No location found for auction " + href);
return "Unknown";
}