start
This commit is contained in:
@@ -308,8 +308,7 @@ public class TroostwijkAuctionExtractor {
|
||||
int endPos = Math.min(hrefPos + 1000, html.length());
|
||||
String context = html.substring(startPos, endPos);
|
||||
|
||||
// Try to find a location like "City, NL": a city name followed by a two-letter uppercase country code
|
||||
// More flexible pattern to catch various location formats
|
||||
// Pattern 1: Classic format "City, NL"
|
||||
java.util.regex.Pattern locPattern = java.util.regex.Pattern.compile(
|
||||
"([A-Za-z][A-Za-z\\s\\-']+),\\s*([A-Z]{2})(?![A-Za-z])");
|
||||
java.util.regex.Matcher locMatcher = locPattern.matcher(context);
|
||||
@@ -320,6 +319,31 @@ public class TroostwijkAuctionExtractor {
|
||||
return location;
|
||||
}
|
||||
|
||||
// Pattern 2: HTML format like "<span>City,<!-- --> </span>NL"
|
||||
// Extract city and country code separately
|
||||
java.util.regex.Pattern htmlPattern = java.util.regex.Pattern.compile(
|
||||
"<span[^>]*>([A-Za-z][A-Za-z\\s\\-',]+?)(?:,)?\\s*(?:<!--.*?-->)?\\s*</span>\\s*([A-Z]{2})(?![A-Za-z])");
|
||||
java.util.regex.Matcher htmlMatcher = htmlPattern.matcher(context);
|
||||
|
||||
if (htmlMatcher.find()) {
|
||||
String city = htmlMatcher.group(1).trim().replaceAll(",$", ""); // Remove trailing comma
|
||||
String country = htmlMatcher.group(2);
|
||||
String location = city + ", " + country;
|
||||
System.out.println(" Found location (HTML): " + location + " for auction " + href);
|
||||
return location;
|
||||
}
|
||||
|
||||
// Pattern 3: Fallback - just find country code after HTML tags
|
||||
java.util.regex.Pattern countryPattern = java.util.regex.Pattern.compile(
|
||||
"(?:-->|</span>|</div>)\\s*([A-Z]{2})(?![A-Za-z])");
|
||||
java.util.regex.Matcher countryMatcher = countryPattern.matcher(context);
|
||||
|
||||
if (countryMatcher.find()) {
|
||||
String country = countryMatcher.group(1);
|
||||
System.out.println(" Found country code: " + country + " for auction " + href);
|
||||
return "Unknown, " + country;
|
||||
}
|
||||
|
||||
System.out.println(" ⚠️ No location found for auction " + href);
|
||||
return "Unknown";
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user