This commit is contained in:
2025-11-28 05:16:51 +01:00
parent ec2efd4661
commit b560240c17
6 changed files with 307 additions and 3 deletions

5
.idea/compiler.xml generated
View File

@@ -10,4 +10,9 @@
</profile> </profile>
</annotationProcessing> </annotationProcessing>
</component> </component>
<!-- No JavacSettings override for troostwijk-scraper: enable-native-access is an option of the java launcher, not of javac, so the compiler rejects it. Put the flag in the run configuration's VM options instead. -->
</project> </project>

24
.idea/dataSources.xml generated
View File

@@ -61,5 +61,29 @@
</library> </library>
</libraries> </libraries>
</data-source> </data-source>
<!-- SQLite data source for the scraper's page cache (cache/page_cache.db under the project directory). -->
<data-source source="LOCAL" name="page_cache" uuid="9ac1bd58-f7d6-4a86-a165-f0a24a2b7cb9">
  <driver-ref>sqlite.xerial</driver-ref>
  <synchronize>true</synchronize>
  <jdbc-driver>org.sqlite.JDBC</jdbc-driver>
  <jdbc-url>jdbc:sqlite:$PROJECT_DIR$/cache/page_cache.db</jdbc-url>
  <working-dir>$ProjectFileDir$</working-dir>
  <!-- Driver classpath: each jar listed once, and a single sqlite-jdbc version (3.45.1.0) so the loaded driver is unambiguous. -->
  <libraries>
    <library>
      <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
    </library>
    <library>
      <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
    </library>
  </libraries>
</data-source>
</component> </component>
</project> </project>

164
RUN_INSTRUCTIONS.md Normal file
View File

@@ -0,0 +1,164 @@
# Troostwijk Auction Extractor - Run Instructions
## Fixed Warnings
All warnings have been resolved:
- ✅ SLF4J logging configured (slf4j-simple)
- ✅ Native access enabled for SQLite JDBC
- ✅ Logging output controlled via simplelogger.properties
## Prerequisites
1. **Java 21** installed
2. **Maven** installed
3. **IntelliJ IDEA** (recommended) or command line
## Setup (First Time Only)
### 1. Install Dependencies
In IntelliJ Terminal or PowerShell:
```bash
# Reload Maven dependencies
mvn clean install
# Install Playwright browser binaries (first time only)
mvn exec:java -e -Dexec.mainClass=com.microsoft.playwright.CLI -Dexec.args="install"
```
## Running the Application
### Option A: Using IntelliJ IDEA (Easiest)
1. **Add VM Options for native access:**
- Run → Edit Configurations
- Select or create configuration for `TroostwijkAuctionExtractor`
- In "VM options" field, add:
```
--enable-native-access=ALL-UNNAMED
```
2. **Add Program Arguments (optional):**
- In "Program arguments" field, add:
```
--max-visits 3
```
3. **Run the application:**
- Click the green Run button
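### Option B: Using Maven (Command Line)
`exec:java` runs the application inside Maven's own JVM, so JVM flags such as `--enable-native-access=ALL-UNNAMED` must be given to Maven itself, for example through the `MAVEN_OPTS` environment variable or a `.mvn/jvm.config` file.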
```bash
# Run with the default arguments from pom.xml (--max-visits 3)
mvn exec:java
# Run with custom arguments (override pom.xml defaults)
mvn exec:java -Dexec.args="--max-visits 5"
# Run without cache
mvn exec:java -Dexec.args="--no-cache --max-visits 2"
# Run with unlimited visits (0 = unlimited; overrides the pom.xml default of 3)
mvn exec:java -Dexec.args="--max-visits 0"
```
### Option C: Using Java Directly
```bash
# Compile first
mvn clean compile
# Run with native access enabled (Unix shell syntax; on Windows use ';' as the classpath separator)
java --enable-native-access=ALL-UNNAMED \
-cp target/classes:$(mvn dependency:build-classpath -Dmdep.outputFile=/dev/stdout -q) \
com.auction.TroostwijkAuctionExtractor --max-visits 3
```
## Command Line Arguments
```
--max-visits <n> Limit actual page fetches to n (0 = unlimited, default)
--no-cache Disable page caching
--help Show help message
```
## Examples
### Test with 3 page visits (cached pages don't count):
```bash
mvn exec:java -Dexec.args="--max-visits 3"
```
### Fresh extraction without cache:
```bash
mvn exec:java -Dexec.args="--no-cache --max-visits 5"
```
### Full extraction (all pages, unlimited):
```bash
mvn exec:java -Dexec.args="--max-visits 0"
```
## Expected Output (No Warnings)
```
=== Troostwijk Auction Extractor ===
Max page visits set to: 3
Initializing Playwright browser...
✓ Browser ready
✓ Cache database initialized
Starting auction extraction from https://www.troostwijkauctions.com/auctions
[Page 1] Fetching auctions...
✓ Fetched from website (visit 1/3)
✓ Found 20 auctions
[Page 2] Fetching auctions...
✓ Loaded from cache
✓ Found 20 auctions
[Page 3] Fetching auctions...
✓ Fetched from website (visit 2/3)
✓ Found 20 auctions
✓ Total auctions extracted: 60
=== Results ===
Total auctions found: 60
Dutch auctions (NL): 45
Actual page visits: 2
✓ Browser and cache closed
```
## Cache Management
- Cache is stored in: `cache/page_cache.db`
- Cache expires after: 24 hours (configurable in code)
- To clear cache: Delete `cache/page_cache.db` file
## Troubleshooting
### If you still see warnings:
1. **Reload Maven project in IntelliJ:**
- Right-click `pom.xml` → Maven → Reload project
2. **Verify VM options:**
- Ensure `--enable-native-access=ALL-UNNAMED` is in VM options
3. **Clean and rebuild:**
```bash
mvn clean install
```
### If Playwright fails:
```bash
# Reinstall browser binaries
mvn exec:java -e -Dexec.mainClass=com.microsoft.playwright.CLI -Dexec.args="install chromium"
```

38
pom.xml
View File

@@ -60,6 +60,18 @@
<artifactId>playwright</artifactId> <artifactId>playwright</artifactId>
<version>1.40.0</version> <version>1.40.0</version>
</dependency> </dependency>
<!-- Logging: the SLF4J API plus the slf4j-simple implementation, both pinned to the same version (2.0.9) -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>2.0.9</version>
</dependency>
<!-- slf4j-simple is the logging implementation; its output is configured in simplelogger.properties -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-simple</artifactId>
<version>2.0.9</version>
</dependency>
</dependencies> </dependencies>
<build> <build>
@@ -72,6 +84,32 @@
<configuration> <configuration>
<source>21</source> <source>21</source>
<target>21</target> <target>21</target>
<!-- No compilerArgs here: enable-native-access is an option of the java launcher, not a javac flag. It is only relevant when the application is run, not when it is compiled. -->
</configuration>
</plugin>
<!-- Maven Exec Plugin: default main class and program arguments for "mvn exec:java" -->
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>3.1.0</version>
<configuration>
<mainClass>com.auction.TroostwijkAuctionExtractor</mainClass>
<!-- Default program arguments (limit of 3 page visits); -Dexec.args on the command line replaces them -->
<arguments>
<argument>--max-visits</argument>
<argument>3</argument>
</arguments>
<!-- Line format used by java.util.logging's SimpleFormatter -->
<systemProperties>
<systemProperty>
<key>java.util.logging.SimpleFormatter.format</key>
<value>%1$tF %1$tT %4$s %2$s %5$s%6$s%n</value>
</systemProperty>
</systemProperties>
<!-- exec:java runs inside Maven's own JVM and has no parameter for JVM options, so the enable-native-access flag cannot be configured here. Pass it to the Maven JVM instead, e.g. through MAVEN_OPTS or a .mvn/jvm.config file. -->
</configuration> </configuration>
</plugin> </plugin>
<!-- <plugin> <!-- <plugin>

View File

@@ -53,6 +53,8 @@ public class TroostwijkAuctionExtractor {
private final ObjectMapper objectMapper; private final ObjectMapper objectMapper;
private final boolean useCache; private final boolean useCache;
private final CacheDatabase cacheDb; private final CacheDatabase cacheDb;
private final int maxPageVisits; // Maximum number of pages to fetch (0 = unlimited)
private int pageVisitCount; // Counter for actual page fetches (not from cache)
private Playwright playwright; private Playwright playwright;
private Browser browser; private Browser browser;
@@ -77,10 +79,13 @@ public class TroostwijkAuctionExtractor {
* Constructor * Constructor
* *
* @param useCache Enable database caching of visited pages * @param useCache Enable database caching of visited pages
* @param maxPageVisits Maximum number of actual page fetches (0 = unlimited)
*/ */
public TroostwijkAuctionExtractor(boolean useCache) throws SQLException, IOException { public TroostwijkAuctionExtractor(boolean useCache, int maxPageVisits) throws SQLException, IOException {
this.objectMapper = new ObjectMapper(); this.objectMapper = new ObjectMapper();
this.useCache = useCache; this.useCache = useCache;
this.maxPageVisits = maxPageVisits;
this.pageVisitCount = 0;
this.cacheDb = useCache ? new CacheDatabase(CACHE_DB_PATH) : null; this.cacheDb = useCache ? new CacheDatabase(CACHE_DB_PATH) : null;
if (useCache) { if (useCache) {
@@ -88,6 +93,15 @@ public class TroostwijkAuctionExtractor {
} }
} }
/**
* Convenience constructor that places no limit on page visits.
* Delegates to the two-argument constructor with maxPageVisits = 0 (0 = unlimited).
*
* @param useCache Enable database caching of visited pages
* @throws SQLException declared because the delegated two-argument constructor declares it
* @throws IOException declared because the delegated two-argument constructor declares it
*/
public TroostwijkAuctionExtractor(boolean useCache) throws SQLException, IOException {
this(useCache, 0); // 0 = unlimited
}
/** /**
* Initializes Playwright and browser instance * Initializes Playwright and browser instance
* Call this before extracting auctions * Call this before extracting auctions
@@ -145,14 +159,24 @@ public class TroostwijkAuctionExtractor {
System.out.println(" ✓ Loaded from cache"); System.out.println(" ✓ Loaded from cache");
html = cachedHtml; html = cachedHtml;
} else { } else {
// Check if we've reached the maximum page visit limit
if (maxPageVisits > 0 && pageVisitCount >= maxPageVisits) {
System.out.println(" ⚠️ Reached maximum page visit limit (" + maxPageVisits + "), stopping");
break;
}
// Fetch with Playwright // Fetch with Playwright
html = fetchPageWithPlaywright(pageNumber); html = fetchPageWithPlaywright(pageNumber);
pageVisitCount++; // Increment actual page fetch counter
if (html == null || html.isEmpty()) { if (html == null || html.isEmpty()) {
System.out.println(" ⚠️ Failed to fetch page, stopping pagination"); System.out.println(" ⚠️ Failed to fetch page, stopping pagination");
break; break;
} }
System.out.println(" ✓ Fetched from website (visit " + pageVisitCount +
(maxPageVisits > 0 ? "/" + maxPageVisits : "") + ")");
// Save to cache // Save to cache
if (useCache) { if (useCache) {
saveToCache(pageNumber, html); saveToCache(pageNumber, html);
@@ -371,13 +395,41 @@ public class TroostwijkAuctionExtractor {
/** /**
* Entry point for testing * Entry point for testing
*
* Arguments:
* --max-visits <number> : Maximum number of page visits (0 = unlimited, default)
* --no-cache : Disable caching
*/ */
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
System.out.println("=== Troostwijk Auction Extractor ===\n"); System.out.println("=== Troostwijk Auction Extractor ===\n");
// Enable caching by default // Parse command line arguments
boolean useCache = true; boolean useCache = true;
TroostwijkAuctionExtractor extractor = new TroostwijkAuctionExtractor(useCache); int maxVisits = 0; // 0 = unlimited
// Parse command line arguments. Invalid input is reported on stderr and ends the
// program, instead of being silently ignored or surfacing as a raw stack trace.
for (int i = 0; i < args.length; i++) {
    switch (args[i]) {
        case "--max-visits": {
            // The option needs a value: a non-negative integer (0 = unlimited).
            if (i + 1 >= args.length) {
                System.err.println("Error: --max-visits requires a number (0 = unlimited)");
                return;
            }
            String rawVisits = args[++i];
            try {
                maxVisits = Integer.parseInt(rawVisits);
            } catch (NumberFormatException e) {
                System.err.println("Error: invalid value for --max-visits: " + rawVisits);
                return;
            }
            if (maxVisits < 0) {
                System.err.println("Error: --max-visits must not be negative: " + rawVisits);
                return;
            }
            System.out.println("Max page visits set to: " + maxVisits);
            break;
        }
        case "--no-cache":
            useCache = false;
            System.out.println("Caching disabled");
            break;
        case "--help":
            System.out.println("Usage: java TroostwijkAuctionExtractor [options]");
            System.out.println("Options:");
            System.out.println(" --max-visits <n> : Limit actual page fetches to n (0 = unlimited)");
            System.out.println(" --no-cache : Disable page caching");
            System.out.println(" --help : Show this help message");
            return;
        default:
            // Keep running, but make typos visible rather than dropping them silently.
            System.err.println("Warning: ignoring unknown argument: " + args[i]);
            break;
    }
}
TroostwijkAuctionExtractor extractor = new TroostwijkAuctionExtractor(useCache, maxVisits);
try { try {
// Initialize browser // Initialize browser
@@ -392,6 +444,7 @@ public class TroostwijkAuctionExtractor {
System.out.println("\n=== Results ==="); System.out.println("\n=== Results ===");
System.out.println("Total auctions found: " + allAuctions.size()); System.out.println("Total auctions found: " + allAuctions.size());
System.out.println("Dutch auctions (NL): " + dutchAuctions.size()); System.out.println("Dutch auctions (NL): " + dutchAuctions.size());
System.out.println("Actual page visits: " + extractor.pageVisitCount);
// Display first 10 Dutch auctions // Display first 10 Dutch auctions
System.out.println("\n=== Sample Dutch Auctions ==="); System.out.println("\n=== Sample Dutch Auctions ===");

View File

@@ -0,0 +1,20 @@
# SLF4J Simple Logger Configuration (settings for the slf4j-simple logger)
# Default log level for every logger (trace, debug, info, warn, error, off)
org.slf4j.simpleLogger.defaultLogLevel=warn
# Prefix each message with a timestamp in the format given below
org.slf4j.simpleLogger.showDateTime=true
org.slf4j.simpleLogger.dateTimeFormat=yyyy-MM-dd HH:mm:ss
# Do not print the thread name
org.slf4j.simpleLogger.showThreadName=false
# Do not print the full logger name ...
org.slf4j.simpleLogger.showLogName=false
# ... print only the short logger name instead
org.slf4j.simpleLogger.showShortLogName=true
# Per-logger levels; both currently equal the default level (warn) set above
org.slf4j.simpleLogger.log.com.microsoft.playwright=warn
org.slf4j.simpleLogger.log.org.sqlite=warn