start
This commit is contained in:
5
.idea/compiler.xml
generated
5
.idea/compiler.xml
generated
@@ -10,4 +10,9 @@
|
||||
</profile>
|
||||
</annotationProcessing>
|
||||
</component>
|
||||
<component name="JavacSettings">
|
||||
<option name="ADDITIONAL_OPTIONS_OVERRIDE">
|
||||
<module name="troostwijk-scraper" options="--enable-native-access=ALL-UNNAMED" />
|
||||
</option>
|
||||
</component>
|
||||
</project>
|
||||
24
.idea/dataSources.xml
generated
24
.idea/dataSources.xml
generated
@@ -61,5 +61,29 @@
|
||||
</library>
|
||||
</libraries>
|
||||
</data-source>
|
||||
<data-source source="LOCAL" name="page_cache" uuid="9ac1bd58-f7d6-4a86-a165-f0a24a2b7cb9">
|
||||
<driver-ref>sqlite.xerial</driver-ref>
|
||||
<synchronize>true</synchronize>
|
||||
<jdbc-driver>org.sqlite.JDBC</jdbc-driver>
|
||||
<jdbc-url>jdbc:sqlite:$PROJECT_DIR$/cache/page_cache.db</jdbc-url>
|
||||
<working-dir>$ProjectFileDir$</working-dir>
|
||||
<libraries>
|
||||
<library>
|
||||
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
|
||||
</library>
|
||||
<library>
|
||||
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
|
||||
</library>
|
||||
<library>
|
||||
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
|
||||
</library>
|
||||
<library>
|
||||
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
|
||||
</library>
|
||||
<library>
|
||||
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.43.0/org/xerial/sqlite-jdbc/3.43.0.0/sqlite-jdbc-3.43.0.0.jar</url>
|
||||
</library>
|
||||
</libraries>
|
||||
</data-source>
|
||||
</component>
|
||||
</project>
|
||||
164
RUN_INSTRUCTIONS.md
Normal file
164
RUN_INSTRUCTIONS.md
Normal file
@@ -0,0 +1,164 @@
|
||||
# Troostwijk Auction Extractor - Run Instructions
|
||||
|
||||
## Fixed Warnings
|
||||
|
||||
All warnings have been resolved:
|
||||
- ✅ SLF4J logging configured (slf4j-simple)
|
||||
- ✅ Native access enabled for SQLite JDBC
|
||||
- ✅ Logging output controlled via simplelogger.properties
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. **Java 21** installed
|
||||
2. **Maven** installed
|
||||
3. **IntelliJ IDEA** (recommended) or command line
|
||||
|
||||
## Setup (First Time Only)
|
||||
|
||||
### 1. Install Dependencies
|
||||
|
||||
In IntelliJ Terminal or PowerShell:
|
||||
|
||||
```bash
|
||||
# Build the project and download Maven dependencies
|
||||
mvn clean install
|
||||
|
||||
# Install Playwright browser binaries (first time only)
|
||||
mvn exec:java -e -Dexec.mainClass=com.microsoft.playwright.CLI -Dexec.args="install"
|
||||
```
|
||||
|
||||
## Running the Application
|
||||
|
||||
### Option A: Using IntelliJ IDEA (Easiest)
|
||||
|
||||
1. **Add VM Options for native access:**
|
||||
- Run → Edit Configurations
|
||||
- Select or create configuration for `TroostwijkAuctionExtractor`
|
||||
- In "VM options" field, add:
|
||||
```
|
||||
--enable-native-access=ALL-UNNAMED
|
||||
```
|
||||
|
||||
2. **Add Program Arguments (optional):**
|
||||
- In "Program arguments" field, add:
|
||||
```
|
||||
--max-visits 3
|
||||
```
|
||||
|
||||
3. **Run the application:**
|
||||
- Click the green Run button
|
||||
|
||||
### Option B: Using Maven (Command Line)
|
||||
|
||||
```bash
|
||||
# Run with the default 3-page visit limit (configured in pom.xml)
|
||||
mvn exec:java
|
||||
|
||||
# Run with custom arguments (override pom.xml defaults)
|
||||
mvn exec:java -Dexec.args="--max-visits 5"
|
||||
|
||||
# Run without cache
|
||||
mvn exec:java -Dexec.args="--no-cache --max-visits 2"
|
||||
|
||||
# Run with unlimited visits
|
||||
mvn exec:java -Dexec.args="--max-visits 0"
|
||||
```
|
||||
|
||||
### Option C: Using Java Directly
|
||||
|
||||
```bash
|
||||
# Compile first
|
||||
mvn clean compile
|
||||
|
||||
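# Run with native access enabled (Unix-like shell syntax; on Windows use ';' as the classpath separator and adapt the line continuations)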
|
||||
java --enable-native-access=ALL-UNNAMED \
|
||||
-cp target/classes:$(mvn dependency:build-classpath -Dmdep.outputFile=/dev/stdout -q) \
|
||||
com.auction.TroostwijkAuctionExtractor --max-visits 3
|
||||
```
|
||||
|
||||
## Command Line Arguments
|
||||
|
||||
```
|
||||
--max-visits <n> Limit actual page fetches to n (0 = unlimited, default)
|
||||
--no-cache Disable page caching
|
||||
--help Show help message
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
### Test with 3 page visits (cached pages don't count):
|
||||
```bash
|
||||
mvn exec:java -Dexec.args="--max-visits 3"
|
||||
```
|
||||
|
||||
### Fresh extraction without cache:
|
||||
```bash
|
||||
mvn exec:java -Dexec.args="--no-cache --max-visits 5"
|
||||
```
|
||||
|
||||
### Full extraction (all pages, unlimited):
|
||||
```bash
|
||||
mvn exec:java -Dexec.args="--max-visits 0"
|
||||
```
|
||||
|
||||
## Expected Output (No Warnings)
|
||||
|
||||
```
|
||||
=== Troostwijk Auction Extractor ===
|
||||
Max page visits set to: 3
|
||||
|
||||
Initializing Playwright browser...
|
||||
✓ Browser ready
|
||||
✓ Cache database initialized
|
||||
|
||||
Starting auction extraction from https://www.troostwijkauctions.com/auctions
|
||||
|
||||
[Page 1] Fetching auctions...
|
||||
✓ Fetched from website (visit 1/3)
|
||||
✓ Found 20 auctions
|
||||
|
||||
[Page 2] Fetching auctions...
|
||||
✓ Loaded from cache
|
||||
✓ Found 20 auctions
|
||||
|
||||
[Page 3] Fetching auctions...
|
||||
✓ Fetched from website (visit 2/3)
|
||||
✓ Found 20 auctions
|
||||
|
||||
✓ Total auctions extracted: 60
|
||||
|
||||
=== Results ===
|
||||
Total auctions found: 60
|
||||
Dutch auctions (NL): 45
|
||||
Actual page visits: 2
|
||||
|
||||
✓ Browser and cache closed
|
||||
```
|
||||
|
||||
## Cache Management
|
||||
|
||||
- Cache is stored in: `cache/page_cache.db`
|
||||
- Cache expires after: 24 hours (configurable in code)
|
||||
- To clear cache: Delete `cache/page_cache.db` file
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### If you still see warnings:
|
||||
|
||||
1. **Reload Maven project in IntelliJ:**
|
||||
- Right-click `pom.xml` → Maven → Reload project
|
||||
|
||||
2. **Verify VM options:**
|
||||
- Ensure `--enable-native-access=ALL-UNNAMED` is in VM options
|
||||
|
||||
3. **Clean and rebuild:**
|
||||
```bash
|
||||
mvn clean install
|
||||
```
|
||||
|
||||
### If Playwright fails:
|
||||
|
||||
```bash
|
||||
# Reinstall browser binaries
|
||||
mvn exec:java -e -Dexec.mainClass=com.microsoft.playwright.CLI -Dexec.args="install chromium"
|
||||
```
|
||||
38
pom.xml
38
pom.xml
@@ -60,6 +60,18 @@
|
||||
<artifactId>playwright</artifactId>
|
||||
<version>1.40.0</version>
|
||||
</dependency>
|
||||
|
||||
<!-- SLF4J API and implementation for logging -->
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
<version>2.0.9</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-simple</artifactId>
|
||||
<version>2.0.9</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
@@ -72,6 +84,32 @@
|
||||
<configuration>
|
||||
<source>21</source>
|
||||
<target>21</target>
|
||||
<compilerArgs>
|
||||
<arg>--enable-native-access=ALL-UNNAMED</arg>
|
||||
</compilerArgs>
|
||||
</configuration>
|
||||
</plugin>
|
||||
|
||||
<!-- Maven Exec Plugin for running with native access -->
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>exec-maven-plugin</artifactId>
|
||||
<version>3.1.0</version>
|
||||
<configuration>
|
||||
<mainClass>com.auction.TroostwijkAuctionExtractor</mainClass>
|
||||
<arguments>
|
||||
<argument>--max-visits</argument>
|
||||
<argument>3</argument>
|
||||
</arguments>
|
||||
<systemProperties>
|
||||
<systemProperty>
|
||||
<key>java.util.logging.SimpleFormatter.format</key>
|
||||
<value>%1$tF %1$tT %4$s %2$s %5$s%6$s%n</value>
|
||||
</systemProperty>
|
||||
</systemProperties>
|
||||
<additionalOptions>
|
||||
<additionalOption>--enable-native-access=ALL-UNNAMED</additionalOption>
|
||||
</additionalOptions>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<!-- <plugin>
|
||||
|
||||
@@ -53,6 +53,8 @@ public class TroostwijkAuctionExtractor {
|
||||
private final ObjectMapper objectMapper;
|
||||
private final boolean useCache;
|
||||
private final CacheDatabase cacheDb;
|
||||
private final int maxPageVisits; // Maximum number of pages to fetch (0 = unlimited)
|
||||
private int pageVisitCount; // Counter for actual page fetches (not from cache)
|
||||
private Playwright playwright;
|
||||
private Browser browser;
|
||||
|
||||
@@ -77,10 +79,13 @@ public class TroostwijkAuctionExtractor {
|
||||
* Constructor
|
||||
*
|
||||
* @param useCache Enable database caching of visited pages
|
||||
* @param maxPageVisits Maximum number of actual page fetches (0 = unlimited)
|
||||
*/
|
||||
public TroostwijkAuctionExtractor(boolean useCache) throws SQLException, IOException {
|
||||
public TroostwijkAuctionExtractor(boolean useCache, int maxPageVisits) throws SQLException, IOException {
|
||||
this.objectMapper = new ObjectMapper();
|
||||
this.useCache = useCache;
|
||||
this.maxPageVisits = maxPageVisits;
|
||||
this.pageVisitCount = 0;
|
||||
this.cacheDb = useCache ? new CacheDatabase(CACHE_DB_PATH) : null;
|
||||
|
||||
if (useCache) {
|
||||
@@ -88,6 +93,15 @@ public class TroostwijkAuctionExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor with default unlimited page visits
|
||||
*
|
||||
* @param useCache Enable database caching of visited pages
|
||||
*/
|
||||
public TroostwijkAuctionExtractor(boolean useCache) throws SQLException, IOException {
|
||||
this(useCache, 0); // 0 = unlimited
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes Playwright and browser instance
|
||||
* Call this before extracting auctions
|
||||
@@ -145,14 +159,24 @@ public class TroostwijkAuctionExtractor {
|
||||
System.out.println(" ✓ Loaded from cache");
|
||||
html = cachedHtml;
|
||||
} else {
|
||||
// Check if we've reached the maximum page visit limit
|
||||
if (maxPageVisits > 0 && pageVisitCount >= maxPageVisits) {
|
||||
System.out.println(" ⚠️ Reached maximum page visit limit (" + maxPageVisits + "), stopping");
|
||||
break;
|
||||
}
|
||||
|
||||
// Fetch with Playwright
|
||||
html = fetchPageWithPlaywright(pageNumber);
|
||||
pageVisitCount++; // Increment actual page fetch counter
|
||||
|
||||
if (html == null || html.isEmpty()) {
|
||||
System.out.println(" ⚠️ Failed to fetch page, stopping pagination");
|
||||
break;
|
||||
}
|
||||
|
||||
System.out.println(" ✓ Fetched from website (visit " + pageVisitCount +
|
||||
(maxPageVisits > 0 ? "/" + maxPageVisits : "") + ")");
|
||||
|
||||
// Save to cache
|
||||
if (useCache) {
|
||||
saveToCache(pageNumber, html);
|
||||
@@ -371,13 +395,41 @@ public class TroostwijkAuctionExtractor {
|
||||
|
||||
/**
|
||||
* Entry point for testing
|
||||
*
|
||||
* Arguments:
|
||||
* --max-visits <number> : Maximum number of page visits (0 = unlimited, default)
|
||||
* --no-cache : Disable caching
|
||||
*/
|
||||
public static void main(String[] args) throws Exception {
|
||||
System.out.println("=== Troostwijk Auction Extractor ===\n");
|
||||
|
||||
// Enable caching by default
|
||||
// Parse command line arguments
|
||||
boolean useCache = true;
|
||||
TroostwijkAuctionExtractor extractor = new TroostwijkAuctionExtractor(useCache);
|
||||
int maxVisits = 0; // 0 = unlimited
|
||||
|
||||
for (int i = 0; i < args.length; i++) {
|
||||
switch (args[i]) {
|
||||
case "--max-visits":
|
||||
if (i + 1 < args.length) {
|
||||
maxVisits = Integer.parseInt(args[++i]);
|
||||
System.out.println("Max page visits set to: " + maxVisits);
|
||||
}
|
||||
break;
|
||||
case "--no-cache":
|
||||
useCache = false;
|
||||
System.out.println("Caching disabled");
|
||||
break;
|
||||
case "--help":
|
||||
System.out.println("Usage: java TroostwijkAuctionExtractor [options]");
|
||||
System.out.println("Options:");
|
||||
System.out.println(" --max-visits <n> : Limit actual page fetches to n (0 = unlimited)");
|
||||
System.out.println(" --no-cache : Disable page caching");
|
||||
System.out.println(" --help : Show this help message");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
TroostwijkAuctionExtractor extractor = new TroostwijkAuctionExtractor(useCache, maxVisits);
|
||||
|
||||
try {
|
||||
// Initialize browser
|
||||
@@ -392,6 +444,7 @@ public class TroostwijkAuctionExtractor {
|
||||
System.out.println("\n=== Results ===");
|
||||
System.out.println("Total auctions found: " + allAuctions.size());
|
||||
System.out.println("Dutch auctions (NL): " + dutchAuctions.size());
|
||||
System.out.println("Actual page visits: " + extractor.pageVisitCount);
|
||||
|
||||
// Display first 10 Dutch auctions
|
||||
System.out.println("\n=== Sample Dutch Auctions ===");
|
||||
|
||||
20
src/main/resources/simplelogger.properties
Normal file
20
src/main/resources/simplelogger.properties
Normal file
@@ -0,0 +1,20 @@
|
||||
# SLF4J Simple Logger Configuration
|
||||
# Set default log level (trace, debug, info, warn, error, off)
|
||||
org.slf4j.simpleLogger.defaultLogLevel=warn
|
||||
|
||||
# Show date/time in logs
|
||||
org.slf4j.simpleLogger.showDateTime=true
|
||||
org.slf4j.simpleLogger.dateTimeFormat=yyyy-MM-dd HH:mm:ss
|
||||
|
||||
# Hide the thread name in log output
|
||||
org.slf4j.simpleLogger.showThreadName=false
|
||||
|
||||
# Hide the full logger name (the short name is shown instead, see below)
|
||||
org.slf4j.simpleLogger.showLogName=false
|
||||
|
||||
# Show short log name
|
||||
org.slf4j.simpleLogger.showShortLogName=true
|
||||
|
||||
# Set specific logger levels
|
||||
org.slf4j.simpleLogger.log.com.microsoft.playwright=warn
|
||||
org.slf4j.simpleLogger.log.org.sqlite=warn
|
||||
Reference in New Issue
Block a user