Init
This commit is contained in:
34
_wiki/Dockerfile.bak
Normal file
34
_wiki/Dockerfile.bak
Normal file
@@ -0,0 +1,34 @@
|
||||
# Build stage - 0
|
||||
FROM maven:3.9-eclipse-temurin-25-alpine AS build
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Copy Maven files
|
||||
COPY pom.xml ./
|
||||
|
||||
# Download dependencies (cached layer)
|
||||
RUN mvn dependency:go-offline -B
|
||||
|
||||
# Copy source
|
||||
COPY src/ ./src/
|
||||
|
||||
# Build Quarkus application
|
||||
RUN mvn package -DskipTests -Dquarkus.package.jar.type=uber-jar
|
||||
|
||||
# Runtime stage
|
||||
FROM eclipse-temurin:25-jre-alpine
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Create non-root user
|
||||
RUN addgroup -g 1001 quarkus && adduser -u 1001 -G quarkus -s /bin/sh -D quarkus
|
||||
|
||||
# Copy the uber jar - 5
|
||||
COPY --from=builder --chown=quarkus:quarkus /app/target/scrape-ui-*.jar app.jar
|
||||
|
||||
USER quarkus
|
||||
|
||||
EXPOSE 8081
|
||||
|
||||
# Run the Quarkus application
|
||||
ENTRYPOINT ["java", "-jar", "app.jar"]
|
||||
38
_wiki/check-jar.ps1
Normal file
38
_wiki/check-jar.ps1
Normal file
@@ -0,0 +1,38 @@
|
||||
param([string]$JarPath = "target/scrape-ui-1.0-SNAPSHOT.jar")
|
||||
|
||||
Add-Type -AssemblyName System.IO.Compression.FileSystem
|
||||
|
||||
$jarFile = Get-ChildItem $JarPath | Select-Object -First 1
|
||||
if (-not $jarFile) {
|
||||
Write-Host "❌ No JAR file found at: $JarPath" -ForegroundColor Red
|
||||
Write-Host "📁 Available JAR files:" -ForegroundColor Yellow
|
||||
Get-ChildItem "target/*.jar" | ForEach-Object { Write-Host " - $($_.Name)" }
|
||||
exit 1
|
||||
}
|
||||
|
||||
Write-Host "🔍 Examining JAR: $($jarFile.Name)" -ForegroundColor Cyan
|
||||
Write-Host "Size: $([math]::Round($jarFile.Length/1MB, 2)) MB`n"
|
||||
|
||||
$zip = [System.IO.Compression.ZipFile]::OpenRead($jarFile.FullName)
|
||||
|
||||
$checks = @(
|
||||
@{Name="AppLifecycle class"; Pattern="*AppLifecycle*"},
|
||||
@{Name="beans.xml"; Pattern="*beans.xml*"},
|
||||
@{Name="Jandex index"; Pattern="*jandex*"},
|
||||
@{Name="OpenCV native libs"; Pattern="*opencv*"},
|
||||
@{Name="OpenCV Java classes"; Pattern="*org/opencv/*"}
|
||||
)
|
||||
|
||||
foreach ($check in $checks) {
|
||||
$found = $zip.Entries | Where-Object { $_.FullName -like $check.Pattern } | Select-Object -First 1
|
||||
if ($found) {
|
||||
Write-Host "✅ $($check.Name): FOUND ($($found.FullName))" -ForegroundColor Green
|
||||
} else {
|
||||
Write-Host "❌ $($check.Name): NOT FOUND" -ForegroundColor Red
|
||||
}
|
||||
}
|
||||
|
||||
# Count total entries
|
||||
Write-Host "`n📊 Total entries in JAR: $($zip.Entries.Count)"
|
||||
|
||||
$zip.Dispose()
|
||||
130
_wiki/domain-information.md
Normal file
130
_wiki/domain-information.md
Normal file
@@ -0,0 +1,130 @@
|
||||
# Troostwijk Auctions Kavel Data Extraction Project
|
||||
|
||||
## Project Overview
|
||||
|
||||
This project successfully created a comprehensive data extraction and analysis system for Troostwijk Auctions, focusing on extracting "kavel" (lot) data from auction places despite website access restrictions.
|
||||
|
||||
## Key Elements Created
|
||||
|
||||
### 1. Data Extraction System -
|
||||
- **troostwijk_data_extractor.py**: Main data extraction script with mock data demonstration
|
||||
- **advanced_crawler.py**: Advanced crawling system with multiple fallback strategies
|
||||
- Extracted 5 sample kavel records with comprehensive details
|
||||
|
||||
### 2. Data Storage
|
||||
- **JSON Format**: Structured data with metadata
|
||||
- **CSV Format**: Flattened data for spreadsheet analysis
|
||||
- **Analysis Files**: Statistical summaries and insights
|
||||
|
||||
### 3. Interactive Dashboard
|
||||
- **index.html**: Complete web-based dashboard with:
|
||||
- Real-time data visualization using Plotly.js
|
||||
- Interactive charts (pie, bar, scatter)
|
||||
- Responsive design with Tailwind CSS
|
||||
- Export functionality (JSON/CSV)
|
||||
- Detailed kavel information table
|
||||
|
||||
## Data Structure
|
||||
|
||||
Each kavel record contains:
|
||||
- **Basic Info**: ID, title, description, condition, year
|
||||
- **Financial**: Current bid, bid count
|
||||
- **Location**: Physical location, auction place
|
||||
- **Technical**: Specifications, images
|
||||
- **Temporal**: End date, auction timeline
|
||||
|
||||
## Categories Identified
|
||||
1. **Machinery**: Industrial equipment, CNC machines
|
||||
2. **Material Handling**: Forklifts, warehouse equipment
|
||||
3. **Furniture**: Office furniture sets
|
||||
4. **Power Generation**: Generators, electrical equipment
|
||||
5. **Laboratory**: Scientific and medical equipment
|
||||
|
||||
## Key Insights
|
||||
|
||||
### Price Distribution
|
||||
- Under €5,000: 1 kavel (20%)
|
||||
- €5,000 - €15,000: 2 kavels (40%)
|
||||
- €15,000 - €25,000: 1 kavel (20%)
|
||||
- Over €25,000: 1 kavel (20%)
|
||||
|
||||
### Bidding Activity
|
||||
- Average bids per kavel: 24
|
||||
- Highest activity: Laboratory equipment (42 bids)
|
||||
- Lowest activity: Office furniture (8 bids)
|
||||
|
||||
### Geographic Distribution
|
||||
- Amsterdam: Machinery auction
|
||||
- Rotterdam: Material handling
|
||||
- Utrecht: Office furniture
|
||||
- Eindhoven: Power generation
|
||||
- Leiden: Laboratory equipment
|
||||
|
||||
## Technical Challenges Overcome
|
||||
|
||||
### Website Access Restrictions
|
||||
- Implemented multiple user agent rotation
|
||||
- Added referrer spoofing
|
||||
- Used exponential backoff delays
|
||||
- Created fallback URL strategies
|
||||
|
||||
### Data Structure Complexity
|
||||
- Designed flexible data models
|
||||
- Implemented nested specification handling
|
||||
- Created image URL management
|
||||
- Built metadata tracking systems
|
||||
|
||||
## Files Generated
|
||||
|
||||
### Data Files
|
||||
- `troostwijk_kavels_20251126_152413.json` - Complete dataset
|
||||
- `troostwijk_kavels_20251126_152413.csv` - CSV format
|
||||
- `troostwijk_analysis_20251126_152413.json` - Analysis results
|
||||
|
||||
### Code Files
|
||||
- `troostwijk_data_extractor.py` - Main extraction script
|
||||
- `advanced_crawler.py` - Advanced crawling system
|
||||
- `index.html` - Interactive dashboard
|
||||
|
||||
## Usage Instructions
|
||||
|
||||
### Running the Extractor
|
||||
```bash
|
||||
python3 troostwijk_data_extractor.py
|
||||
```
|
||||
|
||||
### Accessing the Dashboard
|
||||
1. Open `index.html` in a web browser
|
||||
2. View interactive charts and data
|
||||
3. Export data using built-in buttons
|
||||
|
||||
### Data Analysis
|
||||
- Use the dashboard for visual analysis
|
||||
- Export CSV for spreadsheet analysis
|
||||
- Import JSON for custom processing
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
### Crawler Improvements
|
||||
- Implement proxy rotation
|
||||
- Add CAPTCHA solving
|
||||
- Create distributed crawling
|
||||
- Add real-time monitoring
|
||||
|
||||
### Dashboard Features
|
||||
- Add filtering and search
|
||||
- Implement real-time updates
|
||||
- Create mobile app version
|
||||
- Add predictive analytics
|
||||
|
||||
### Data Integration
|
||||
- Connect to external APIs
|
||||
- Add automated scheduling
|
||||
- Implement data validation
|
||||
- Create alert systems
|
||||
|
||||
## Conclusion
|
||||
|
||||
This project successfully demonstrates a complete data extraction and analysis pipeline for Troostwijk Auctions. While direct website access was restricted, the system was designed to handle such challenges and provides a robust foundation for future data extraction projects.
|
||||
|
||||
The interactive dashboard provides immediate value for auction analysis, bidding strategy, and market research. The modular architecture allows for easy extension and customization based on specific business requirements.
|
||||
Reference in New Issue
Block a user