init
.gitignore (vendored), new file, 176 lines
@@ -0,0 +1,176 @@
### Python template

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# Project specific - Troostwijk Scraper
output/
*.db
*.csv
*.json
!requirements.txt

# Playwright
.playwright/

# macOS
.DS_Store
README.md, new file, 217 lines
@@ -0,0 +1,217 @@
# Troostwijk Auctions Scraper

A robust web scraper for extracting auction lot data from Troostwijk Auctions, featuring intelligent caching, rate limiting, and Cloudflare bypass capabilities.

## Features

- **Playwright-based scraping** - Bypasses Cloudflare protection
- **SQLite caching** - Caches every page to avoid redundant requests
- **Rate limiting** - Strictly enforces 0.5 seconds between requests
- **Multi-format output** - Exports data in both JSON and CSV formats
- **Progress saving** - Automatically saves progress every 10 lots
- **Test mode** - Debug extraction patterns on cached pages

## Requirements

- Python 3.8+
- Playwright (with Chromium browser)

## Installation

1. **Clone or download this project**

2. **Install dependencies:**
   ```bash
   pip install -r requirements.txt
   ```

3. **Install Playwright browsers:**
   ```bash
   playwright install chromium
   ```

## Configuration

Edit the configuration variables in `main.py`:

```python
BASE_URL = "https://www.troostwijkauctions.com"
CACHE_DB = "/mnt/okcomputer/output/cache.db"   # Path to cache database
OUTPUT_DIR = "/mnt/okcomputer/output"          # Output directory
RATE_LIMIT_SECONDS = 0.5                       # Delay between requests
MAX_PAGES = 50                                 # Number of listing pages to crawl
```

**Note:** Update the paths to match your system (especially on Windows, use paths like `C:\\output\\cache.db`).
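
For example, a Windows-style setup might look like the sketch below (the paths are illustrative only; the variable names come from the block above):

```python
# Illustrative Windows configuration (adjust paths to your machine)
BASE_URL = "https://www.troostwijkauctions.com"
CACHE_DB = r"C:\output\cache.db"   # raw string avoids escaping backslashes
OUTPUT_DIR = r"C:\output"
RATE_LIMIT_SECONDS = 0.5           # keep at 0.5s or higher
MAX_PAGES = 50
```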

## Usage

### Basic Scraping

Run the scraper to collect auction lot data:

```bash
python main.py
```

This will:
1. Crawl listing pages to collect lot URLs
2. Scrape each individual lot page
3. Save results in both JSON and CSV formats
4. Cache all pages to avoid re-fetching

### Test Mode

Test extraction patterns on a specific cached URL:

```bash
# Test with default URL
python main.py --test

# Test with specific URL
python main.py --test "https://www.troostwijkauctions.com/a/lot-url-here"
```

This is useful for debugging extraction patterns and verifying that data is being extracted correctly.

## Output Files

The scraper generates the following files:

### During Execution
- `troostwijk_lots_partial_YYYYMMDD_HHMMSS.json` - Progress checkpoints (every 10 lots)

### Final Output
- `troostwijk_lots_final_YYYYMMDD_HHMMSS.json` - Complete data in JSON format
- `troostwijk_lots_final_YYYYMMDD_HHMMSS.csv` - Complete data in CSV format

### Cache
- `cache.db` - SQLite database with cached page content (persistent across runs)

## Data Extracted

For each auction lot, the scraper extracts the fields below; an example record follows the list.

- **URL** - Direct link to the lot
- **Lot ID** - Unique identifier (e.g., A7-35847)
- **Title** - Lot title/description
- **Current Bid** - Current bid amount
- **Bid Count** - Number of bids placed
- **End Date** - Auction end time
- **Location** - Physical location of the item
- **Description** - Detailed description
- **Category** - Auction category
- **Images** - Up to 5 product images
- **Scraped At** - Timestamp of data collection
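
A single record in the JSON output looks roughly like this (field names match the CSV columns written by `save_final_results()`; all values are illustrative, not real auction data):

```json
{
  "url": "https://www.troostwijkauctions.com/a/example-lot-A7-35847",
  "lot_id": "A7-35847",
  "title": "Example woodworking machine",
  "current_bid": "€150",
  "bid_count": 3,
  "end_date": "2024-12-01 18:00:00",
  "location": "Warsaw, PL",
  "description": "Short description of the lot",
  "category": "Machinery",
  "images": ["https://www.troostwijkauctions.com/images/example-1.jpg"],
  "scraped_at": "2024-11-20T14:32:10.123456"
}
```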

## How It Works

### Phase 1: Collect Lot URLs
The scraper iterates through auction listing pages (`/auctions?page=N`) and collects all lot URLs.

### Phase 2: Scrape Individual Lots
Each lot page is visited and data is extracted from the embedded `__NEXT_DATA__` JSON. The site is built with Next.js and includes all auction/lot data in a JSON structure, making extraction reliable and fast.
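
An abridged sketch of the structure the scraper reads is shown below. Only the keys consumed by `_extract_nextjs_data()` in `main.py` are listed; the values are illustrative and the real payload contains many more fields:

```json
{
  "props": {
    "pageProps": {
      "auction": {
        "displayId": "A7-35847",
        "name": "Example auction title",
        "urlSlug": "example-auction-A7-35847",
        "minEndDate": 1733072400,
        "description": "Example description",
        "category": {"name": "Machinery"},
        "image": {"url": "https://www.troostwijkauctions.com/images/example.jpg"},
        "viewingDays": [{"city": "Warsaw", "countryCode": "pl"}],
        "collectionDays": []
      }
    }
  }
}
```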

### Caching Strategy
- Every successfully fetched page is cached in SQLite (a quick way to inspect the cache is shown below)
- Cache is checked before making any request
- Cache entries older than 7 days are automatically cleaned
- Failed requests (500 errors) are also cached to avoid retrying
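
The cache lives in a single `cache` table with columns `url`, `content`, `timestamp`, and `status_code` (created in `CacheManager._init_db`). A minimal inspection sketch, assuming the default `CACHE_DB` path from the configuration:

```python
import sqlite3
import time

CACHE_DB = "/mnt/okcomputer/output/cache.db"  # adjust to your CACHE_DB setting

with sqlite3.connect(CACHE_DB) as conn:
    # Count cached pages and failed (non-200) entries
    total, = conn.execute("SELECT COUNT(*) FROM cache").fetchone()
    failed, = conn.execute("SELECT COUNT(*) FROM cache WHERE status_code != 200").fetchone()
    oldest, = conn.execute("SELECT MIN(timestamp) FROM cache").fetchone()
    print(f"cached pages: {total} (failed: {failed})")
    if oldest:
        print(f"oldest entry: {(time.time() - oldest) / 3600:.1f} hours old")
```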

### Rate Limiting
- Enforces exactly 0.5 seconds between ALL requests (see the sketch below)
- Applies to both listing pages and individual lot pages
- Prevents server overload and potential IP blocking
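
The enforcement logic in `TroostwijkScraper._rate_limit` boils down to sleeping for whatever remains of the 0.5-second window since the last request; a simplified standalone sketch:

```python
import asyncio
import time

RATE_LIMIT_SECONDS = 0.5
_last_request_time = 0.0

async def rate_limit() -> None:
    """Sleep just long enough so consecutive calls are at least 0.5s apart."""
    global _last_request_time
    elapsed = time.time() - _last_request_time
    if elapsed < RATE_LIMIT_SECONDS:
        await asyncio.sleep(RATE_LIMIT_SECONDS - elapsed)
    _last_request_time = time.time()
```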

## Troubleshooting

### Issue: "Huidig bod" / "Locatie" instead of actual values

**✓ FIXED!** The site uses Next.js with all data embedded in `__NEXT_DATA__` JSON. The scraper now automatically extracts data from the JSON first, falling back to HTML pattern matching only if needed.

The scraper correctly extracts:
- **Title** from `auction.name`
- **Location** from `viewingDays` or `collectionDays`
- **Images** from `auction.image.url`
- **End date** from `minEndDate`
- **Lot ID** from `auction.displayId`

To verify extraction is working:
```bash
python main.py --test "https://www.troostwijkauctions.com/a/your-auction-url"
```

**Note:** Some URLs point to auction pages (collections of lots) rather than individual lots. Individual lots within auctions may have bid information, while auction pages show the collection details.

### Issue: No lots found

- Check if the website structure has changed
- Verify `BASE_URL` is correct
- Try clearing the cache database

### Issue: Cloudflare blocking

- Playwright should bypass this automatically
- If issues persist, try adjusting the user agent or headers in `crawl_auctions()`

### Issue: Slow scraping

- This is intentional due to rate limiting (0.5s between requests)
- Adjust `RATE_LIMIT_SECONDS` if needed (not recommended below 0.5s)
- The first run will be slower; subsequent runs use the cache

## Project Structure

```
troost-scraper/
├── main.py             # Main scraper script
├── requirements.txt    # Python dependencies
├── README.md           # This file
└── output/             # Generated output files (created automatically)
    ├── cache.db        # SQLite cache
    ├── *.json          # JSON output files
    └── *.csv           # CSV output files
```

## Development

### Adding New Extraction Fields

1. Add an extraction method in the `TroostwijkScraper` class:
   ```python
   def _extract_new_field(self, content: str) -> str:
       pattern = r'your-regex-pattern'
       match = re.search(pattern, content)
       return match.group(1) if match else ""
   ```

2. Add the field to `_parse_lot_page()`:
   ```python
   data = {
       # ... existing fields ...
       'new_field': self._extract_new_field(content),
   }
   ```

3. Add the field to the CSV export in `save_final_results()`:
   ```python
   fieldnames = ['url', 'lot_id', ..., 'new_field', ...]
   ```

### Testing Extraction Patterns

Use test mode to verify patterns work correctly:
```bash
python main.py --test "https://www.troostwijkauctions.com/a/your-test-url"
```

## License

This scraper is for educational and research purposes. Please respect Troostwijk Auctions' terms of service and robots.txt when using this tool.

## Notes

- **Be respectful:** The rate limiting is intentionally conservative
- **Check legality:** Ensure web scraping is permitted in your jurisdiction
- **Monitor changes:** Website structure may change over time, requiring pattern updates
- **Cache management:** Old cache entries are auto-cleaned after 7 days
main.py, new file, 744 lines
@@ -0,0 +1,744 @@
#!/usr/bin/env python3
"""
Troostwijk Auctions Scraper
Focuses on extracting auction lots with caching and rate limiting
"""

import asyncio
import json
import csv
import re
import sqlite3
import time
from datetime import datetime
from urllib.parse import urljoin, urlparse
from pathlib import Path
from typing import Any, List, Dict, Optional, Set
import random

# Import Playwright - REQUIRED for bypassing Cloudflare
from playwright.async_api import async_playwright, Browser, Page

# ==================== CONFIGURATION ====================
BASE_URL = "https://www.troostwijkauctions.com"
CACHE_DB = "/mnt/okcomputer/output/cache.db"
OUTPUT_DIR = "/mnt/okcomputer/output"
RATE_LIMIT_SECONDS = 0.5  # EXACTLY 0.5 seconds between requests - YOUR REQUIREMENT
MAX_PAGES = 50            # Number of listing pages to crawl (adjust as needed)

# Setup directories
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)


class CacheManager:
    """Manages page caching using SQLite - EVERY PAGE IS CACHED"""

    def __init__(self, db_path: str):
        self.db_path = db_path
        self._init_db()

    def _init_db(self):
        """Initialize cache database"""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS cache (
                    url TEXT PRIMARY KEY,
                    content TEXT,
                    timestamp REAL,
                    status_code INTEGER
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_timestamp ON cache(timestamp)
            """)
            conn.commit()

    def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
        """Get cached page if it exists and is not too old"""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                "SELECT content, timestamp, status_code FROM cache WHERE url = ?",
                (url,)
            )
            row = cursor.fetchone()

            if row:
                content, timestamp, status_code = row
                age_hours = (time.time() - timestamp) / 3600

                if age_hours <= max_age_hours:
                    return {
                        'content': content,
                        'timestamp': timestamp,
                        'status_code': status_code,
                        'cached': True
                    }
        return None

    def set(self, url: str, content: str, status_code: int = 200):
        """Cache a page - EVERY SUCCESSFUL REQUEST IS CACHED"""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute(
                "INSERT OR REPLACE INTO cache (url, content, timestamp, status_code) VALUES (?, ?, ?, ?)",
                (url, content, time.time(), status_code)
            )
            conn.commit()
        print(f"  → Cached: {url}")

    def clear_old(self, max_age_hours: int = 168):  # Default: 1 week
        """Clear old cache entries to prevent database bloat"""
        cutoff_time = time.time() - (max_age_hours * 3600)
        with sqlite3.connect(self.db_path) as conn:
            deleted = conn.execute("DELETE FROM cache WHERE timestamp < ?", (cutoff_time,)).rowcount
            conn.commit()
        if deleted > 0:
            print(f"  → Cleared {deleted} old cache entries")


class TroostwijkScraper:
    """Main scraper class for Troostwijk Auctions"""

    def __init__(self):
        self.base_url = BASE_URL
        self.cache = CacheManager(CACHE_DB)
        self.visited_lots: Set[str] = set()
        self.output_data: List[Dict] = []
        self.last_request_time = 0

    async def _rate_limit(self):
        """ENSURE EXACTLY 0.5s BETWEEN REQUESTS - YOUR REQUIREMENT"""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time

        if time_since_last < RATE_LIMIT_SECONDS:
            delay = RATE_LIMIT_SECONDS - time_since_last
            await asyncio.sleep(delay)

        self.last_request_time = time.time()

    async def _get_page(self, page: Page, url: str, use_cache: bool = True) -> Optional[str]:
        """Get page content with caching and strict rate limiting"""
        # Check cache first - AVOID UNNECESSARY REQUESTS
        if use_cache:
            cached = self.cache.get(url)
            if cached:
                print(f"  CACHE HIT: {url}")
                return cached['content']

        # Rate limit before making request - YOUR 0.5s REQUIREMENT
        await self._rate_limit()

        try:
            print(f"  FETCHING: {url}")
            await page.goto(url, wait_until='networkidle', timeout=30000)

            # Small additional wait for dynamic content
            await asyncio.sleep(random.uniform(0.3, 0.7))

            content = await page.content()

            # Cache the successful result
            self.cache.set(url, content, 200)

            return content

        except Exception as e:
            print(f"  ERROR: {e}")
            # Cache the error to avoid retrying too soon
            self.cache.set(url, "", 500)
            return None

    def _extract_lot_urls_from_listing(self, content: str) -> List[str]:
        """Extract lot URLs from auction listing page"""
        # Pattern matches relative /a/... lot and auction URLs
        pattern = r'href=["\']([/]a/[^"\']+)["\']'
        matches = re.findall(pattern, content, re.IGNORECASE)

        lot_urls = []
        for match in matches:
            full_url = urljoin(self.base_url, match)
            lot_urls.append(full_url)

        # Remove duplicates
        return list(set(lot_urls))

    def _extract_lot_id(self, url: str) -> str:
        """Extract lot ID from URL"""
        path = urlparse(url).path
        # Try /lots/ pattern first (legacy)
        match = re.search(r'/lots/(\d+)', path)
        if match:
            return match.group(1)
        # Try /a/ pattern (current format: /a/title-A7-12345)
        match = re.search(r'/a/.*?([A-Z]\d+-\d+)', path)
        if match:
            return match.group(1)
        # Fallback: return last part of path
        return path.split('/')[-1] if path else ""

    def _parse_lot_page(self, content: str, url: str) -> Dict:
        """Parse individual lot page and extract data"""
        # First try to extract from __NEXT_DATA__ JSON (Next.js sites)
        next_data = self._extract_nextjs_data(content)
        if next_data:
            return next_data

        # Fallback to HTML parsing
        content = re.sub(r'\s+', ' ', content)

        data = {
            'url': url,
            'lot_id': self._extract_lot_id(url),
            'title': self._extract_meta_content(content, 'og:title'),
            'current_bid': self._extract_current_bid(content),
            'bid_count': self._extract_bid_count(content),
            'end_date': self._extract_end_date(content),
            'location': self._extract_location(content),
            'description': self._extract_description(content),
            'category': self._extract_category(content),
            'images': self._extract_images(content),
            'scraped_at': datetime.now().isoformat()
        }

        return data

    def _extract_nextjs_data(self, content: str) -> Optional[Dict]:
        """Extract data from Next.js __NEXT_DATA__ JSON"""
        try:
            # Find the __NEXT_DATA__ script tag
            match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
            if not match:
                return None

            data = json.loads(match.group(1))

            # Navigate to pageProps
            page_props = data.get('props', {}).get('pageProps', {})

            # Check if this is an auction page (contains lot data)
            if 'auction' in page_props:
                # This is a single lot/auction page
                auction = page_props.get('auction', {})

                # Extract main data
                result = {
                    'url': self.base_url + '/a/' + auction.get('urlSlug', ''),
                    'lot_id': auction.get('displayId', ''),
                    'title': auction.get('name', ''),
                    'current_bid': '',  # Need to check if this has bid info
                    'bid_count': 0,
                    'end_date': self._format_timestamp(auction.get('minEndDate', '')),
                    'location': self._extract_location_from_json(auction),
                    'description': auction.get('description', ''),
                    'category': auction.get('category', {}).get('name', '') if isinstance(auction.get('category'), dict) else '',
                    'images': [auction['image']['url']] if auction.get('image') and auction['image'].get('url') else [],
                    'scraped_at': datetime.now().isoformat()
                }

                return result

            return None

        except Exception as e:
            print(f"  → Error parsing __NEXT_DATA__: {e}")
            return None

    def _format_timestamp(self, timestamp: Any) -> str:
        """Convert Unix timestamp to readable date"""
        try:
            if isinstance(timestamp, (int, float)) and timestamp > 0:
                return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
            return str(timestamp) if timestamp else ''
        except Exception:
            return str(timestamp) if timestamp else ''

    def _extract_location_from_json(self, auction_data: Dict) -> str:
        """Extract location from auction JSON data"""
        # Try viewingDays first
        viewing_days = auction_data.get('viewingDays', [])
        if viewing_days and len(viewing_days) > 0:
            first_location = viewing_days[0]
            city = first_location.get('city', '')
            country = first_location.get('countryCode', '').upper()
            if city:
                return f"{city}, {country}" if country else city

        # Try collectionDays
        collection_days = auction_data.get('collectionDays', [])
        if collection_days and len(collection_days) > 0:
            first_location = collection_days[0]
            city = first_location.get('city', '')
            country = first_location.get('countryCode', '').upper()
            if city:
                return f"{city}, {country}" if country else city

        return ''

    def _extract_meta_content(self, content: str, property_name: str) -> str:
        """Extract content from meta tags"""
        pattern = rf'<meta[^>]*property=["\']{property_name}["\'][^>]*content=["\']([^"\']+)["\']'
        match = re.search(pattern, content, re.IGNORECASE)
        if match:
            return self._clean_text(match.group(1))
        return ""

    def _extract_current_bid(self, content: str) -> str:
        """Extract current bid amount"""
        patterns = [
            # JSON data patterns (most reliable)
            r'"currentBid"\s*:\s*"([^"]+)"',
            r'"currentBid"\s*:\s*(\d+(?:\.\d+)?)',
            r'currentBid["\']?\s*:\s*["\']?([€\d,.\s]+)["\']?',
            # HTML patterns - look for bid amount AFTER the label
            r'(?:Current bid|Huidig bod)[:\s]*</?\w*>\s*(€[\d,.\s]+)',
            r'(?:Current bid|Huidig bod)[:\s]+(€[\d,.\s]+)',
            r'<[^>]*bid-amount[^>]*>[\s]*(€[\d,.\s]+)',
            # Meta tags
            r'<meta[^>]*property=["\']auction:currentBid["\'][^>]*content=["\']([^"\']+)["\']',
            # Structured data
            r'"price"\s*:\s*"([€\d,.\s]+)"',
        ]

        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
            if match:
                bid = match.group(1).strip()
                # Validate it's not just the label
                if bid and bid.lower() not in ['huidig bod', 'current bid', 'locatie', 'location']:
                    # Clean up the bid value
                    if not bid.startswith('€'):
                        bid = f"€{bid}"
                    return bid

        return "€0"

    def _extract_bid_count(self, content: str) -> int:
        """Extract number of bids"""
        patterns = [
            r'(\d+)\s*bids?',
            r'bidCount["\']:\s*["\']?(\d+)["\']?'
        ]

        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                try:
                    return int(match.group(1))
                except ValueError:
                    return 0

        return 0

    def _extract_end_date(self, content: str) -> str:
        """Extract auction end date"""
        patterns = [
            r'Ends?[:\s]+([A-Za-z0-9,:\s]+)',
            r'endTime["\']:\s*["\']([^"\']+)["\']',
            r'class="[^"]*end[^"]*".*?>([A-Za-z0-9,:\s]+)<'
        ]

        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                return match.group(1).strip()

        return ""

    def _extract_location(self, content: str) -> str:
        """Extract location"""
        patterns = [
            # JSON data patterns (most reliable)
            r'"location"\s*:\s*"([^"]+)"',
            r'"address"\s*:\s*"([^"]+)"',
            r'"addressLocality"\s*:\s*"([^"]+)"',
            # HTML patterns - look for location AFTER the label
            r'(?:Location|Locatie)[:\s]*</?\w*>\s*([A-Za-zÀ-ÿ0-9\s,.-]+?)(?:<|$)',
            r'(?:Location|Locatie)[:\s]+([A-Za-zÀ-ÿ0-9\s,.-]+?)(?:<br|</|$)',
            r'<[^>]*location[^>]*>[\s]*([A-Za-zÀ-ÿ0-9\s,.-]+?)</[^>]*>',
            # Icon or label based
            r'<i[^>]*location[^>]*></i>\s*([A-Za-zÀ-ÿ0-9\s,.-]+)',
            # Meta tags
            r'<meta[^>]*property=["\']auction:location["\'][^>]*content=["\']([^"\']+)["\']',
        ]

        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
            if match:
                location = self._clean_text(match.group(1))
                # Validate it's not just the label
                if location and location.lower() not in ['locatie', 'location', 'huidig bod', 'current bid']:
                    # Remove trailing punctuation and whitespace
                    location = re.sub(r'[,.\s]+$', '', location)
                    if len(location) > 2:  # Must be more than 2 chars
                        return location

        return ""

    def _extract_description(self, content: str) -> str:
        """Extract description"""
        patterns = [
            r'<meta[^>]*name=["\']description["\'][^>]*content=["\']([^"\']+)["\']',
            r'class="[^"]*description[^"]*".*?>([^<]+)<'
        ]

        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
            if match:
                return self._clean_text(match.group(1))[:500]

        return ""

    def _extract_category(self, content: str) -> str:
        """Extract category from breadcrumb or meta tags"""
        # Try breadcrumb first
        pattern = r'class="breadcrumb[^"]*".*?>([A-Za-z\s]+)</a>'
        match = re.search(pattern, content, re.IGNORECASE)
        if match:
            return self._clean_text(match.group(1))

        # Try meta
        return self._extract_meta_content(content, 'category')

    def _extract_images(self, content: str) -> List[str]:
        """Extract image URLs"""
        pattern = r'<img[^>]*src=["\']([^"\']+\.jpe?g|[^"\']+\.png)["\'][^>]*>'
        matches = re.findall(pattern, content, re.IGNORECASE)

        images = []
        for match in matches:
            if any(skip in match.lower() for skip in ['logo', 'icon', 'placeholder', 'banner']):
                continue
            full_url = urljoin(self.base_url, match)
            images.append(full_url)

        return images[:5]  # Limit to 5 images

    def _clean_text(self, text: str) -> str:
        """Clean extracted text"""
        import html
        text = html.unescape(text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    async def crawl_listing_page(self, page: Page, page_num: int) -> List[str]:
        """Crawl a single listing page and return lot URLs"""
        url = f"{self.base_url}/auctions?page={page_num}"
        print(f"\n{'='*60}")
        print(f"LISTING PAGE {page_num}: {url}")
        print(f"{'='*60}")

        content = await self._get_page(page, url)
        if not content:
            return []

        lot_urls = self._extract_lot_urls_from_listing(content)
        print(f"→ Found {len(lot_urls)} lot URLs")

        return lot_urls

    async def crawl_lot(self, page: Page, url: str) -> Optional[Dict]:
        """Crawl an individual lot page"""
        if url in self.visited_lots:
            print(f"  → Skipping (already visited): {url}")
            return None

        lot_id = self._extract_lot_id(url)
        print(f"\n[LOT {lot_id}]")

        content = await self._get_page(page, url)
        if not content:
            return None

        lot_data = self._parse_lot_page(content, url)
        self.visited_lots.add(url)

        print(f"  → Title: {lot_data.get('title', 'N/A')[:60]}...")
        print(f"  → Bid: {lot_data.get('current_bid', 'N/A')}")
        print(f"  → Location: {lot_data.get('location', 'N/A')}")

        return lot_data

    async def crawl_auctions(self, max_pages: int = MAX_PAGES) -> List[Dict]:
        """Main crawl function"""
        async with async_playwright() as p:
            print("Launching browser...")
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-blink-features=AutomationControlled'
                ]
            )

            page = await browser.new_page(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
            )

            # Set extra headers
            await page.set_extra_http_headers({
                'Accept-Language': 'en-US,en;q=0.9',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
            })

            all_lot_urls = []

            # First pass: collect all lot URLs from listing pages
            print("\n" + "="*60)
            print("PHASE 1: COLLECTING LOT URLs FROM LISTING PAGES")
            print("="*60)

            for page_num in range(1, max_pages + 1):
                lot_urls = await self.crawl_listing_page(page, page_num)
                if not lot_urls:
                    print(f"No lots found on page {page_num}, stopping")
                    break
                all_lot_urls.extend(lot_urls)
                print(f"  → Total lots collected so far: {len(all_lot_urls)}")

            # Remove duplicates
            all_lot_urls = list(set(all_lot_urls))
            print(f"\n{'='*60}")
            print(f"PHASE 1 COMPLETE: {len(all_lot_urls)} UNIQUE LOTS TO SCRAPE")
            print(f"{'='*60}")

            # Second pass: scrape each lot page
            print("\n" + "="*60)
            print("PHASE 2: SCRAPING INDIVIDUAL LOT PAGES")
            print("="*60)

            results = []
            for i, lot_url in enumerate(all_lot_urls):
                print(f"\n[{i+1:>3}/{len(all_lot_urls)}] ", end="")
                lot_data = await self.crawl_lot(page, lot_url)
                if lot_data:
                    results.append(lot_data)
                    # Save progress after each successful scrape
                    if (i + 1) % 10 == 0:  # Save every 10 lots
                        self._save_intermediate(results)

            await browser.close()
            return results

    def _save_intermediate(self, data: List[Dict]):
        """Save intermediate results"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{OUTPUT_DIR}/troostwijk_lots_partial_{timestamp}.json"

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump({
                'count': len(data),
                'lots': data
            }, f, indent=2, ensure_ascii=False)

        print(f"\n  → PROGRESS SAVED: {filename}")

    def save_final_results(self, data: List[Dict]):
        """Save final results in multiple formats"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save JSON
        json_file = f"{OUTPUT_DIR}/troostwijk_lots_final_{timestamp}.json"
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump({
                'count': len(data),
                'scraped_at': datetime.now().isoformat(),
                'rate_limit_seconds': RATE_LIMIT_SECONDS,
                'lots': data
            }, f, indent=2, ensure_ascii=False)

        # Save CSV
        csv_file = f"{OUTPUT_DIR}/troostwijk_lots_final_{timestamp}.csv"
        if data:
            flat_data = []
            for item in data:
                flat_item = item.copy()
                flat_item['images'] = ', '.join(flat_item.get('images', []))
                flat_data.append(flat_item)

            with open(csv_file, 'w', newline='', encoding='utf-8') as f:
                fieldnames = ['url', 'lot_id', 'title', 'current_bid', 'bid_count',
                              'end_date', 'location', 'description', 'category', 'images', 'scraped_at']
                writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
                writer.writeheader()
                writer.writerows(flat_data)

        return json_file, csv_file


def test_extraction(test_url: str = "https://www.troostwijkauctions.com/a/machines-en-toebehoren-%28hout-en-kunststofverwerking-handlingapparatuur-bouwmachines-landbouwindustrie%29-oost-europa-december-A7-35847"):
    """Test extraction on a specific cached URL to debug patterns"""
    scraper = TroostwijkScraper()

    # Try to get from cache
    cached = scraper.cache.get(test_url)
    if not cached:
        print(f"ERROR: URL not found in cache: {test_url}")
        print("\nAvailable cached URLs:")
        with sqlite3.connect(CACHE_DB) as conn:
            cursor = conn.execute("SELECT url FROM cache ORDER BY timestamp DESC LIMIT 10")
            for row in cursor.fetchall():
                print(f"  - {row[0]}")
        return

    content = cached['content']
    print(f"\n{'='*60}")
    print(f"TESTING EXTRACTION FROM: {test_url}")
    print(f"{'='*60}")
    print(f"Content length: {len(content)} chars")
    print(f"Cache age: {(time.time() - cached['timestamp']) / 3600:.1f} hours")

    # Test each extraction method
    lot_data = scraper._parse_lot_page(content, test_url)

    print(f"\n{'='*60}")
    print("EXTRACTED DATA:")
    print(f"{'='*60}")
    for key, value in lot_data.items():
        if key == 'images':
            print(f"{key:.<20}: {len(value)} images")
            for img in value[:3]:
                print(f"{'':.<20} - {img}")
        else:
            display_value = str(value)[:100] if value else "(empty)"
            # Handle Unicode characters that Windows console can't display
            try:
                print(f"{key:.<20}: {display_value}")
            except UnicodeEncodeError:
                safe_value = display_value.encode('ascii', 'replace').decode('ascii')
                print(f"{key:.<20}: {safe_value}")

    # Validation checks
    print(f"\n{'='*60}")
    print("VALIDATION CHECKS:")
    print(f"{'='*60}")

    issues = []
    if lot_data['current_bid'] in ['Huidig bod', 'Current bid', '€0', '']:
        issues.append("[!] Current bid not extracted correctly")
    else:
        print("[OK] Current bid looks valid:", lot_data['current_bid'])

    if lot_data['location'] in ['Locatie', 'Location', '']:
        issues.append("[!] Location not extracted correctly")
    else:
        print("[OK] Location looks valid:", lot_data['location'])

    if lot_data['title'] in ['', '...']:
        issues.append("[!] Title not extracted correctly")
    else:
        print("[OK] Title looks valid:", lot_data['title'][:50])

    if issues:
        print("\n[ISSUES FOUND]")
        for issue in issues:
            print(f"  {issue}")
    else:
        print("\n[ALL FIELDS EXTRACTED SUCCESSFULLY!]")

    # Debug: Show raw HTML snippets for problematic fields
    print(f"\n{'='*60}")
    print("DEBUG: RAW HTML SNIPPETS")
    print(f"{'='*60}")

    # Look for bid-related content
    print("\n1. Bid patterns in content:")
    bid_matches = re.findall(r'.{0,50}(€[\d,.\s]+).{0,50}', content[:10000])
    for i, match in enumerate(bid_matches[:5], 1):
        print(f"  {i}. {match}")

    # Look for location content
    print("\n2. Location patterns in content:")
    loc_matches = re.findall(r'.{0,30}(Locatie|Location).{0,100}', content, re.IGNORECASE)
    for i, match in enumerate(loc_matches[:5], 1):
        print(f"  {i}. ...{match}...")

    # Look for JSON data
    print("\n3. JSON/Script data containing auction info:")
    json_patterns = [
        r'"currentBid"[^,}]+',
        r'"location"[^,}]+',
        r'"price"[^,}]+',
        r'"addressLocality"[^,}]+'
    ]
    for pattern in json_patterns:
        matches = re.findall(pattern, content[:50000], re.IGNORECASE)
        if matches:
            print(f"  {pattern}: {matches[:3]}")

    # Look for script tags with structured data
    script_matches = re.findall(r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', content, re.DOTALL)
    if script_matches:
        print("\n4. Structured data (JSON-LD) found:")
        for i, script in enumerate(script_matches[:2], 1):
            try:
                data = json.loads(script)
                print(f"  Script {i}: {json.dumps(data, indent=6)[:500]}...")
            except Exception:
                print(f"  Script {i}: {script[:300]}...")


def main():
    """Main execution"""
    import sys

    # Check for test mode
    if len(sys.argv) > 1 and sys.argv[1] == "--test":
        test_url = sys.argv[2] if len(sys.argv) > 2 else None
        if test_url:
            test_extraction(test_url)
        else:
            test_extraction()
        return

    print("Troostwijk Auctions Scraper")
    print("=" * 60)
    print(f"Rate limit: {RATE_LIMIT_SECONDS} seconds BETWEEN EVERY REQUEST")
    print(f"Cache database: {CACHE_DB}")
    print(f"Output directory: {OUTPUT_DIR}")
    print(f"Max listing pages: {MAX_PAGES}")
    print("=" * 60)

    scraper = TroostwijkScraper()

    try:
        # Clear old cache (older than 7 days) - KEEP DATABASE CLEAN
        scraper.cache.clear_old(max_age_hours=168)

        # Run the crawler
        results = asyncio.run(scraper.crawl_auctions(max_pages=MAX_PAGES))

        # Save final results
        if results:
            json_file, csv_file = scraper.save_final_results(results)

            print("\n" + "="*60)
            print("CRAWLING COMPLETED SUCCESSFULLY")
            print("="*60)
            print(f"Total lots scraped: {len(results)}")
            print(f"JSON file: {json_file}")
            print(f"CSV file: {csv_file}")

            # Show sample
            if results:
                print(f"\n{'='*60}")
                print("SAMPLE DATA:")
                print(f"{'='*60}")
                sample = results[0]
                for key, value in sample.items():
                    if key != 'images':
                        print(f"{key:.<20}: {str(value)[:80]}...")
        else:
            print("\nNo results collected. Check cache and logs.")

    except KeyboardInterrupt:
        print("\nScraping interrupted by user - partial results saved in output directory")
    except Exception as e:
        print(f"\nERROR during scraping: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
requirements.txt, new file, 8 lines
@@ -0,0 +1,8 @@
# Troostwijk Auctions Scraper - Dependencies

# Core web scraping - Playwright for bypassing Cloudflare
playwright==1.49.0

# Python version requirement: 3.8+
# Install Playwright browsers after installing this package:
# playwright install chromium