init

Tour
2025-12-03 11:44:11 +01:00
commit 8b71d5e113
4 changed files with 1145 additions and 0 deletions

176
.gitignore vendored Normal file

@@ -0,0 +1,176 @@
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
# Project specific - Troostwijk Scraper
output/
*.db
*.csv
*.json
!requirements.txt
# Playwright
.playwright/
# macOS
.DS_Store

217
README.md Normal file

@@ -0,0 +1,217 @@
# Troostwijk Auctions Scraper
A robust web scraper for extracting auction lot data from Troostwijk Auctions, featuring intelligent caching, rate limiting, and Cloudflare bypass capabilities.
## Features
- **Playwright-based scraping** - Bypasses Cloudflare protection
- **SQLite caching** - Caches every page to avoid redundant requests
- **Rate limiting** - Enforces a minimum delay of 0.5 seconds between requests
- **Multi-format output** - Exports data in both JSON and CSV formats
- **Progress saving** - Automatically saves progress every 10 lots
- **Test mode** - Debug extraction patterns on cached pages
## Requirements
- Python 3.8+
- Playwright (with Chromium browser)
## Installation
1. **Clone or download this project**
2. **Install dependencies:**
```bash
pip install -r requirements.txt
```
3. **Install Playwright browsers:**
```bash
playwright install chromium
```
## Configuration
Edit the configuration variables in `main.py`:
```python
BASE_URL = "https://www.troostwijkauctions.com"
CACHE_DB = "/mnt/okcomputer/output/cache.db" # Path to cache database
OUTPUT_DIR = "/mnt/okcomputer/output" # Output directory
RATE_LIMIT_SECONDS = 0.5 # Delay between requests
MAX_PAGES = 50 # Number of listing pages to crawl
```
**Note:** Update the paths to match your system (especially on Windows, use paths like `C:\\output\\cache.db`).
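For example, on a Windows machine the same block might look like the following; the exact paths and the reduced `MAX_PAGES` are only placeholders for illustration:
```python
BASE_URL = "https://www.troostwijkauctions.com"
CACHE_DB = r"C:\output\cache.db"   # raw string avoids escaping backslashes
OUTPUT_DIR = r"C:\output"
RATE_LIMIT_SECONDS = 0.5           # keep at 0.5s or higher
MAX_PAGES = 10                     # smaller value for a quick trial run
```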
## Usage
### Basic Scraping
Run the scraper to collect auction lot data:
```bash
python main.py
```
This will:
1. Crawl listing pages to collect lot URLs
2. Scrape each individual lot page
3. Save results in both JSON and CSV formats
4. Cache all pages to avoid re-fetching
### Test Mode
Test extraction patterns on a specific cached URL:
```bash
# Test with default URL
python main.py --test
# Test with specific URL
python main.py --test "https://www.troostwijkauctions.com/a/lot-url-here"
```
This is useful for debugging extraction patterns and verifying data is being extracted correctly.
## Output Files
The scraper generates the following files:
### During Execution
- `troostwijk_lots_partial_YYYYMMDD_HHMMSS.json` - Progress checkpoints (every 10 lots)
### Final Output
- `troostwijk_lots_final_YYYYMMDD_HHMMSS.json` - Complete data in JSON format
- `troostwijk_lots_final_YYYYMMDD_HHMMSS.csv` - Complete data in CSV format
### Cache
- `cache.db` - SQLite database with cached page content (persistent across runs)
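To see what is currently cached, you can query the `cache` table directly with Python's built-in `sqlite3` module. A minimal sketch, assuming the database sits at `output/cache.db` and using the columns created by `CacheManager` (`url`, `timestamp`, `status_code`):
```python
import sqlite3
import time

with sqlite3.connect("output/cache.db") as conn:  # adjust to your CACHE_DB path
    rows = conn.execute(
        "SELECT url, timestamp, status_code FROM cache ORDER BY timestamp DESC LIMIT 5"
    ).fetchall()

for url, ts, status in rows:
    age_hours = (time.time() - ts) / 3600
    print(f"{status}  {age_hours:5.1f}h  {url}")
```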
## Data Extracted
For each auction lot, the scraper extracts:
- **URL** - Direct link to the lot
- **Lot ID** - Unique identifier (e.g., A7-35847)
- **Title** - Lot title/description
- **Current Bid** - Current bid amount
- **Bid Count** - Number of bids placed
- **End Date** - Auction end time
- **Location** - Physical location of the item
- **Description** - Detailed description
- **Category** - Auction category
- **Images** - Up to 5 product images
- **Scraped At** - Timestamp of data collection
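Each lot becomes one JSON object (and one CSV row) with these fields. A purely illustrative record, with placeholder values rather than real auction data:
```python
example_lot = {
    "url": "https://www.troostwijkauctions.com/a/example-lot-A1-12345",  # placeholder
    "lot_id": "A1-12345",
    "title": "Example woodworking machine",
    "current_bid": "€100",
    "bid_count": 3,
    "end_date": "2025-12-10 18:00:00",
    "location": "Amsterdam, NL",
    "description": "Short description of the item...",
    "category": "Machinery",
    "images": ["https://example.com/image1.jpg"],
    "scraped_at": "2025-12-03T11:44:11",
}
```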
## How It Works
### Phase 1: Collect Lot URLs
The scraper iterates through auction listing pages (`/auctions?page=N`) and collects all lot URLs.
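A condensed sketch of this step, using the same `href` pattern that `_extract_lot_urls_from_listing()` applies in `main.py`:
```python
import re
from typing import List
from urllib.parse import urljoin

def extract_lot_urls(html: str, base_url: str) -> List[str]:
    # Lot links on listing pages use the /a/<slug> URL format
    hrefs = re.findall(r'href=["\']([/]a/[^"\']+)["\']', html, re.IGNORECASE)
    return list({urljoin(base_url, h) for h in hrefs})
```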
### Phase 2: Scrape Individual Lots
Each lot page is visited and data is extracted from the embedded JSON data (`__NEXT_DATA__`). The site is built with Next.js and includes all auction/lot data in a JSON structure, making extraction reliable and fast.
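In outline, the extraction locates the `__NEXT_DATA__` script tag and reads fields from `props.pageProps.auction`. A trimmed-down sketch of what `_extract_nextjs_data()` does (only a few of the fields are shown here):
```python
import json
import re
from typing import Optional

def extract_next_data(html: str) -> Optional[dict]:
    # The Next.js payload is embedded as <script id="__NEXT_DATA__">...</script>
    m = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', html, re.DOTALL)
    if not m:
        return None
    payload = json.loads(m.group(1))
    auction = payload.get("props", {}).get("pageProps", {}).get("auction", {})
    if not auction:
        return None
    return {
        "lot_id": auction.get("displayId", ""),
        "title": auction.get("name", ""),
        "end_date": auction.get("minEndDate", ""),  # Unix timestamp in the raw JSON
    }
```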
### Caching Strategy
- Every successfully fetched page is cached in SQLite
- Cache is checked before making any request
- Cache entries older than 7 days are automatically cleaned
- Failed requests are cached with a 500 status code so they are not retried immediately
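Per URL, the flow in `_get_page()` looks roughly like the sketch below; `fetch_with_playwright` is a hypothetical stand-in for the actual Playwright call:
```python
from typing import Optional

def get_page(cache, url: str) -> Optional[str]:
    """Sketch of the cache-first flow; fetch_with_playwright is hypothetical."""
    cached = cache.get(url, max_age_hours=24)         # None if missing or too old
    if cached:
        return cached["content"]                      # served from cache, no request made
    html = fetch_with_playwright(url)                 # hypothetical helper for page.goto()
    cache.set(url, html or "", 200 if html else 500)  # failures cached too (status 500)
    return html
```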
### Rate Limiting
- Enforces a minimum of 0.5 seconds between all requests
- Applies to both listing pages and individual lot pages
- Prevents server overload and potential IP blocking
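The limiter simply remembers when the last request went out and sleeps for whatever is left of the interval; a minimal sketch mirroring `_rate_limit()`:
```python
import asyncio
import time

RATE_LIMIT_SECONDS = 0.5
_last_request = 0.0

async def rate_limit() -> None:
    """Sleep just long enough to keep at least RATE_LIMIT_SECONDS between requests."""
    global _last_request
    elapsed = time.time() - _last_request
    if elapsed < RATE_LIMIT_SECONDS:
        await asyncio.sleep(RATE_LIMIT_SECONDS - elapsed)
    _last_request = time.time()
```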
## Troubleshooting
### Issue: "Huidig bod" / "Locatie" instead of actual values
**✓ FIXED!** The site uses Next.js with all data embedded in `__NEXT_DATA__` JSON. The scraper now automatically extracts data from JSON first, falling back to HTML pattern matching only if needed.
The scraper correctly extracts:
- **Title** from `auction.name`
- **Location** from `viewingDays` or `collectionDays`
- **Images** from `auction.image.url`
- **End date** from `minEndDate`
- **Lot ID** from `auction.displayId`
To verify extraction is working:
```bash
python main.py --test "https://www.troostwijkauctions.com/a/your-auction-url"
```
**Note:** Some URLs point to auction pages (collections of lots) rather than individual lots. Individual lots within auctions may have bid information, while auction pages show the collection details.
### Issue: No lots found
- Check if the website structure has changed
- Verify `BASE_URL` is correct
- Try clearing the cache database
### Issue: Cloudflare blocking
- Playwright should bypass this automatically
- If issues persist, try adjusting user agent or headers in `crawl_auctions()`
### Issue: Slow scraping
- This is intentional due to rate limiting (0.5s between requests)
- Adjust `RATE_LIMIT_SECONDS` if needed (not recommended below 0.5s)
- First run will be slower; subsequent runs use cache
## Project Structure
```
troost-scraper/
├── main.py              # Main scraper script
├── requirements.txt     # Python dependencies
├── README.md            # This file
└── output/              # Generated output files (created automatically)
    ├── cache.db         # SQLite cache
    ├── *.json           # JSON output files
    └── *.csv            # CSV output files
```
## Development
### Adding New Extraction Fields
1. Add extraction method in `TroostwijkScraper` class:
```python
def _extract_new_field(self, content: str) -> str:
pattern = r'your-regex-pattern'
match = re.search(pattern, content)
return match.group(1) if match else ""
```
2. Add field to `_parse_lot_page()`:
```python
data = {
# ... existing fields ...
'new_field': self._extract_new_field(content),
}
```
3. Add field to CSV export in `save_final_results()`:
```python
fieldnames = ['url', 'lot_id', ..., 'new_field', ...]
```
### Testing Extraction Patterns
Use test mode to verify patterns work correctly:
```bash
python main.py --test "https://www.troostwijkauctions.com/a/your-test-url"
```
## License
This scraper is for educational and research purposes. Please respect Troostwijk Auctions' terms of service and robots.txt when using this tool.
## Notes
- **Be respectful:** The rate limiting is intentionally conservative
- **Check legality:** Ensure web scraping is permitted in your jurisdiction
- **Monitor changes:** Website structure may change over time, requiring pattern updates
- **Cache management:** Old cache entries are auto-cleaned after 7 days

744
main.py Normal file

@@ -0,0 +1,744 @@
#!/usr/bin/env python3
"""
Troostwijk Auctions Scraper
Focuses on extracting auction lots with caching and rate limiting
"""
import asyncio
import json
import csv
import re
import sqlite3
import time
from datetime import datetime
from urllib.parse import urljoin, urlparse
from pathlib import Path
from typing import List, Dict, Optional, Set
import random
# Import Playwright - REQUIRED for bypassing Cloudflare
from playwright.async_api import async_playwright, Browser, Page
# ==================== CONFIGURATION ====================
BASE_URL = "https://www.troostwijkauctions.com"
CACHE_DB = "/mnt/okcomputer/output/cache.db"
OUTPUT_DIR = "/mnt/okcomputer/output"
RATE_LIMIT_SECONDS = 0.5  # Minimum delay between requests
MAX_PAGES = 50 # Number of listing pages to crawl (adjust as needed)
# Setup directories
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

class CacheManager:
    """Manages page caching using SQLite - EVERY PAGE IS CACHED"""

    def __init__(self, db_path: str):
        self.db_path = db_path
        self._init_db()

    def _init_db(self):
        """Initialize cache database"""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS cache (
                    url TEXT PRIMARY KEY,
                    content TEXT,
                    timestamp REAL,
                    status_code INTEGER
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_timestamp ON cache(timestamp)
            """)
            conn.commit()

    def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
        """Get cached page if it exists and is not too old"""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                "SELECT content, timestamp, status_code FROM cache WHERE url = ?",
                (url,)
            )
            row = cursor.fetchone()
            if row:
                content, timestamp, status_code = row
                age_hours = (time.time() - timestamp) / 3600
                if age_hours <= max_age_hours:
                    return {
                        'content': content,
                        'timestamp': timestamp,
                        'status_code': status_code,
                        'cached': True
                    }
        return None

    def set(self, url: str, content: str, status_code: int = 200):
        """Cache a page - EVERY SUCCESSFUL REQUEST IS CACHED"""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute(
                "INSERT OR REPLACE INTO cache (url, content, timestamp, status_code) VALUES (?, ?, ?, ?)",
                (url, content, time.time(), status_code)
            )
            conn.commit()
        print(f" → Cached: {url}")

    def clear_old(self, max_age_hours: int = 168):  # Default: 1 week
        """Clear old cache entries to prevent database bloat"""
        cutoff_time = time.time() - (max_age_hours * 3600)
        with sqlite3.connect(self.db_path) as conn:
            deleted = conn.execute("DELETE FROM cache WHERE timestamp < ?", (cutoff_time,)).rowcount
            conn.commit()
            if deleted > 0:
                print(f" → Cleared {deleted} old cache entries")

class TroostwijkScraper:
    """Main scraper class for Troostwijk Auctions"""

    def __init__(self):
        self.base_url = BASE_URL
        self.cache = CacheManager(CACHE_DB)
        self.visited_lots: Set[str] = set()
        self.output_data: List[Dict] = []
        self.last_request_time = 0

    async def _rate_limit(self):
        """Ensure at least RATE_LIMIT_SECONDS elapse between consecutive requests"""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time
        if time_since_last < RATE_LIMIT_SECONDS:
            delay = RATE_LIMIT_SECONDS - time_since_last
            await asyncio.sleep(delay)
        self.last_request_time = time.time()

    async def _get_page(self, page: Page, url: str, use_cache: bool = True) -> Optional[str]:
        """Get page content with caching and strict rate limiting"""
        # Check cache first to avoid unnecessary requests
        if use_cache:
            cached = self.cache.get(url)
            if cached:
                print(f" CACHE HIT: {url}")
                return cached['content']
        # Rate limit before making the request
        await self._rate_limit()
        try:
            print(f" FETCHING: {url}")
            await page.goto(url, wait_until='networkidle', timeout=30000)
            # Small additional wait for dynamic content
            await asyncio.sleep(random.uniform(0.3, 0.7))
            content = await page.content()
            # Cache the successful result
            self.cache.set(url, content, 200)
            return content
        except Exception as e:
            print(f" ERROR: {e}")
            # Cache the error to avoid retrying too soon
            self.cache.set(url, "", 500)
            return None
    def _extract_lot_urls_from_listing(self, content: str) -> List[str]:
        """Extract lot URLs from an auction listing page"""
        # Lot links use the /a/<slug> URL format
        pattern = r'href=["\']([/]a/[^"\']+)["\']'
        matches = re.findall(pattern, content, re.IGNORECASE)
        lot_urls = []
        for match in matches:
            full_url = urljoin(self.base_url, match)
            lot_urls.append(full_url)
        # Remove duplicates
        return list(set(lot_urls))

    def _extract_lot_id(self, url: str) -> str:
        """Extract lot ID from URL"""
        path = urlparse(url).path
        # Try /lots/ pattern first (legacy)
        match = re.search(r'/lots/(\d+)', path)
        if match:
            return match.group(1)
        # Try /a/ pattern (current format: /a/title-A7-12345)
        match = re.search(r'/a/.*?([A-Z]\d+-\d+)', path)
        if match:
            return match.group(1)
        # Fallback: return last part of path
        return path.split('/')[-1] if path else ""
    def _parse_lot_page(self, content: str, url: str) -> Dict:
        """Parse individual lot page and extract data"""
        # First try to extract from __NEXT_DATA__ JSON (Next.js sites)
        next_data = self._extract_nextjs_data(content)
        if next_data:
            return next_data
        # Fallback to HTML parsing
        content = re.sub(r'\s+', ' ', content)
        data = {
            'url': url,
            'lot_id': self._extract_lot_id(url),
            'title': self._extract_meta_content(content, 'og:title'),
            'current_bid': self._extract_current_bid(content),
            'bid_count': self._extract_bid_count(content),
            'end_date': self._extract_end_date(content),
            'location': self._extract_location(content),
            'description': self._extract_description(content),
            'category': self._extract_category(content),
            'images': self._extract_images(content),
            'scraped_at': datetime.now().isoformat()
        }
        return data

    def _extract_nextjs_data(self, content: str) -> Optional[Dict]:
        """Extract data from Next.js __NEXT_DATA__ JSON"""
        try:
            # Find the __NEXT_DATA__ script tag
            match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
            if not match:
                return None
            data = json.loads(match.group(1))
            # Navigate to pageProps
            page_props = data.get('props', {}).get('pageProps', {})
            # Check if this is an auction page (contains lot data)
            if 'auction' in page_props:
                # This is a single lot/auction page
                auction = page_props.get('auction', {})
                # Extract main data
                result = {
                    'url': self.base_url + '/a/' + auction.get('urlSlug', ''),
                    'lot_id': auction.get('displayId', ''),
                    'title': auction.get('name', ''),
                    'current_bid': '',  # Need to check if this has bid info
                    'bid_count': 0,
                    'end_date': self._format_timestamp(auction.get('minEndDate', '')),
                    'location': self._extract_location_from_json(auction),
                    'description': auction.get('description', ''),
                    'category': auction.get('category', {}).get('name', '') if isinstance(auction.get('category'), dict) else '',
                    'images': [auction['image']['url']] if auction.get('image') and auction['image'].get('url') else [],
                    'scraped_at': datetime.now().isoformat()
                }
                return result
            return None
        except Exception as e:
            print(f" → Error parsing __NEXT_DATA__: {e}")
            return None

    def _format_timestamp(self, timestamp: any) -> str:
        """Convert Unix timestamp to readable date"""
        try:
            if isinstance(timestamp, (int, float)) and timestamp > 0:
                return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
            return str(timestamp) if timestamp else ''
        except:
            return str(timestamp) if timestamp else ''

    def _extract_location_from_json(self, auction_data: Dict) -> str:
        """Extract location from auction JSON data"""
        # Try viewingDays first
        viewing_days = auction_data.get('viewingDays', [])
        if viewing_days and len(viewing_days) > 0:
            first_location = viewing_days[0]
            city = first_location.get('city', '')
            country = first_location.get('countryCode', '').upper()
            if city:
                return f"{city}, {country}" if country else city
        # Try collectionDays
        collection_days = auction_data.get('collectionDays', [])
        if collection_days and len(collection_days) > 0:
            first_location = collection_days[0]
            city = first_location.get('city', '')
            country = first_location.get('countryCode', '').upper()
            if city:
                return f"{city}, {country}" if country else city
        return ''

    def _extract_meta_content(self, content: str, property_name: str) -> str:
        """Extract content from meta tags"""
        pattern = rf'<meta[^>]*property=["\']{property_name}["\'][^>]*content=["\']([^"\']+)["\']'
        match = re.search(pattern, content, re.IGNORECASE)
        if match:
            return self._clean_text(match.group(1))
        return ""
    def _extract_current_bid(self, content: str) -> str:
        """Extract current bid amount"""
        patterns = [
            # JSON data patterns (most reliable)
            r'"currentBid"\s*:\s*"([^"]+)"',
            r'"currentBid"\s*:\s*(\d+(?:\.\d+)?)',
            r'currentBid["\']?\s*:\s*["\']?([€\d,.\s]+)["\']?',
            # HTML patterns - look for bid amount AFTER the label
            r'(?:Current bid|Huidig bod)[:\s]*</?\w*>\s*(€[\d,.\s]+)',
            r'(?:Current bid|Huidig bod)[:\s]+(€[\d,.\s]+)',
            r'<[^>]*bid-amount[^>]*>[\s]*(€[\d,.\s]+)',
            # Meta tags
            r'<meta[^>]*property=["\']auction:currentBid["\'][^>]*content=["\']([^"\']+)["\']',
            # Structured data
            r'"price"\s*:\s*"([€\d,.\s]+)"',
        ]
        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
            if match:
                bid = match.group(1).strip()
                # Validate it's not just the label
                if bid and bid.lower() not in ['huidig bod', 'current bid', 'locatie', 'location']:
                    # Prefix with the euro sign if the match is a bare number
                    if not bid.startswith('€'):
                        bid = f"€{bid}"
                    return bid
        return "€0"
    def _extract_bid_count(self, content: str) -> int:
        """Extract number of bids"""
        patterns = [
            r'(\d+)\s*bids?',
            r'bidCount["\']:\s*["\']?(\d+)["\']?'
        ]
        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                try:
                    return int(match.group(1))
                except:
                    return 0
        return 0

    def _extract_end_date(self, content: str) -> str:
        """Extract auction end date"""
        patterns = [
            r'Ends?[:\s]+([A-Za-z0-9,:\s]+)',
            r'endTime["\']:\s*["\']([^"\']+)["\']',
            r'class="[^"]*end[^"]*".*?>([A-Za-z0-9,:\s]+)<'
        ]
        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                return match.group(1).strip()
        return ""

    def _extract_location(self, content: str) -> str:
        """Extract location"""
        patterns = [
            # JSON data patterns (most reliable)
            r'"location"\s*:\s*"([^"]+)"',
            r'"address"\s*:\s*"([^"]+)"',
            r'"addressLocality"\s*:\s*"([^"]+)"',
            # HTML patterns - look for location AFTER the label
            r'(?:Location|Locatie)[:\s]*</?\w*>\s*([A-Za-zÀ-ÿ0-9\s,.-]+?)(?:<|$)',
            r'(?:Location|Locatie)[:\s]+([A-Za-zÀ-ÿ0-9\s,.-]+?)(?:<br|</|$)',
            r'<[^>]*location[^>]*>[\s]*([A-Za-zÀ-ÿ0-9\s,.-]+?)</[^>]*>',
            # Icon or label based
            r'<i[^>]*location[^>]*></i>\s*([A-Za-zÀ-ÿ0-9\s,.-]+)',
            # Meta tags
            r'<meta[^>]*property=["\']auction:location["\'][^>]*content=["\']([^"\']+)["\']',
        ]
        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
            if match:
                location = self._clean_text(match.group(1))
                # Validate it's not just the label
                if location and location.lower() not in ['locatie', 'location', 'huidig bod', 'current bid']:
                    # Remove trailing punctuation and whitespace
                    location = re.sub(r'[,.\s]+$', '', location)
                    if len(location) > 2:  # Must be more than 2 chars
                        return location
        return ""

    def _extract_description(self, content: str) -> str:
        """Extract description"""
        patterns = [
            r'<meta[^>]*name=["\']description["\'][^>]*content=["\']([^"\']+)["\']',
            r'class="[^"]*description[^"]*".*?>([^<]+)<'
        ]
        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
            if match:
                return self._clean_text(match.group(1))[:500]
        return ""

    def _extract_category(self, content: str) -> str:
        """Extract category from breadcrumb or meta tags"""
        # Try breadcrumb first
        pattern = r'class="breadcrumb[^"]*".*?>([A-Za-z\s]+)</a>'
        match = re.search(pattern, content, re.IGNORECASE)
        if match:
            return self._clean_text(match.group(1))
        # Try meta
        return self._extract_meta_content(content, 'category')

    def _extract_images(self, content: str) -> List[str]:
        """Extract image URLs"""
        pattern = r'<img[^>]*src=["\']([^"\']+\.jpe?g|[^"\']+\.png)["\'][^>]*>'
        matches = re.findall(pattern, content, re.IGNORECASE)
        images = []
        for match in matches:
            if any(skip in match.lower() for skip in ['logo', 'icon', 'placeholder', 'banner']):
                continue
            full_url = urljoin(self.base_url, match)
            images.append(full_url)
        return images[:5]  # Limit to 5 images

    def _clean_text(self, text: str) -> str:
        """Clean extracted text"""
        import html
        text = html.unescape(text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
    async def crawl_listing_page(self, page: Page, page_num: int) -> List[str]:
        """Crawl a single listing page and return lot URLs"""
        url = f"{self.base_url}/auctions?page={page_num}"
        print(f"\n{'='*60}")
        print(f"LISTING PAGE {page_num}: {url}")
        print(f"{'='*60}")
        content = await self._get_page(page, url)
        if not content:
            return []
        lot_urls = self._extract_lot_urls_from_listing(content)
        print(f"→ Found {len(lot_urls)} lot URLs")
        return lot_urls

    async def crawl_lot(self, page: Page, url: str) -> Optional[Dict]:
        """Crawl an individual lot page"""
        if url in self.visited_lots:
            print(f" → Skipping (already visited): {url}")
            return None
        lot_id = self._extract_lot_id(url)
        print(f"\n[LOT {lot_id}]")
        content = await self._get_page(page, url)
        if not content:
            return None
        lot_data = self._parse_lot_page(content, url)
        self.visited_lots.add(url)
        print(f" → Title: {lot_data.get('title', 'N/A')[:60]}...")
        print(f" → Bid: {lot_data.get('current_bid', 'N/A')}")
        print(f" → Location: {lot_data.get('location', 'N/A')}")
        return lot_data

    async def crawl_auctions(self, max_pages: int = MAX_PAGES) -> List[Dict]:
        """Main crawl function"""
        async with async_playwright() as p:
            print("Launching browser...")
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-blink-features=AutomationControlled'
                ]
            )
            page = await browser.new_page(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
            )
            # Set extra headers
            await page.set_extra_http_headers({
                'Accept-Language': 'en-US,en;q=0.9',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
            })
            all_lot_urls = []
            # First pass: collect all lot URLs from listing pages
            print("\n" + "="*60)
            print("PHASE 1: COLLECTING LOT URLs FROM LISTING PAGES")
            print("="*60)
            for page_num in range(1, max_pages + 1):
                lot_urls = await self.crawl_listing_page(page, page_num)
                if not lot_urls:
                    print(f"No lots found on page {page_num}, stopping")
                    break
                all_lot_urls.extend(lot_urls)
                print(f" → Total lots collected so far: {len(all_lot_urls)}")
            # Remove duplicates
            all_lot_urls = list(set(all_lot_urls))
            print(f"\n{'='*60}")
            print(f"PHASE 1 COMPLETE: {len(all_lot_urls)} UNIQUE LOTS TO SCRAPE")
            print(f"{'='*60}")
            # Second pass: scrape each lot page
            print("\n" + "="*60)
            print("PHASE 2: SCRAPING INDIVIDUAL LOT PAGES")
            print("="*60)
            results = []
            for i, lot_url in enumerate(all_lot_urls):
                print(f"\n[{i+1:>3}/{len(all_lot_urls)}] ", end="")
                lot_data = await self.crawl_lot(page, lot_url)
                if lot_data:
                    results.append(lot_data)
                    # Save progress after each successful scrape
                    if (i + 1) % 10 == 0:  # Save every 10 lots
                        self._save_intermediate(results)
            await browser.close()
            return results

    def _save_intermediate(self, data: List[Dict]):
        """Save intermediate results"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{OUTPUT_DIR}/troostwijk_lots_partial_{timestamp}.json"
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump({
                'count': len(data),
                'lots': data
            }, f, indent=2, ensure_ascii=False)
        print(f"\n → PROGRESS SAVED: {filename}")

    def save_final_results(self, data: List[Dict]):
        """Save final results in multiple formats"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Save JSON
        json_file = f"{OUTPUT_DIR}/troostwijk_lots_final_{timestamp}.json"
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump({
                'count': len(data),
                'scraped_at': datetime.now().isoformat(),
                'rate_limit_seconds': RATE_LIMIT_SECONDS,
                'lots': data
            }, f, indent=2, ensure_ascii=False)
        # Save CSV
        csv_file = f"{OUTPUT_DIR}/troostwijk_lots_final_{timestamp}.csv"
        if data:
            flat_data = []
            for item in data:
                flat_item = item.copy()
                flat_item['images'] = ', '.join(flat_item.get('images', []))
                flat_data.append(flat_item)
            with open(csv_file, 'w', newline='', encoding='utf-8') as f:
                fieldnames = ['url', 'lot_id', 'title', 'current_bid', 'bid_count',
                              'end_date', 'location', 'description', 'category', 'images', 'scraped_at']
                writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
                writer.writeheader()
                writer.writerows(flat_data)
        return json_file, csv_file

def test_extraction(test_url: str = "https://www.troostwijkauctions.com/a/machines-en-toebehoren-%28hout-en-kunststofverwerking-handlingapparatuur-bouwmachines-landbouwindustrie%29-oost-europa-december-A7-35847"):
    """Test extraction on a specific cached URL to debug patterns"""
    scraper = TroostwijkScraper()
    # Try to get from cache
    cached = scraper.cache.get(test_url)
    if not cached:
        print(f"ERROR: URL not found in cache: {test_url}")
        print(f"\nAvailable cached URLs:")
        with sqlite3.connect(CACHE_DB) as conn:
            cursor = conn.execute("SELECT url FROM cache ORDER BY timestamp DESC LIMIT 10")
            for row in cursor.fetchall():
                print(f" - {row[0]}")
        return
    content = cached['content']
    print(f"\n{'='*60}")
    print(f"TESTING EXTRACTION FROM: {test_url}")
    print(f"{'='*60}")
    print(f"Content length: {len(content)} chars")
    print(f"Cache age: {(time.time() - cached['timestamp']) / 3600:.1f} hours")
    # Test each extraction method
    lot_data = scraper._parse_lot_page(content, test_url)
    print(f"\n{'='*60}")
    print("EXTRACTED DATA:")
    print(f"{'='*60}")
    for key, value in lot_data.items():
        if key == 'images':
            print(f"{key:.<20}: {len(value)} images")
            for img in value[:3]:
                print(f"{'':.<20} - {img}")
        else:
            display_value = str(value)[:100] if value else "(empty)"
            # Handle Unicode characters that Windows console can't display
            try:
                print(f"{key:.<20}: {display_value}")
            except UnicodeEncodeError:
                safe_value = display_value.encode('ascii', 'replace').decode('ascii')
                print(f"{key:.<20}: {safe_value}")
    # Validation checks
    print(f"\n{'='*60}")
    print("VALIDATION CHECKS:")
    print(f"{'='*60}")
    issues = []
    if lot_data['current_bid'] in ['Huidig bod', 'Current bid', '€0', '']:
        issues.append("[!] Current bid not extracted correctly")
    else:
        print("[OK] Current bid looks valid:", lot_data['current_bid'])
    if lot_data['location'] in ['Locatie', 'Location', '']:
        issues.append("[!] Location not extracted correctly")
    else:
        print("[OK] Location looks valid:", lot_data['location'])
    if lot_data['title'] in ['', '...']:
        issues.append("[!] Title not extracted correctly")
    else:
        print("[OK] Title looks valid:", lot_data['title'][:50])
    if issues:
        print(f"\n[ISSUES FOUND]")
        for issue in issues:
            print(f" {issue}")
    else:
        print(f"\n[ALL FIELDS EXTRACTED SUCCESSFULLY!]")
    # Debug: Show raw HTML snippets for problematic fields
    print(f"\n{'='*60}")
    print("DEBUG: RAW HTML SNIPPETS")
    print(f"{'='*60}")
    # Look for bid-related content
    print(f"\n1. Bid patterns in content:")
    bid_matches = re.findall(r'.{0,50}(€[\d,.\s]+).{0,50}', content[:10000])
    for i, match in enumerate(bid_matches[:5], 1):
        print(f" {i}. {match}")
    # Look for location content
    print(f"\n2. Location patterns in content:")
    loc_matches = re.findall(r'.{0,30}(Locatie|Location).{0,100}', content, re.IGNORECASE)
    for i, match in enumerate(loc_matches[:5], 1):
        print(f" {i}. ...{match}...")
    # Look for JSON data
    print(f"\n3. JSON/Script data containing auction info:")
    json_patterns = [
        r'"currentBid"[^,}]+',
        r'"location"[^,}]+',
        r'"price"[^,}]+',
        r'"addressLocality"[^,}]+'
    ]
    for pattern in json_patterns:
        matches = re.findall(pattern, content[:50000], re.IGNORECASE)
        if matches:
            print(f" {pattern}: {matches[:3]}")
    # Look for script tags with structured data
    script_matches = re.findall(r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', content, re.DOTALL)
    if script_matches:
        print(f"\n4. Structured data (JSON-LD) found:")
        for i, script in enumerate(script_matches[:2], 1):
            try:
                data = json.loads(script)
                print(f" Script {i}: {json.dumps(data, indent=6)[:500]}...")
            except:
                print(f" Script {i}: {script[:300]}...")

def main():
    """Main execution"""
    import sys
    # Check for test mode
    if len(sys.argv) > 1 and sys.argv[1] == "--test":
        test_url = sys.argv[2] if len(sys.argv) > 2 else None
        if test_url:
            test_extraction(test_url)
        else:
            test_extraction()
        return
    print("Troostwijk Auctions Scraper")
    print("=" * 60)
    print(f"Rate limit: {RATE_LIMIT_SECONDS} seconds BETWEEN EVERY REQUEST")
    print(f"Cache database: {CACHE_DB}")
    print(f"Output directory: {OUTPUT_DIR}")
    print(f"Max listing pages: {MAX_PAGES}")
    print("=" * 60)
    scraper = TroostwijkScraper()
    try:
        # Clear old cache (older than 7 days) - KEEP DATABASE CLEAN
        scraper.cache.clear_old(max_age_hours=168)
        # Run the crawler
        results = asyncio.run(scraper.crawl_auctions(max_pages=MAX_PAGES))
        # Save final results
        if results:
            json_file, csv_file = scraper.save_final_results(results)
            print("\n" + "="*60)
            print("CRAWLING COMPLETED SUCCESSFULLY")
            print("="*60)
            print(f"Total lots scraped: {len(results)}")
            print(f"JSON file: {json_file}")
            print(f"CSV file: {csv_file}")
            # Show sample
            if results:
                print(f"\n{'='*60}")
                print("SAMPLE DATA:")
                print(f"{'='*60}")
                sample = results[0]
                for key, value in sample.items():
                    if key != 'images':
                        print(f"{key:.<20}: {str(value)[:80]}...")
        else:
            print("\nNo results collected. Check cache and logs.")
    except KeyboardInterrupt:
        print("\nScraping interrupted by user - partial results saved in output directory")
    except Exception as e:
        print(f"\nERROR during scraping: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()

8
requirements.txt Normal file

@@ -0,0 +1,8 @@
# Troostwijk Auctions Scraper - Dependencies
# Core web scraping - Playwright for bypassing Cloudflare
playwright==1.49.0
# Python version requirement: 3.8+
# Install Playwright browsers after installing this package:
# playwright install chromium