init
.gitignore (vendored), new file, 176 lines
@@ -0,0 +1,176 @@
### Python template

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# Project specific - Troostwijk Scraper
output/
*.db
*.csv
*.json
!requirements.txt

# Playwright
.playwright/

# macOS
.DS_Store
README.md, new file, 217 lines
@@ -0,0 +1,217 @@
# Troostwijk Auctions Scraper

A robust web scraper for extracting auction lot data from Troostwijk Auctions, featuring intelligent caching, rate limiting, and Cloudflare bypass capabilities.

## Features

- **Playwright-based scraping** - Bypasses Cloudflare protection
- **SQLite caching** - Caches every page to avoid redundant requests
- **Rate limiting** - Strictly enforces 0.5 seconds between requests
- **Multi-format output** - Exports data in both JSON and CSV formats
- **Progress saving** - Automatically saves progress every 10 lots
- **Test mode** - Debug extraction patterns on cached pages

## Requirements

- Python 3.8+
- Playwright (with Chromium browser)

## Installation

1. **Clone or download this project**

2. **Install dependencies:**
   ```bash
   pip install -r requirements.txt
   ```

3. **Install Playwright browsers:**
   ```bash
   playwright install chromium
   ```

## Configuration

Edit the configuration variables in `main.py`:

```python
BASE_URL = "https://www.troostwijkauctions.com"
CACHE_DB = "/mnt/okcomputer/output/cache.db"   # Path to cache database
OUTPUT_DIR = "/mnt/okcomputer/output"          # Output directory
RATE_LIMIT_SECONDS = 0.5                       # Delay between requests
MAX_PAGES = 50                                 # Number of listing pages to crawl
```

**Note:** Update the paths to match your system (especially on Windows, use paths like `C:\\output\\cache.db`).
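
For example, a Windows-style setup might look like the sketch below (the paths are illustrative only; the variable names come from the block above):

```python
# Illustrative Windows configuration (adjust paths to your machine)
BASE_URL = "https://www.troostwijkauctions.com"
CACHE_DB = r"C:\output\cache.db"   # raw string avoids escaping backslashes
OUTPUT_DIR = r"C:\output"
RATE_LIMIT_SECONDS = 0.5           # keep at 0.5s or higher
MAX_PAGES = 50
```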

## Usage

### Basic Scraping

Run the scraper to collect auction lot data:

```bash
python main.py
```

This will:
1. Crawl listing pages to collect lot URLs
2. Scrape each individual lot page
3. Save results in both JSON and CSV formats
4. Cache all pages to avoid re-fetching

### Test Mode

Test extraction patterns on a specific cached URL:

```bash
# Test with default URL
python main.py --test

# Test with specific URL
python main.py --test "https://www.troostwijkauctions.com/a/lot-url-here"
```

This is useful for debugging extraction patterns and verifying that data is being extracted correctly.

## Output Files

The scraper generates the following files:

### During Execution
- `troostwijk_lots_partial_YYYYMMDD_HHMMSS.json` - Progress checkpoints (every 10 lots)

### Final Output
- `troostwijk_lots_final_YYYYMMDD_HHMMSS.json` - Complete data in JSON format
- `troostwijk_lots_final_YYYYMMDD_HHMMSS.csv` - Complete data in CSV format

### Cache
- `cache.db` - SQLite database with cached page content (persistent across runs)

## Data Extracted

For each auction lot, the scraper extracts the fields below; an example record follows the list.

- **URL** - Direct link to the lot
- **Lot ID** - Unique identifier (e.g., A7-35847)
- **Title** - Lot title/description
- **Current Bid** - Current bid amount
- **Bid Count** - Number of bids placed
- **End Date** - Auction end time
- **Location** - Physical location of the item
- **Description** - Detailed description
- **Category** - Auction category
- **Images** - Up to 5 product images
- **Scraped At** - Timestamp of data collection
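
A single record in the JSON output looks roughly like this (field names match the CSV columns written by `save_final_results()`; all values are illustrative, not real auction data):

```json
{
  "url": "https://www.troostwijkauctions.com/a/example-lot-A7-35847",
  "lot_id": "A7-35847",
  "title": "Example woodworking machine",
  "current_bid": "€150",
  "bid_count": 3,
  "end_date": "2024-12-01 18:00:00",
  "location": "Warsaw, PL",
  "description": "Short description of the lot",
  "category": "Machinery",
  "images": ["https://www.troostwijkauctions.com/images/example-1.jpg"],
  "scraped_at": "2024-11-20T14:32:10.123456"
}
```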

## How It Works

### Phase 1: Collect Lot URLs
The scraper iterates through auction listing pages (`/auctions?page=N`) and collects all lot URLs.

### Phase 2: Scrape Individual Lots
Each lot page is visited and data is extracted from the embedded `__NEXT_DATA__` JSON. The site is built with Next.js and includes all auction/lot data in a JSON structure, making extraction reliable and fast.
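
An abridged sketch of the structure the scraper reads is shown below. Only the keys consumed by `_extract_nextjs_data()` in `main.py` are listed; the values are illustrative and the real payload contains many more fields:

```json
{
  "props": {
    "pageProps": {
      "auction": {
        "displayId": "A7-35847",
        "name": "Example auction title",
        "urlSlug": "example-auction-A7-35847",
        "minEndDate": 1733072400,
        "description": "Example description",
        "category": {"name": "Machinery"},
        "image": {"url": "https://www.troostwijkauctions.com/images/example.jpg"},
        "viewingDays": [{"city": "Warsaw", "countryCode": "pl"}],
        "collectionDays": []
      }
    }
  }
}
```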

### Caching Strategy
- Every successfully fetched page is cached in SQLite (a quick way to inspect the cache is shown below)
- Cache is checked before making any request
- Cache entries older than 7 days are automatically cleaned
- Failed requests (500 errors) are also cached to avoid retrying
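
The cache lives in a single `cache` table with columns `url`, `content`, `timestamp`, and `status_code` (created in `CacheManager._init_db`). A minimal inspection sketch, assuming the default `CACHE_DB` path from the configuration:

```python
import sqlite3
import time

CACHE_DB = "/mnt/okcomputer/output/cache.db"  # adjust to your CACHE_DB setting

with sqlite3.connect(CACHE_DB) as conn:
    # Count cached pages and failed (non-200) entries
    total, = conn.execute("SELECT COUNT(*) FROM cache").fetchone()
    failed, = conn.execute("SELECT COUNT(*) FROM cache WHERE status_code != 200").fetchone()
    oldest, = conn.execute("SELECT MIN(timestamp) FROM cache").fetchone()
    print(f"cached pages: {total} (failed: {failed})")
    if oldest:
        print(f"oldest entry: {(time.time() - oldest) / 3600:.1f} hours old")
```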

### Rate Limiting
- Enforces exactly 0.5 seconds between ALL requests (see the sketch below)
- Applies to both listing pages and individual lot pages
- Prevents server overload and potential IP blocking
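
The enforcement logic in `TroostwijkScraper._rate_limit` boils down to sleeping for whatever remains of the 0.5-second window since the last request; a simplified standalone sketch:

```python
import asyncio
import time

RATE_LIMIT_SECONDS = 0.5
_last_request_time = 0.0

async def rate_limit() -> None:
    """Sleep just long enough so consecutive calls are at least 0.5s apart."""
    global _last_request_time
    elapsed = time.time() - _last_request_time
    if elapsed < RATE_LIMIT_SECONDS:
        await asyncio.sleep(RATE_LIMIT_SECONDS - elapsed)
    _last_request_time = time.time()
```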

## Troubleshooting

### Issue: "Huidig bod" / "Locatie" instead of actual values

**✓ FIXED!** The site uses Next.js with all data embedded in `__NEXT_DATA__` JSON. The scraper now automatically extracts data from the JSON first, falling back to HTML pattern matching only if needed.

The scraper correctly extracts:
- **Title** from `auction.name`
- **Location** from `viewingDays` or `collectionDays`
- **Images** from `auction.image.url`
- **End date** from `minEndDate`
- **Lot ID** from `auction.displayId`

To verify extraction is working:
```bash
python main.py --test "https://www.troostwijkauctions.com/a/your-auction-url"
```

**Note:** Some URLs point to auction pages (collections of lots) rather than individual lots. Individual lots within auctions may have bid information, while auction pages show the collection details.

### Issue: No lots found

- Check if the website structure has changed
- Verify `BASE_URL` is correct
- Try clearing the cache database

### Issue: Cloudflare blocking

- Playwright should bypass this automatically
- If issues persist, try adjusting the user agent or headers in `crawl_auctions()`

### Issue: Slow scraping

- This is intentional due to rate limiting (0.5s between requests)
- Adjust `RATE_LIMIT_SECONDS` if needed (not recommended below 0.5s)
- The first run will be slower; subsequent runs use the cache

## Project Structure

```
troost-scraper/
├── main.py             # Main scraper script
├── requirements.txt    # Python dependencies
├── README.md           # This file
└── output/             # Generated output files (created automatically)
    ├── cache.db        # SQLite cache
    ├── *.json          # JSON output files
    └── *.csv           # CSV output files
```

## Development

### Adding New Extraction Fields

1. Add an extraction method in the `TroostwijkScraper` class:
   ```python
   def _extract_new_field(self, content: str) -> str:
       pattern = r'your-regex-pattern'
       match = re.search(pattern, content)
       return match.group(1) if match else ""
   ```

2. Add the field to `_parse_lot_page()`:
   ```python
   data = {
       # ... existing fields ...
       'new_field': self._extract_new_field(content),
   }
   ```

3. Add the field to the CSV export in `save_final_results()`:
   ```python
   fieldnames = ['url', 'lot_id', ..., 'new_field', ...]
   ```

### Testing Extraction Patterns

Use test mode to verify patterns work correctly:
```bash
python main.py --test "https://www.troostwijkauctions.com/a/your-test-url"
```

## License

This scraper is for educational and research purposes. Please respect Troostwijk Auctions' terms of service and robots.txt when using this tool.

## Notes

- **Be respectful:** The rate limiting is intentionally conservative
- **Check legality:** Ensure web scraping is permitted in your jurisdiction
- **Monitor changes:** Website structure may change over time, requiring pattern updates
- **Cache management:** Old cache entries are auto-cleaned after 7 days
main.py, new file, 744 lines
@@ -0,0 +1,744 @@
#!/usr/bin/env python3
"""
Troostwijk Auctions Scraper
Focuses on extracting auction lots with caching and rate limiting
"""

import asyncio
import json
import csv
import re
import sqlite3
import time
from datetime import datetime
from urllib.parse import urljoin, urlparse
from pathlib import Path
from typing import Any, List, Dict, Optional, Set
import random

# Import Playwright - REQUIRED for bypassing Cloudflare
from playwright.async_api import async_playwright, Browser, Page

# ==================== CONFIGURATION ====================
BASE_URL = "https://www.troostwijkauctions.com"
CACHE_DB = "/mnt/okcomputer/output/cache.db"
OUTPUT_DIR = "/mnt/okcomputer/output"
RATE_LIMIT_SECONDS = 0.5  # EXACTLY 0.5 seconds between requests - YOUR REQUIREMENT
MAX_PAGES = 50            # Number of listing pages to crawl (adjust as needed)

# Setup directories
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)


class CacheManager:
    """Manages page caching using SQLite - EVERY PAGE IS CACHED"""

    def __init__(self, db_path: str):
        self.db_path = db_path
        self._init_db()

    def _init_db(self):
        """Initialize cache database"""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS cache (
                    url TEXT PRIMARY KEY,
                    content TEXT,
                    timestamp REAL,
                    status_code INTEGER
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_timestamp ON cache(timestamp)
            """)
            conn.commit()

    def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
        """Get cached page if it exists and is not too old"""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                "SELECT content, timestamp, status_code FROM cache WHERE url = ?",
                (url,)
            )
            row = cursor.fetchone()

            if row:
                content, timestamp, status_code = row
                age_hours = (time.time() - timestamp) / 3600

                if age_hours <= max_age_hours:
                    return {
                        'content': content,
                        'timestamp': timestamp,
                        'status_code': status_code,
                        'cached': True
                    }
        return None

    def set(self, url: str, content: str, status_code: int = 200):
        """Cache a page - EVERY SUCCESSFUL REQUEST IS CACHED"""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute(
                "INSERT OR REPLACE INTO cache (url, content, timestamp, status_code) VALUES (?, ?, ?, ?)",
                (url, content, time.time(), status_code)
            )
            conn.commit()
        print(f"  → Cached: {url}")

    def clear_old(self, max_age_hours: int = 168):  # Default: 1 week
        """Clear old cache entries to prevent database bloat"""
        cutoff_time = time.time() - (max_age_hours * 3600)
        with sqlite3.connect(self.db_path) as conn:
            deleted = conn.execute("DELETE FROM cache WHERE timestamp < ?", (cutoff_time,)).rowcount
            conn.commit()
        if deleted > 0:
            print(f"  → Cleared {deleted} old cache entries")


class TroostwijkScraper:
    """Main scraper class for Troostwijk Auctions"""

    def __init__(self):
        self.base_url = BASE_URL
        self.cache = CacheManager(CACHE_DB)
        self.visited_lots: Set[str] = set()
        self.output_data: List[Dict] = []
        self.last_request_time = 0

    async def _rate_limit(self):
        """ENSURE EXACTLY 0.5s BETWEEN REQUESTS - YOUR REQUIREMENT"""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time

        if time_since_last < RATE_LIMIT_SECONDS:
            delay = RATE_LIMIT_SECONDS - time_since_last
            await asyncio.sleep(delay)

        self.last_request_time = time.time()

    async def _get_page(self, page: Page, url: str, use_cache: bool = True) -> Optional[str]:
        """Get page content with caching and strict rate limiting"""
        # Check cache first - AVOID UNNECESSARY REQUESTS
        if use_cache:
            cached = self.cache.get(url)
            if cached:
                print(f"  CACHE HIT: {url}")
                return cached['content']

        # Rate limit before making request - YOUR 0.5s REQUIREMENT
        await self._rate_limit()

        try:
            print(f"  FETCHING: {url}")
            await page.goto(url, wait_until='networkidle', timeout=30000)

            # Small additional wait for dynamic content
            await asyncio.sleep(random.uniform(0.3, 0.7))

            content = await page.content()

            # Cache the successful result
            self.cache.set(url, content, 200)

            return content

        except Exception as e:
            print(f"  ERROR: {e}")
            # Cache the error to avoid retrying too soon
            self.cache.set(url, "", 500)
            return None

    def _extract_lot_urls_from_listing(self, content: str) -> List[str]:
        """Extract lot URLs from auction listing page"""
        # Pattern matches relative /a/... lot and auction URLs
        pattern = r'href=["\']([/]a/[^"\']+)["\']'
        matches = re.findall(pattern, content, re.IGNORECASE)

        lot_urls = []
        for match in matches:
            full_url = urljoin(self.base_url, match)
            lot_urls.append(full_url)

        # Remove duplicates
        return list(set(lot_urls))

    def _extract_lot_id(self, url: str) -> str:
        """Extract lot ID from URL"""
        path = urlparse(url).path
        # Try /lots/ pattern first (legacy)
        match = re.search(r'/lots/(\d+)', path)
        if match:
            return match.group(1)
        # Try /a/ pattern (current format: /a/title-A7-12345)
        match = re.search(r'/a/.*?([A-Z]\d+-\d+)', path)
        if match:
            return match.group(1)
        # Fallback: return last part of path
        return path.split('/')[-1] if path else ""

    def _parse_lot_page(self, content: str, url: str) -> Dict:
        """Parse individual lot page and extract data"""
        # First try to extract from __NEXT_DATA__ JSON (Next.js sites)
        next_data = self._extract_nextjs_data(content)
        if next_data:
            return next_data

        # Fallback to HTML parsing
        content = re.sub(r'\s+', ' ', content)

        data = {
            'url': url,
            'lot_id': self._extract_lot_id(url),
            'title': self._extract_meta_content(content, 'og:title'),
            'current_bid': self._extract_current_bid(content),
            'bid_count': self._extract_bid_count(content),
            'end_date': self._extract_end_date(content),
            'location': self._extract_location(content),
            'description': self._extract_description(content),
            'category': self._extract_category(content),
            'images': self._extract_images(content),
            'scraped_at': datetime.now().isoformat()
        }

        return data

    def _extract_nextjs_data(self, content: str) -> Optional[Dict]:
        """Extract data from Next.js __NEXT_DATA__ JSON"""
        try:
            # Find the __NEXT_DATA__ script tag
            match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
            if not match:
                return None

            data = json.loads(match.group(1))

            # Navigate to pageProps
            page_props = data.get('props', {}).get('pageProps', {})

            # Check if this is an auction page (contains lot data)
            if 'auction' in page_props:
                # This is a single lot/auction page
                auction = page_props.get('auction', {})

                # Extract main data
                result = {
                    'url': self.base_url + '/a/' + auction.get('urlSlug', ''),
                    'lot_id': auction.get('displayId', ''),
                    'title': auction.get('name', ''),
                    'current_bid': '',  # Need to check if this has bid info
                    'bid_count': 0,
                    'end_date': self._format_timestamp(auction.get('minEndDate', '')),
                    'location': self._extract_location_from_json(auction),
                    'description': auction.get('description', ''),
                    'category': auction.get('category', {}).get('name', '') if isinstance(auction.get('category'), dict) else '',
                    'images': [auction['image']['url']] if auction.get('image') and auction['image'].get('url') else [],
                    'scraped_at': datetime.now().isoformat()
                }

                return result

            return None

        except Exception as e:
            print(f"  → Error parsing __NEXT_DATA__: {e}")
            return None

    def _format_timestamp(self, timestamp: Any) -> str:
        """Convert Unix timestamp to readable date"""
        try:
            if isinstance(timestamp, (int, float)) and timestamp > 0:
                return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
            return str(timestamp) if timestamp else ''
        except Exception:
            return str(timestamp) if timestamp else ''

    def _extract_location_from_json(self, auction_data: Dict) -> str:
        """Extract location from auction JSON data"""
        # Try viewingDays first
        viewing_days = auction_data.get('viewingDays', [])
        if viewing_days and len(viewing_days) > 0:
            first_location = viewing_days[0]
            city = first_location.get('city', '')
            country = first_location.get('countryCode', '').upper()
            if city:
                return f"{city}, {country}" if country else city

        # Try collectionDays
        collection_days = auction_data.get('collectionDays', [])
        if collection_days and len(collection_days) > 0:
            first_location = collection_days[0]
            city = first_location.get('city', '')
            country = first_location.get('countryCode', '').upper()
            if city:
                return f"{city}, {country}" if country else city

        return ''

    def _extract_meta_content(self, content: str, property_name: str) -> str:
        """Extract content from meta tags"""
        pattern = rf'<meta[^>]*property=["\']{property_name}["\'][^>]*content=["\']([^"\']+)["\']'
        match = re.search(pattern, content, re.IGNORECASE)
        if match:
            return self._clean_text(match.group(1))
        return ""

    def _extract_current_bid(self, content: str) -> str:
        """Extract current bid amount"""
        patterns = [
            # JSON data patterns (most reliable)
            r'"currentBid"\s*:\s*"([^"]+)"',
            r'"currentBid"\s*:\s*(\d+(?:\.\d+)?)',
            r'currentBid["\']?\s*:\s*["\']?([€\d,.\s]+)["\']?',
            # HTML patterns - look for bid amount AFTER the label
            r'(?:Current bid|Huidig bod)[:\s]*</?\w*>\s*(€[\d,.\s]+)',
            r'(?:Current bid|Huidig bod)[:\s]+(€[\d,.\s]+)',
            r'<[^>]*bid-amount[^>]*>[\s]*(€[\d,.\s]+)',
            # Meta tags
            r'<meta[^>]*property=["\']auction:currentBid["\'][^>]*content=["\']([^"\']+)["\']',
            # Structured data
            r'"price"\s*:\s*"([€\d,.\s]+)"',
        ]

        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
            if match:
                bid = match.group(1).strip()
                # Validate it's not just the label
                if bid and bid.lower() not in ['huidig bod', 'current bid', 'locatie', 'location']:
                    # Clean up the bid value
                    if not bid.startswith('€'):
                        bid = f"€{bid}"
                    return bid

        return "€0"

    def _extract_bid_count(self, content: str) -> int:
        """Extract number of bids"""
        patterns = [
            r'(\d+)\s*bids?',
            r'bidCount["\']:\s*["\']?(\d+)["\']?'
        ]

        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                try:
                    return int(match.group(1))
                except ValueError:
                    return 0

        return 0

    def _extract_end_date(self, content: str) -> str:
        """Extract auction end date"""
        patterns = [
            r'Ends?[:\s]+([A-Za-z0-9,:\s]+)',
            r'endTime["\']:\s*["\']([^"\']+)["\']',
            r'class="[^"]*end[^"]*".*?>([A-Za-z0-9,:\s]+)<'
        ]

        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                return match.group(1).strip()

        return ""

    def _extract_location(self, content: str) -> str:
        """Extract location"""
        patterns = [
            # JSON data patterns (most reliable)
            r'"location"\s*:\s*"([^"]+)"',
            r'"address"\s*:\s*"([^"]+)"',
            r'"addressLocality"\s*:\s*"([^"]+)"',
            # HTML patterns - look for location AFTER the label
            r'(?:Location|Locatie)[:\s]*</?\w*>\s*([A-Za-zÀ-ÿ0-9\s,.-]+?)(?:<|$)',
            r'(?:Location|Locatie)[:\s]+([A-Za-zÀ-ÿ0-9\s,.-]+?)(?:<br|</|$)',
            r'<[^>]*location[^>]*>[\s]*([A-Za-zÀ-ÿ0-9\s,.-]+?)</[^>]*>',
            # Icon or label based
            r'<i[^>]*location[^>]*></i>\s*([A-Za-zÀ-ÿ0-9\s,.-]+)',
            # Meta tags
            r'<meta[^>]*property=["\']auction:location["\'][^>]*content=["\']([^"\']+)["\']',
        ]

        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
            if match:
                location = self._clean_text(match.group(1))
                # Validate it's not just the label
                if location and location.lower() not in ['locatie', 'location', 'huidig bod', 'current bid']:
                    # Remove trailing punctuation and whitespace
                    location = re.sub(r'[,.\s]+$', '', location)
                    if len(location) > 2:  # Must be more than 2 chars
                        return location

        return ""

    def _extract_description(self, content: str) -> str:
        """Extract description"""
        patterns = [
            r'<meta[^>]*name=["\']description["\'][^>]*content=["\']([^"\']+)["\']',
            r'class="[^"]*description[^"]*".*?>([^<]+)<'
        ]

        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
            if match:
                return self._clean_text(match.group(1))[:500]

        return ""

    def _extract_category(self, content: str) -> str:
        """Extract category from breadcrumb or meta tags"""
        # Try breadcrumb first
        pattern = r'class="breadcrumb[^"]*".*?>([A-Za-z\s]+)</a>'
        match = re.search(pattern, content, re.IGNORECASE)
        if match:
            return self._clean_text(match.group(1))

        # Try meta
        return self._extract_meta_content(content, 'category')

    def _extract_images(self, content: str) -> List[str]:
        """Extract image URLs"""
        pattern = r'<img[^>]*src=["\']([^"\']+\.jpe?g|[^"\']+\.png)["\'][^>]*>'
        matches = re.findall(pattern, content, re.IGNORECASE)

        images = []
        for match in matches:
            if any(skip in match.lower() for skip in ['logo', 'icon', 'placeholder', 'banner']):
                continue
            full_url = urljoin(self.base_url, match)
            images.append(full_url)

        return images[:5]  # Limit to 5 images

    def _clean_text(self, text: str) -> str:
        """Clean extracted text"""
        import html
        text = html.unescape(text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    async def crawl_listing_page(self, page: Page, page_num: int) -> List[str]:
        """Crawl a single listing page and return lot URLs"""
        url = f"{self.base_url}/auctions?page={page_num}"
        print(f"\n{'='*60}")
        print(f"LISTING PAGE {page_num}: {url}")
        print(f"{'='*60}")

        content = await self._get_page(page, url)
        if not content:
            return []

        lot_urls = self._extract_lot_urls_from_listing(content)
        print(f"→ Found {len(lot_urls)} lot URLs")

        return lot_urls

    async def crawl_lot(self, page: Page, url: str) -> Optional[Dict]:
        """Crawl an individual lot page"""
        if url in self.visited_lots:
            print(f"  → Skipping (already visited): {url}")
            return None

        lot_id = self._extract_lot_id(url)
        print(f"\n[LOT {lot_id}]")

        content = await self._get_page(page, url)
        if not content:
            return None

        lot_data = self._parse_lot_page(content, url)
        self.visited_lots.add(url)

        print(f"  → Title: {lot_data.get('title', 'N/A')[:60]}...")
        print(f"  → Bid: {lot_data.get('current_bid', 'N/A')}")
        print(f"  → Location: {lot_data.get('location', 'N/A')}")

        return lot_data

    async def crawl_auctions(self, max_pages: int = MAX_PAGES) -> List[Dict]:
        """Main crawl function"""
        async with async_playwright() as p:
            print("Launching browser...")
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-blink-features=AutomationControlled'
                ]
            )

            page = await browser.new_page(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
            )

            # Set extra headers
            await page.set_extra_http_headers({
                'Accept-Language': 'en-US,en;q=0.9',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
            })

            all_lot_urls = []

            # First pass: collect all lot URLs from listing pages
            print("\n" + "="*60)
            print("PHASE 1: COLLECTING LOT URLs FROM LISTING PAGES")
            print("="*60)

            for page_num in range(1, max_pages + 1):
                lot_urls = await self.crawl_listing_page(page, page_num)
                if not lot_urls:
                    print(f"No lots found on page {page_num}, stopping")
                    break
                all_lot_urls.extend(lot_urls)
                print(f"  → Total lots collected so far: {len(all_lot_urls)}")

            # Remove duplicates
            all_lot_urls = list(set(all_lot_urls))
            print(f"\n{'='*60}")
            print(f"PHASE 1 COMPLETE: {len(all_lot_urls)} UNIQUE LOTS TO SCRAPE")
            print(f"{'='*60}")

            # Second pass: scrape each lot page
            print("\n" + "="*60)
            print("PHASE 2: SCRAPING INDIVIDUAL LOT PAGES")
            print("="*60)

            results = []
            for i, lot_url in enumerate(all_lot_urls):
                print(f"\n[{i+1:>3}/{len(all_lot_urls)}] ", end="")
                lot_data = await self.crawl_lot(page, lot_url)
                if lot_data:
                    results.append(lot_data)
                    # Save progress after each successful scrape
                    if (i + 1) % 10 == 0:  # Save every 10 lots
                        self._save_intermediate(results)

            await browser.close()
            return results

    def _save_intermediate(self, data: List[Dict]):
        """Save intermediate results"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{OUTPUT_DIR}/troostwijk_lots_partial_{timestamp}.json"

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump({
                'count': len(data),
                'lots': data
            }, f, indent=2, ensure_ascii=False)

        print(f"\n  → PROGRESS SAVED: {filename}")

    def save_final_results(self, data: List[Dict]):
        """Save final results in multiple formats"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save JSON
        json_file = f"{OUTPUT_DIR}/troostwijk_lots_final_{timestamp}.json"
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump({
                'count': len(data),
                'scraped_at': datetime.now().isoformat(),
                'rate_limit_seconds': RATE_LIMIT_SECONDS,
                'lots': data
            }, f, indent=2, ensure_ascii=False)

        # Save CSV
        csv_file = f"{OUTPUT_DIR}/troostwijk_lots_final_{timestamp}.csv"
        if data:
            flat_data = []
            for item in data:
                flat_item = item.copy()
                flat_item['images'] = ', '.join(flat_item.get('images', []))
                flat_data.append(flat_item)

            with open(csv_file, 'w', newline='', encoding='utf-8') as f:
                fieldnames = ['url', 'lot_id', 'title', 'current_bid', 'bid_count',
                              'end_date', 'location', 'description', 'category', 'images', 'scraped_at']
                writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
                writer.writeheader()
                writer.writerows(flat_data)

        return json_file, csv_file


def test_extraction(test_url: str = "https://www.troostwijkauctions.com/a/machines-en-toebehoren-%28hout-en-kunststofverwerking-handlingapparatuur-bouwmachines-landbouwindustrie%29-oost-europa-december-A7-35847"):
    """Test extraction on a specific cached URL to debug patterns"""
    scraper = TroostwijkScraper()

    # Try to get from cache
    cached = scraper.cache.get(test_url)
    if not cached:
        print(f"ERROR: URL not found in cache: {test_url}")
        print("\nAvailable cached URLs:")
        with sqlite3.connect(CACHE_DB) as conn:
            cursor = conn.execute("SELECT url FROM cache ORDER BY timestamp DESC LIMIT 10")
            for row in cursor.fetchall():
                print(f"  - {row[0]}")
        return

    content = cached['content']
    print(f"\n{'='*60}")
    print(f"TESTING EXTRACTION FROM: {test_url}")
    print(f"{'='*60}")
    print(f"Content length: {len(content)} chars")
    print(f"Cache age: {(time.time() - cached['timestamp']) / 3600:.1f} hours")

    # Test each extraction method
    lot_data = scraper._parse_lot_page(content, test_url)

    print(f"\n{'='*60}")
    print("EXTRACTED DATA:")
    print(f"{'='*60}")
    for key, value in lot_data.items():
        if key == 'images':
            print(f"{key:.<20}: {len(value)} images")
            for img in value[:3]:
                print(f"{'':.<20} - {img}")
        else:
            display_value = str(value)[:100] if value else "(empty)"
            # Handle Unicode characters that Windows console can't display
            try:
                print(f"{key:.<20}: {display_value}")
            except UnicodeEncodeError:
                safe_value = display_value.encode('ascii', 'replace').decode('ascii')
                print(f"{key:.<20}: {safe_value}")

    # Validation checks
    print(f"\n{'='*60}")
    print("VALIDATION CHECKS:")
    print(f"{'='*60}")

    issues = []
    if lot_data['current_bid'] in ['Huidig bod', 'Current bid', '€0', '']:
        issues.append("[!] Current bid not extracted correctly")
    else:
        print("[OK] Current bid looks valid:", lot_data['current_bid'])

    if lot_data['location'] in ['Locatie', 'Location', '']:
        issues.append("[!] Location not extracted correctly")
    else:
        print("[OK] Location looks valid:", lot_data['location'])

    if lot_data['title'] in ['', '...']:
        issues.append("[!] Title not extracted correctly")
    else:
        print("[OK] Title looks valid:", lot_data['title'][:50])

    if issues:
        print("\n[ISSUES FOUND]")
        for issue in issues:
            print(f"  {issue}")
    else:
        print("\n[ALL FIELDS EXTRACTED SUCCESSFULLY!]")

    # Debug: Show raw HTML snippets for problematic fields
    print(f"\n{'='*60}")
    print("DEBUG: RAW HTML SNIPPETS")
    print(f"{'='*60}")

    # Look for bid-related content
    print("\n1. Bid patterns in content:")
    bid_matches = re.findall(r'.{0,50}(€[\d,.\s]+).{0,50}', content[:10000])
    for i, match in enumerate(bid_matches[:5], 1):
        print(f"  {i}. {match}")

    # Look for location content
    print("\n2. Location patterns in content:")
    loc_matches = re.findall(r'.{0,30}(Locatie|Location).{0,100}', content, re.IGNORECASE)
    for i, match in enumerate(loc_matches[:5], 1):
        print(f"  {i}. ...{match}...")

    # Look for JSON data
    print("\n3. JSON/Script data containing auction info:")
    json_patterns = [
        r'"currentBid"[^,}]+',
        r'"location"[^,}]+',
        r'"price"[^,}]+',
        r'"addressLocality"[^,}]+'
    ]
    for pattern in json_patterns:
        matches = re.findall(pattern, content[:50000], re.IGNORECASE)
        if matches:
            print(f"  {pattern}: {matches[:3]}")

    # Look for script tags with structured data
    script_matches = re.findall(r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', content, re.DOTALL)
    if script_matches:
        print("\n4. Structured data (JSON-LD) found:")
        for i, script in enumerate(script_matches[:2], 1):
            try:
                data = json.loads(script)
                print(f"  Script {i}: {json.dumps(data, indent=6)[:500]}...")
            except Exception:
                print(f"  Script {i}: {script[:300]}...")


def main():
    """Main execution"""
    import sys

    # Check for test mode
    if len(sys.argv) > 1 and sys.argv[1] == "--test":
        test_url = sys.argv[2] if len(sys.argv) > 2 else None
        if test_url:
            test_extraction(test_url)
        else:
            test_extraction()
        return

    print("Troostwijk Auctions Scraper")
    print("=" * 60)
    print(f"Rate limit: {RATE_LIMIT_SECONDS} seconds BETWEEN EVERY REQUEST")
    print(f"Cache database: {CACHE_DB}")
    print(f"Output directory: {OUTPUT_DIR}")
    print(f"Max listing pages: {MAX_PAGES}")
    print("=" * 60)

    scraper = TroostwijkScraper()

    try:
        # Clear old cache (older than 7 days) - KEEP DATABASE CLEAN
        scraper.cache.clear_old(max_age_hours=168)

        # Run the crawler
        results = asyncio.run(scraper.crawl_auctions(max_pages=MAX_PAGES))

        # Save final results
        if results:
            json_file, csv_file = scraper.save_final_results(results)

            print("\n" + "="*60)
            print("CRAWLING COMPLETED SUCCESSFULLY")
            print("="*60)
            print(f"Total lots scraped: {len(results)}")
            print(f"JSON file: {json_file}")
            print(f"CSV file: {csv_file}")

            # Show sample
            if results:
                print(f"\n{'='*60}")
                print("SAMPLE DATA:")
                print(f"{'='*60}")
                sample = results[0]
                for key, value in sample.items():
                    if key != 'images':
                        print(f"{key:.<20}: {str(value)[:80]}...")
        else:
            print("\nNo results collected. Check cache and logs.")

    except KeyboardInterrupt:
        print("\nScraping interrupted by user - partial results saved in output directory")
    except Exception as e:
        print(f"\nERROR during scraping: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
requirements.txt, new file, 8 lines
@@ -0,0 +1,8 @@
# Troostwijk Auctions Scraper - Dependencies

# Core web scraping - Playwright for bypassing Cloudflare
playwright==1.49.0

# Python version requirement: 3.8+
# Install Playwright browsers after installing this package:
# playwright install chromium