first

src/cache.py (new file, 178 lines)
@@ -0,0 +1,178 @@
#!/usr/bin/env python3
"""
Cache Manager module for SQLite-based caching and data storage
"""

import sqlite3
import time
import zlib
from datetime import datetime
from typing import Dict, List, Optional

import config


class CacheManager:
    """Manages page caching and data storage using SQLite"""

    def __init__(self, db_path: Optional[str] = None):
        self.db_path = db_path or config.CACHE_DB
        self._init_db()

    def _init_db(self):
        """Initialize cache and data storage database"""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS cache (
                    url TEXT PRIMARY KEY,
                    content BLOB,
                    timestamp REAL,
                    status_code INTEGER
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_timestamp ON cache(timestamp)
            """)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS auctions (
                    auction_id TEXT PRIMARY KEY,
                    url TEXT UNIQUE,
                    title TEXT,
                    location TEXT,
                    lots_count INTEGER,
                    first_lot_closing_time TEXT,
                    scraped_at TEXT
                )
            """)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS lots (
                    lot_id TEXT PRIMARY KEY,
                    auction_id TEXT,
                    url TEXT UNIQUE,
                    title TEXT,
                    current_bid TEXT,
                    bid_count INTEGER,
                    closing_time TEXT,
                    viewing_time TEXT,
                    pickup_date TEXT,
                    location TEXT,
                    description TEXT,
                    category TEXT,
                    scraped_at TEXT,
                    FOREIGN KEY (auction_id) REFERENCES auctions(auction_id)
                )
            """)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS images (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    lot_id TEXT,
                    url TEXT,
                    local_path TEXT,
                    downloaded INTEGER DEFAULT 0,
                    FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
                )
            """)
            conn.commit()

    def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
        """Get cached page if it exists and is not too old"""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                "SELECT content, timestamp, status_code FROM cache WHERE url = ?",
                (url,)
            )
            row = cursor.fetchone()

            if row:
                content, timestamp, status_code = row
                age_hours = (time.time() - timestamp) / 3600

                if age_hours <= max_age_hours:
                    try:
                        content = zlib.decompress(content).decode('utf-8')
                    except Exception as e:
                        print(f"    ⚠️ Failed to decompress cache for {url}: {e}")
                        return None

                    return {
                        'content': content,
                        'timestamp': timestamp,
                        'status_code': status_code,
                        'cached': True
                    }
            return None

    def set(self, url: str, content: str, status_code: int = 200):
        """Cache a page with compression"""
        with sqlite3.connect(self.db_path) as conn:
            compressed_content = zlib.compress(content.encode('utf-8'), level=9)
            original_size = len(content.encode('utf-8'))
            compressed_size = len(compressed_content)
            ratio = (1 - compressed_size / original_size) * 100 if original_size > 0 else 0

            conn.execute(
                "INSERT OR REPLACE INTO cache (url, content, timestamp, status_code) VALUES (?, ?, ?, ?)",
                (url, compressed_content, time.time(), status_code)
            )
            conn.commit()
            print(f"    → Cached: {url} (compressed {ratio:.1f}%)")

    def clear_old(self, max_age_hours: int = 168):
        """Clear old cache entries to prevent database bloat"""
        cutoff_time = time.time() - (max_age_hours * 3600)
        with sqlite3.connect(self.db_path) as conn:
            deleted = conn.execute("DELETE FROM cache WHERE timestamp < ?", (cutoff_time,)).rowcount
            conn.commit()
            if deleted > 0:
                print(f"    → Cleared {deleted} old cache entries")

    def save_auction(self, auction_data: Dict):
        """Save auction data to database"""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                INSERT OR REPLACE INTO auctions
                (auction_id, url, title, location, lots_count, first_lot_closing_time, scraped_at)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            """, (
                auction_data['auction_id'],
                auction_data['url'],
                auction_data['title'],
                auction_data['location'],
                auction_data.get('lots_count', 0),
                auction_data.get('first_lot_closing_time', ''),
                auction_data['scraped_at']
            ))
            conn.commit()

    def save_lot(self, lot_data: Dict):
        """Save lot data to database"""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                INSERT OR REPLACE INTO lots
                (lot_id, auction_id, url, title, current_bid, bid_count, closing_time,
                 viewing_time, pickup_date, location, description, category, scraped_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                lot_data['lot_id'],
                lot_data.get('auction_id', ''),
                lot_data['url'],
                lot_data['title'],
                lot_data.get('current_bid', ''),
                lot_data.get('bid_count', 0),
                lot_data.get('closing_time', ''),
                lot_data.get('viewing_time', ''),
                lot_data.get('pickup_date', ''),
                lot_data.get('location', ''),
                lot_data.get('description', ''),
                lot_data.get('category', ''),
                lot_data['scraped_at']
            ))
            conn.commit()

    def save_images(self, lot_id: str, image_urls: List[str]):
        """Save image URLs for a lot"""
        with sqlite3.connect(self.db_path) as conn:
            for url in image_urls:
                conn.execute("""
                    INSERT OR IGNORE INTO images (lot_id, url) VALUES (?, ?)
                """, (lot_id, url))
            conn.commit()
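A minimal usage sketch of CacheManager (illustrative, not part of this commit; the /tmp path is arbitrary, and importing cache pulls in config, which creates the configured output directories):

from cache import CacheManager

# Cache one page and read it back while it is still fresh.
cache = CacheManager(db_path="/tmp/cache-demo.db")
cache.set("https://www.troostwijkauctions.com/l/example-lot", "<html>lot page</html>")

hit = cache.get("https://www.troostwijkauctions.com/l/example-lot", max_age_hours=24)
if hit:
    print(hit['status_code'], len(hit['content']))  # content comes back decompressed

Entries older than max_age_hours are ignored by get() and eventually removed by clear_old().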
src/config.py (new file, 26 lines)
@@ -0,0 +1,26 @@
#!/usr/bin/env python3
"""
Configuration module for Scaev Auctions Scraper
"""

import sys
from pathlib import Path

# Require Python 3.10+
if sys.version_info < (3, 10):
    print("ERROR: This script requires Python 3.10 or higher")
    print(f"Current version: {sys.version}")
    sys.exit(1)

# ==================== CONFIGURATION ====================
BASE_URL = "https://www.troostwijkauctions.com"
CACHE_DB = "/mnt/okcomputer/output/cache.db"
OUTPUT_DIR = "/mnt/okcomputer/output"
IMAGES_DIR = "/mnt/okcomputer/output/images"
RATE_LIMIT_SECONDS = 0.5   # EXACTLY 0.5 seconds between requests
MAX_PAGES = 50             # Number of listing pages to crawl
DOWNLOAD_IMAGES = False    # Set to True to download images

# Setup directories
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
Path(IMAGES_DIR).mkdir(parents=True, exist_ok=True)
src/main.py (new file, 81 lines)
@@ -0,0 +1,81 @@
#!/usr/bin/env python3
"""
Troostwijk Auctions Scraper - Main Entry Point
Focuses on extracting auction lots with caching and rate limiting
"""

import sys
import asyncio
import json
import csv
import sqlite3
from datetime import datetime
from pathlib import Path

import config
from cache import CacheManager
from scraper import TroostwijkScraper


def main():
    """Main execution"""
    # Check for test mode
    if len(sys.argv) > 1 and sys.argv[1] == "--test":
        # Import test function only when needed to avoid circular imports
        from test import test_extraction
        test_url = sys.argv[2] if len(sys.argv) > 2 else None
        if test_url:
            test_extraction(test_url)
        else:
            test_extraction()
        return

    print("Troostwijk Auctions Scraper")
    print("=" * 60)
    print(f"Rate limit: {config.RATE_LIMIT_SECONDS} seconds BETWEEN EVERY REQUEST")
    print(f"Cache database: {config.CACHE_DB}")
    print(f"Output directory: {config.OUTPUT_DIR}")
    print(f"Max listing pages: {config.MAX_PAGES}")
    print("=" * 60)

    scraper = TroostwijkScraper()

    try:
        # Clear old cache (older than 7 days) - KEEP DATABASE CLEAN
        scraper.cache.clear_old(max_age_hours=168)

        # Run the crawler
        results = asyncio.run(scraper.crawl_auctions(max_pages=config.MAX_PAGES))

        # Export results to files
        print("\n" + "=" * 60)
        print("EXPORTING RESULTS TO FILES")
        print("=" * 60)

        files = scraper.export_to_files()

        print("\n" + "=" * 60)
        print("CRAWLING COMPLETED SUCCESSFULLY")
        print("=" * 60)
        print(f"Total pages scraped: {len(results)}")
        print(f"\nAuctions JSON: {files['auctions_json']}")
        print(f"Auctions CSV: {files['auctions_csv']}")
        print(f"Lots JSON: {files['lots_json']}")
        print(f"Lots CSV: {files['lots_csv']}")

        # Count auctions vs lots
        auctions = [r for r in results if r.get('type') == 'auction']
        lots = [r for r in results if r.get('type') == 'lot']
        print(f"\n  Auctions: {len(auctions)}")
        print(f"  Lots: {len(lots)}")

    except KeyboardInterrupt:
        print("\nScraping interrupted by user - partial results saved in output directory")
    except Exception as e:
        print(f"\nERROR during scraping: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
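Note: main() calls scraper.export_to_files(), which is not defined in any file of this commit, so a full run would fail with AttributeError after crawling unless the exporter arrives in a later change. A hypothetical sketch of such an exporter (names and behavior assumed, not the author's implementation), reading the auctions and lots tables that CacheManager writes and returning the four paths main() prints:

import csv
import json
import sqlite3
from pathlib import Path

import config

def export_to_files(db_path: str = config.CACHE_DB, output_dir: str = config.OUTPUT_DIR) -> dict:
    """Hypothetical: dump the auctions and lots tables to JSON and CSV."""
    out = Path(output_dir)
    paths = {}
    with sqlite3.connect(db_path) as conn:
        conn.row_factory = sqlite3.Row
        for table in ("auctions", "lots"):
            rows = [dict(r) for r in conn.execute(f"SELECT * FROM {table}")]
            json_path = out / f"{table}.json"
            csv_path = out / f"{table}.csv"
            json_path.write_text(json.dumps(rows, indent=2, ensure_ascii=False), encoding="utf-8")
            with open(csv_path, "w", newline="", encoding="utf-8") as f:
                if rows:
                    writer = csv.DictWriter(f, fieldnames=rows[0].keys())
                    writer.writeheader()
                    writer.writerows(rows)
            paths[f"{table}_json"] = str(json_path)
            paths[f"{table}_csv"] = str(csv_path)
    return paths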
src/parse.py (new file, 303 lines)
@@ -0,0 +1,303 @@
#!/usr/bin/env python3
"""
Parser module for extracting data from HTML/JSON content
"""
import json
import re
import html
from datetime import datetime
from urllib.parse import urljoin, urlparse
from typing import Dict, List, Optional

from config import BASE_URL


class DataParser:
    """Handles all data extraction from HTML/JSON content"""

    @staticmethod
    def extract_lot_id(url: str) -> str:
        """Extract lot ID from URL"""
        path = urlparse(url).path
        match = re.search(r'/lots/(\d+)', path)
        if match:
            return match.group(1)
        match = re.search(r'/a/.*?([A-Z]\d+-\d+)', path)
        if match:
            return match.group(1)
        return path.split('/')[-1] if path else ""

    @staticmethod
    def clean_text(text: str) -> str:
        """Clean extracted text"""
        text = html.unescape(text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    @staticmethod
    def format_timestamp(timestamp) -> str:
        """Convert Unix timestamp to readable date"""
        try:
            if isinstance(timestamp, (int, float)) and timestamp > 0:
                return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
            return str(timestamp) if timestamp else ''
        except Exception:
            return str(timestamp) if timestamp else ''

    @staticmethod
    def format_currency(amount) -> str:
        """Format currency amount"""
        if isinstance(amount, (int, float)):
            return f"€{amount:,.2f}" if amount > 0 else "€0"
        return str(amount) if amount else "€0"

    def parse_page(self, content: str, url: str) -> Optional[Dict]:
        """Parse page and determine if it's an auction or lot"""
        next_data = self._extract_nextjs_data(content, url)
        if next_data:
            return next_data

        content = re.sub(r'\s+', ' ', content)
        return {
            'type': 'lot',
            'url': url,
            'lot_id': self.extract_lot_id(url),
            'title': self._extract_meta_content(content, 'og:title'),
            'current_bid': self._extract_current_bid(content),
            'bid_count': self._extract_bid_count(content),
            'closing_time': self._extract_end_date(content),
            'location': self._extract_location(content),
            'description': self._extract_description(content),
            'category': self._extract_category(content),
            'images': self._extract_images(content),
            'scraped_at': datetime.now().isoformat()
        }

    def _extract_nextjs_data(self, content: str, url: str) -> Optional[Dict]:
        """Extract data from Next.js __NEXT_DATA__ JSON"""
        try:
            match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
            if not match:
                return None

            data = json.loads(match.group(1))
            page_props = data.get('props', {}).get('pageProps', {})

            if 'lot' in page_props:
                return self._parse_lot_json(page_props.get('lot', {}), url)
            if 'auction' in page_props:
                return self._parse_auction_json(page_props.get('auction', {}), url)
            return None

        except Exception as e:
            print(f"    → Error parsing __NEXT_DATA__: {e}")
            return None

    def _parse_lot_json(self, lot_data: Dict, url: str) -> Dict:
        """Parse lot data from JSON"""
        location_data = lot_data.get('location', {})
        city = location_data.get('city', '')
        country = location_data.get('countryCode', '').upper()
        location = f"{city}, {country}" if city and country else (city or country)

        current_bid = lot_data.get('currentBid') or lot_data.get('highestBid') or lot_data.get('startingBid')
        if current_bid is None or current_bid == 0:
            bidding = lot_data.get('bidding', {})
            current_bid = bidding.get('currentBid') or bidding.get('amount')

        current_bid_str = self.format_currency(current_bid) if current_bid and current_bid > 0 else "No bids"

        bid_count = lot_data.get('bidCount', 0)
        if bid_count == 0:
            bid_count = lot_data.get('bidding', {}).get('bidCount', 0)

        description = lot_data.get('description', {})
        if isinstance(description, dict):
            description = description.get('description', '')
        else:
            description = str(description)

        category = lot_data.get('category', {})
        category_name = category.get('name', '') if isinstance(category, dict) else ''

        return {
            'type': 'lot',
            'lot_id': lot_data.get('displayId', ''),
            'auction_id': lot_data.get('auctionId', ''),
            'url': url,
            'title': lot_data.get('title', ''),
            'current_bid': current_bid_str,
            'bid_count': bid_count,
            'closing_time': self.format_timestamp(lot_data.get('endDate', '')),
            'viewing_time': self._extract_viewing_time(lot_data),
            'pickup_date': self._extract_pickup_date(lot_data),
            'location': location,
            'description': description,
            'category': category_name,
            'images': self._extract_images_from_json(lot_data),
            'scraped_at': datetime.now().isoformat()
        }

    def _parse_auction_json(self, auction_data: Dict, url: str) -> Optional[Dict]:
        """Parse auction data from JSON"""
        is_auction = 'lots' in auction_data and isinstance(auction_data['lots'], list)
        is_lot = 'lotNumber' in auction_data or 'currentBid' in auction_data

        if is_auction:
            lots = auction_data.get('lots', [])
            first_lot_closing = None
            if lots:
                first_lot_closing = self.format_timestamp(lots[0].get('endDate', ''))

            return {
                'type': 'auction',
                'auction_id': auction_data.get('displayId', ''),
                'url': url,
                'title': auction_data.get('name', ''),
                'location': self._extract_location_from_json(auction_data),
                'lots_count': len(lots),
                'first_lot_closing_time': first_lot_closing or self.format_timestamp(auction_data.get('minEndDate', '')),
                'scraped_at': datetime.now().isoformat(),
                'lots': lots
            }
        elif is_lot:
            return self._parse_lot_json(auction_data, url)
        return None

    def _extract_viewing_time(self, auction_data: Dict) -> str:
        """Extract viewing time from auction data"""
        viewing_days = auction_data.get('viewingDays', [])
        if viewing_days:
            first = viewing_days[0]
            start = self.format_timestamp(first.get('startDate', ''))
            end = self.format_timestamp(first.get('endDate', ''))
            if start and end:
                return f"{start} - {end}"
            return start or end
        return ''

    def _extract_pickup_date(self, auction_data: Dict) -> str:
        """Extract pickup date from auction data"""
        collection_days = auction_data.get('collectionDays', [])
        if collection_days:
            first = collection_days[0]
            start = self.format_timestamp(first.get('startDate', ''))
            end = self.format_timestamp(first.get('endDate', ''))
            if start and end:
                return f"{start} - {end}"
            return start or end
        return ''

    def _extract_images_from_json(self, auction_data: Dict) -> List[str]:
        """Extract all image URLs from auction data"""
        images = []
        if auction_data.get('image', {}).get('url'):
            images.append(auction_data['image']['url'])
        if isinstance(auction_data.get('images'), list):
            for img in auction_data['images']:
                if isinstance(img, dict) and img.get('url'):
                    images.append(img['url'])
                elif isinstance(img, str):
                    images.append(img)
        return images

    def _extract_location_from_json(self, auction_data: Dict) -> str:
        """Extract location from auction JSON data"""
        for days in [auction_data.get('viewingDays', []), auction_data.get('collectionDays', [])]:
            if days:
                first_location = days[0]
                city = first_location.get('city', '')
                country = first_location.get('countryCode', '').upper()
                if city:
                    return f"{city}, {country}" if country else city
        return ''

    def _extract_meta_content(self, content: str, property_name: str) -> str:
        """Extract content from meta tags"""
        pattern = rf'<meta[^>]*property=["\']{property_name}["\'][^>]*content=["\']([^"\']+)["\']'
        match = re.search(pattern, content, re.IGNORECASE)
        return self.clean_text(match.group(1)) if match else ""

    def _extract_current_bid(self, content: str) -> str:
        """Extract current bid amount"""
        patterns = [
            r'"currentBid"\s*:\s*"([^"]+)"',
            r'"currentBid"\s*:\s*(\d+(?:\.\d+)?)',
            r'(?:Current bid|Huidig bod)[:\s]*</?\w*>\s*(€[\d,.\s]+)',
            r'(?:Current bid|Huidig bod)[:\s]+(€[\d,.\s]+)',
        ]
        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                bid = match.group(1).strip()
                if bid and bid.lower() not in ['huidig bod', 'current bid']:
                    if not bid.startswith('€'):
                        bid = f"€{bid}"
                    return bid
        return "€0"

    def _extract_bid_count(self, content: str) -> int:
        """Extract number of bids"""
        match = re.search(r'(\d+)\s*bids?', content, re.IGNORECASE)
        if match:
            try:
                return int(match.group(1))
            except Exception:
                pass
        return 0

    def _extract_end_date(self, content: str) -> str:
        """Extract auction end date"""
        patterns = [
            r'Ends?[:\s]+([A-Za-z0-9,:\s]+)',
            r'endTime["\']:\s*["\']([^"\']+)["\']',
        ]
        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                return match.group(1).strip()
        return ""

    def _extract_location(self, content: str) -> str:
        """Extract location"""
        patterns = [
            r'(?:Location|Locatie)[:\s]*</?\w*>\s*([A-Za-zÀ-ÿ0-9\s,.-]+?)(?:<|$)',
            r'(?:Location|Locatie)[:\s]+([A-Za-zÀ-ÿ0-9\s,.-]+?)(?:<br|</|$)',
        ]
        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                location = self.clean_text(match.group(1))
                if location.lower() not in ['locatie', 'location', 'huidig bod', 'current bid']:
                    location = re.sub(r'[,.\s]+$', '', location)
                    if len(location) > 2:
                        return location
        return ""

    def _extract_description(self, content: str) -> str:
        """Extract description"""
        pattern = r'<meta[^>]*name=["\']description["\'][^>]*content=["\']([^"\']+)["\']'
        match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
        return self.clean_text(match.group(1))[:500] if match else ""

    def _extract_category(self, content: str) -> str:
        """Extract category from breadcrumb or meta tags"""
        pattern = r'class="breadcrumb[^"]*".*?>([A-Za-z\s]+)</a>'
        match = re.search(pattern, content, re.IGNORECASE)
        if match:
            return self.clean_text(match.group(1))
        return self._extract_meta_content(content, 'category')

    def _extract_images(self, content: str) -> List[str]:
        """Extract image URLs"""
        pattern = r'<img[^>]*src=["\']([^"\']+\.jpe?g|[^"\']+\.png)["\'][^>]*>'
        matches = re.findall(pattern, content, re.IGNORECASE)

        images = []
        for match in matches:
            if any(skip in match.lower() for skip in ['logo', 'icon', 'placeholder', 'banner']):
                continue
            full_url = urljoin(BASE_URL, match)
            images.append(full_url)

        return images[:5]  # Limit to 5 images
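A small usage sketch of DataParser (illustrative, not part of this commit) against a synthetic page carrying a __NEXT_DATA__ blob; the keys mirror what _parse_lot_json reads:

import json

from parse import DataParser

lot = {
    "displayId": "A1-123-45",
    "auctionId": "A1-123",
    "title": "Example lot",
    "currentBid": 150,
    "bidCount": 3,
    "endDate": 1735689600,
    "location": {"city": "Amsterdam", "countryCode": "nl"},
    "description": {"description": "Demo description"},
    "category": {"name": "Machinery"},
    "images": [{"url": "https://example.com/img1.jpg"}],
}
html_page = (
    '<html><body><script id="__NEXT_DATA__" type="application/json">'
    + json.dumps({"props": {"pageProps": {"lot": lot}}})
    + "</script></body></html>"
)

parser = DataParser()
data = parser.parse_page(html_page, "https://www.troostwijkauctions.com/l/example-lot")
print(data["type"], data["current_bid"], data["location"])  # lot €150.00 Amsterdam, NL

If no __NEXT_DATA__ script is present, parse_page falls back to the regex-based meta-tag extractors.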
src/scraper.py (new file, 279 lines)
@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""
Core scraper module for Scaev Auctions
"""
import sqlite3
import asyncio
import time
import random
import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin

from playwright.async_api import async_playwright, Page

from config import (
    BASE_URL, RATE_LIMIT_SECONDS, MAX_PAGES, DOWNLOAD_IMAGES, IMAGES_DIR
)
from cache import CacheManager
from parse import DataParser


class TroostwijkScraper:
    """Main scraper class for Troostwijk Auctions"""

    def __init__(self):
        self.base_url = BASE_URL
        self.cache = CacheManager()
        self.parser = DataParser()
        self.visited_lots: Set[str] = set()
        self.last_request_time = 0
        self.download_images = DOWNLOAD_IMAGES

    async def _download_image(self, url: str, lot_id: str, index: int) -> Optional[str]:
        """Download an image and save it locally"""
        if not self.download_images:
            return None

        try:
            import aiohttp
            lot_dir = Path(IMAGES_DIR) / lot_id
            lot_dir.mkdir(exist_ok=True)

            ext = url.split('.')[-1].split('?')[0]
            if ext not in ['jpg', 'jpeg', 'png', 'gif', 'webp']:
                ext = 'jpg'

            filepath = lot_dir / f"{index:03d}.{ext}"
            if filepath.exists():
                return str(filepath)

            await self._rate_limit()

            async with aiohttp.ClientSession() as session:
                async with session.get(url, timeout=30) as response:
                    if response.status == 200:
                        content = await response.read()
                        with open(filepath, 'wb') as f:
                            f.write(content)

                        with sqlite3.connect(self.cache.db_path) as conn:
                            conn.execute(
                                "UPDATE images "
                                "SET local_path = ?, downloaded = 1 "
                                "WHERE lot_id = ? AND url = ?",
                                (str(filepath), lot_id, url)
                            )
                            conn.commit()
                        return str(filepath)

        except Exception as e:
            print(f"      ERROR downloading image: {e}")
            return None

    async def _rate_limit(self):
        """ENSURE EXACTLY 0.5s BETWEEN REQUESTS"""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time

        if time_since_last < RATE_LIMIT_SECONDS:
            await asyncio.sleep(RATE_LIMIT_SECONDS - time_since_last)

        self.last_request_time = time.time()

    async def _get_page(self, page: Page, url: str, use_cache: bool = True) -> Optional[str]:
        """Get page content with caching and strict rate limiting"""
        if use_cache:
            cached = self.cache.get(url)
            if cached:
                print(f"    CACHE HIT: {url}")
                return cached['content']

        await self._rate_limit()

        try:
            print(f"    FETCHING: {url}")
            await page.goto(url, wait_until='networkidle', timeout=30000)
            await asyncio.sleep(random.uniform(0.3, 0.7))
            content = await page.content()
            self.cache.set(url, content, 200)
            return content

        except Exception as e:
            print(f"    ERROR: {e}")
            self.cache.set(url, "", 500)
            return None

    def _extract_auction_urls_from_listing(self, content: str) -> List[str]:
        """Extract auction URLs from listing page"""
        pattern = r'href=["\']([/]a/[^"\']+)["\']'
        matches = re.findall(pattern, content, re.IGNORECASE)
        return list(set(urljoin(self.base_url, match) for match in matches))

    def _extract_lot_urls_from_auction(self, content: str, auction_url: str) -> List[str]:
        """Extract lot URLs from an auction page"""
        # Try Next.js data first
        try:
            match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
            if match:
                data = json.loads(match.group(1))
                lots = data.get('props', {}).get('pageProps', {}).get('auction', {}).get('lots', [])
                if lots:
                    return list(set(f"{self.base_url}/l/{lot.get('urlSlug', '')}"
                                    for lot in lots if lot.get('urlSlug')))
        except Exception:
            pass

        # Fallback to HTML parsing
        pattern = r'href=["\']([/]l/[^"\']+)["\']'
        matches = re.findall(pattern, content, re.IGNORECASE)
        return list(set(urljoin(self.base_url, match) for match in matches))

    async def crawl_listing_page(self, page: Page, page_num: int) -> List[str]:
        """Crawl a single listing page and return auction URLs"""
        url = f"{self.base_url}/auctions?page={page_num}"
        print(f"\n{'='*60}")
        print(f"LISTING PAGE {page_num}: {url}")
        print(f"{'='*60}")

        content = await self._get_page(page, url)
        if not content:
            return []

        auction_urls = self._extract_auction_urls_from_listing(content)
        print(f"→ Found {len(auction_urls)} auction URLs")
        return auction_urls

    async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]:
        """Crawl an auction page and extract lot URLs"""
        content = await self._get_page(page, auction_url)
        if not content:
            return []

        page_data = self.parser.parse_page(content, auction_url)
        if page_data and page_data.get('type') == 'auction':
            self.cache.save_auction(page_data)
            print(f"    → Auction: {page_data.get('title', '')[:50]}... ({page_data.get('lots_count', 0)} lots)")

        return self._extract_lot_urls_from_auction(content, auction_url)

    async def crawl_page(self, page: Page, url: str) -> Optional[Dict]:
        """Crawl a page (auction or lot)"""
        if url in self.visited_lots:
            print(f"    → Skipping (already visited): {url}")
            return None

        page_id = self.parser.extract_lot_id(url)
        print(f"\n[PAGE {page_id}]")

        content = await self._get_page(page, url)
        if not content:
            return None

        page_data = self.parser.parse_page(content, url)
        if not page_data:
            return None

        self.visited_lots.add(url)

        if page_data.get('type') == 'auction':
            print(f"    → Type: AUCTION")
            print(f"    → Title: {page_data.get('title', 'N/A')[:60]}...")
            print(f"    → Location: {page_data.get('location', 'N/A')}")
            print(f"    → Lots: {page_data.get('lots_count', 0)}")
            self.cache.save_auction(page_data)

        elif page_data.get('type') == 'lot':
            print(f"    → Type: LOT")
            print(f"    → Title: {page_data.get('title', 'N/A')[:60]}...")
            print(f"    → Bid: {page_data.get('current_bid', 'N/A')}")
            print(f"    → Location: {page_data.get('location', 'N/A')}")
            self.cache.save_lot(page_data)

            images = page_data.get('images', [])
            if images:
                self.cache.save_images(page_data['lot_id'], images)
                print(f"    → Images: {len(images)}")

                if self.download_images:
                    for i, img_url in enumerate(images):
                        local_path = await self._download_image(img_url, page_data['lot_id'], i)
                        if local_path:
                            print(f"      ✓ Downloaded: {Path(local_path).name}")

        return page_data

    async def crawl_auctions(self, max_pages: int = MAX_PAGES) -> List[Dict]:
        """Main crawl function"""
        async with async_playwright() as p:
            print("Launching browser...")
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-blink-features=AutomationControlled'
                ]
            )

            page = await browser.new_page(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
            )

            await page.set_extra_http_headers({
                'Accept-Language': 'en-US,en;q=0.9',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
            })

            all_auction_urls = []
            all_lot_urls = []

            # Phase 1: Collect auction URLs
            print("\n" + "=" * 60)
            print("PHASE 1: COLLECTING AUCTION URLs FROM LISTING PAGES")
            print("=" * 60)

            for page_num in range(1, max_pages + 1):
                auction_urls = await self.crawl_listing_page(page, page_num)
                if not auction_urls:
                    print(f"No auctions found on page {page_num}, stopping")
                    break
                all_auction_urls.extend(auction_urls)
                print(f"    → Total auctions collected so far: {len(all_auction_urls)}")

            all_auction_urls = list(set(all_auction_urls))
            print(f"\n{'='*60}")
            print(f"PHASE 1 COMPLETE: {len(all_auction_urls)} UNIQUE AUCTIONS")
            print(f"{'='*60}")

            # Phase 2: Extract lot URLs from each auction
            print("\n" + "=" * 60)
            print("PHASE 2: EXTRACTING LOT URLs FROM AUCTIONS")
            print("=" * 60)

            for i, auction_url in enumerate(all_auction_urls):
                print(f"\n[{i+1:>3}/{len(all_auction_urls)}] {self.parser.extract_lot_id(auction_url)}")
                lot_urls = await self.crawl_auction_for_lots(page, auction_url)
                if lot_urls:
                    all_lot_urls.extend(lot_urls)
                    print(f"    → Found {len(lot_urls)} lots")

            all_lot_urls = list(set(all_lot_urls))
            print(f"\n{'='*60}")
            print(f"PHASE 2 COMPLETE: {len(all_lot_urls)} UNIQUE LOTS")
            print(f"{'='*60}")

            # Phase 3: Scrape each lot page
            print("\n" + "=" * 60)
            print("PHASE 3: SCRAPING INDIVIDUAL LOT PAGES")
            print("=" * 60)

            results = []
            for i, lot_url in enumerate(all_lot_urls):
                print(f"\n[{i+1:>3}/{len(all_lot_urls)}] ", end="")
                page_data = await self.crawl_page(page, lot_url)
                if page_data:
                    results.append(page_data)

            await browser.close()
            return results
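The scraper is normally driven from main.py, but a minimal standalone run (illustrative, not part of this commit; requires Playwright with a Chromium build installed) is just:

import asyncio

from scraper import TroostwijkScraper

scraper = TroostwijkScraper()
results = asyncio.run(scraper.crawl_auctions(max_pages=1))
print(f"Scraped {len(results)} lot pages")

Every page fetch and image download funnels through _rate_limit(), so the RATE_LIMIT_SECONDS gap from config holds across all three phases.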
src/test.py (new file, 142 lines)
@@ -0,0 +1,142 @@
#!/usr/bin/env python3
"""
Test module for debugging extraction patterns
"""

import sys
import sqlite3
import time
import re
import json
from datetime import datetime
from pathlib import Path
from typing import Optional

import config
from cache import CacheManager
from scraper import TroostwijkScraper


def test_extraction(
        test_url: str = "https://www.troostwijkauctions.com/a/machines-en-toebehoren-%28hout-en-kunststofverwerking-handlingapparatuur-bouwmachines-landbouwindustrie%29-oost-europa-december-A7-35847"):
    """Test extraction on a specific cached URL to debug patterns"""
    scraper = TroostwijkScraper()

    # Try to get from cache
    cached = scraper.cache.get(test_url)
    if not cached:
        print(f"ERROR: URL not found in cache: {test_url}")
        print(f"\nAvailable cached URLs:")
        with sqlite3.connect(config.CACHE_DB) as conn:
            cursor = conn.execute("SELECT url FROM cache ORDER BY timestamp DESC LIMIT 10")
            for row in cursor.fetchall():
                print(f"  - {row[0]}")
        return

    content = cached['content']
    print(f"\n{'=' * 60}")
    print(f"TESTING EXTRACTION FROM: {test_url}")
    print(f"{'=' * 60}")
    print(f"Content length: {len(content)} chars")
    print(f"Cache age: {(time.time() - cached['timestamp']) / 3600:.1f} hours")

    # Test each extraction method
    page_data = scraper.parser.parse_page(content, test_url)

    print(f"\n{'=' * 60}")
    print("EXTRACTED DATA:")
    print(f"{'=' * 60}")

    if not page_data:
        print("ERROR: No data extracted!")
        return

    print(f"Page Type: {page_data.get('type', 'UNKNOWN')}")
    print()

    for key, value in page_data.items():
        if key == 'images':
            print(f"{key:.<20}: {len(value)} images")
            for img in value[:3]:
                print(f"{'':.<20}  - {img}")
        elif key == 'lots':
            print(f"{key:.<20}: {len(value)} lots in auction")
        else:
            display_value = str(value)[:100] if value else "(empty)"
            # Handle Unicode characters that Windows console can't display
            try:
                print(f"{key:.<20}: {display_value}")
            except UnicodeEncodeError:
                safe_value = display_value.encode('ascii', 'replace').decode('ascii')
                print(f"{key:.<20}: {safe_value}")

    # Validation checks
    print(f"\n{'=' * 60}")
    print("VALIDATION CHECKS:")
    print(f"{'=' * 60}")

    issues = []

    if page_data.get('type') == 'lot':
        if page_data.get('current_bid') in ['Huidig bod', 'Current bid', '€0', '']:
            issues.append("[!] Current bid not extracted correctly")
        else:
            print("[OK] Current bid looks valid:", page_data.get('current_bid'))

    if page_data.get('location') in ['Locatie', 'Location', '']:
        issues.append("[!] Location not extracted correctly")
    else:
        print("[OK] Location looks valid:", page_data.get('location'))

    if page_data.get('title') in ['', '...']:
        issues.append("[!] Title not extracted correctly")
    else:
        print("[OK] Title looks valid:", page_data.get('title', '')[:50])

    if issues:
        print(f"\n[ISSUES FOUND]")
        for issue in issues:
            print(f"  {issue}")
    else:
        print(f"\n[ALL FIELDS EXTRACTED SUCCESSFULLY!]")

    # Debug: Show raw HTML snippets for problematic fields
    print(f"\n{'=' * 60}")
    print("DEBUG: RAW HTML SNIPPETS")
    print(f"{'=' * 60}")

    # Look for bid-related content
    print(f"\n1. Bid patterns in content:")
    bid_matches = re.findall(r'.{0,50}(€[\d,.\s]+).{0,50}', content[:10000])
    for i, match in enumerate(bid_matches[:5], 1):
        print(f"   {i}. {match}")

    # Look for location content
    print(f"\n2. Location patterns in content:")
    loc_matches = re.findall(r'.{0,30}(Locatie|Location).{0,100}', content, re.IGNORECASE)
    for i, match in enumerate(loc_matches[:5], 1):
        print(f"   {i}. ...{match}...")

    # Look for JSON data
    print(f"\n3. JSON/Script data containing auction info:")
    json_patterns = [
        r'"currentBid"[^,}]+',
        r'"location"[^,}]+',
        r'"price"[^,}]+',
        r'"addressLocality"[^,}]+'
    ]
    for pattern in json_patterns:
        matches = re.findall(pattern, content[:50000], re.IGNORECASE)
        if matches:
            print(f"   {pattern}: {matches[:3]}")

    # Look for script tags with structured data
    script_matches = re.findall(r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', content, re.DOTALL)
    if script_matches:
        print(f"\n4. Structured data (JSON-LD) found:")
        for i, script in enumerate(script_matches[:2], 1):
            try:
                data = json.loads(script)
                print(f"   Script {i}: {json.dumps(data, indent=6)[:500]}...")
            except Exception:
                print(f"   Script {i}: {script[:300]}...")