From 79e14be37a413361a10dc29aa9c6b0ad45be1c52 Mon Sep 17 00:00:00 2001 From: Tour Date: Thu, 4 Dec 2025 14:49:58 +0100 Subject: [PATCH] first --- .aiignore | 12 ++ .gitignore | 176 ++++++++++++++++ .gitmodules | 3 + .python-version | 1 + Dockerfile | 50 +++++ README.md | 85 ++++++++ docker-compose.yml | 22 ++ requirements.txt | 10 + script/migrate_compress_cache.py | 139 +++++++++++++ src/cache.py | 178 ++++++++++++++++ src/config.py | 26 +++ src/main.py | 81 ++++++++ src/parse.py | 303 ++++++++++++++++++++++++++++ src/scraper.py | 279 +++++++++++++++++++++++++ src/test.py | 142 +++++++++++++ test/test_scraper.py | 335 +++++++++++++++++++++++++++++++ wiki/ARCHITECTURE.md | 326 ++++++++++++++++++++++++++++++ wiki/Deployment.md | 122 +++++++++++ wiki/Getting-Started.md | 71 +++++++ wiki/HOLISTIC.md | 107 ++++++++++ wiki/Home.md | 18 ++ wiki/TESTING.md | 279 +++++++++++++++++++++++++ 22 files changed, 2765 insertions(+) create mode 100644 .aiignore create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 .python-version create mode 100644 Dockerfile create mode 100644 README.md create mode 100644 docker-compose.yml create mode 100644 requirements.txt create mode 100644 script/migrate_compress_cache.py create mode 100644 src/cache.py create mode 100644 src/config.py create mode 100644 src/main.py create mode 100644 src/parse.py create mode 100644 src/scraper.py create mode 100644 src/test.py create mode 100644 test/test_scraper.py create mode 100644 wiki/ARCHITECTURE.md create mode 100644 wiki/Deployment.md create mode 100644 wiki/Getting-Started.md create mode 100644 wiki/HOLISTIC.md create mode 100644 wiki/Home.md create mode 100644 wiki/TESTING.md diff --git a/.aiignore b/.aiignore new file mode 100644 index 0000000..71ddf39 --- /dev/null +++ b/.aiignore @@ -0,0 +1,12 @@ +# An .aiignore file follows the same syntax as a .gitignore file. +# .gitignore documentation: https://git-scm.com/docs/gitignore + +# you can ignore files +.DS_Store +*.log +*.tmp + +# or folders +dist/ +build/ +out/ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fdb5356 --- /dev/null +++ b/.gitignore @@ -0,0 +1,176 @@ +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+.idea/ + +# Project specific - Troostwijk Scraper +output/ +*.db +*.csv +*.json +!requirements.txt + +# Playwright +.playwright/ + +# macOS +.DS_Store diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..18ae5f2 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "wiki"] + path = wiki + url = git@git.appmodel.nl:Tour/scaev.wiki.git diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..c8cfe39 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.10 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..3ec2661 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,50 @@ +# Use Python 3.10+ base image +FROM python:3.11-slim + +# Install system dependencies required for Playwright +RUN apt-get update && apt-get install -y \ + wget \ + gnupg \ + ca-certificates \ + fonts-liberation \ + libnss3 \ + libnspr4 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libcups2 \ + libdrm2 \ + libxkbcommon0 \ + libxcomposite1 \ + libxdamage1 \ + libxfixes3 \ + libxrandr2 \ + libgbm1 \ + libasound2 \ + libpango-1.0-0 \ + libcairo2 \ + && rm -rf /var/lib/apt/lists/* + +# Set working directory +WORKDIR /app + +# Copy requirements first for better caching +COPY requirements.txt . + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Install Playwright browsers +RUN playwright install chromium +RUN playwright install-deps chromium + +# Copy the rest of the application +COPY . . + +# Create output directory +RUN mkdir -p output + +# Set Python path to include both project root and src directory +ENV PYTHONPATH=/app:/app/src + +# Run the scraper +CMD ["python", "src/main.py"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..67c4510 --- /dev/null +++ b/README.md @@ -0,0 +1,85 @@ +# Setup & IDE Configuration + +## Python Version Requirement + +This project **requires Python 3.10 or higher**. + +The code uses Python 3.10+ features including: +- Structural pattern matching +- Union type syntax (`X | Y`) +- Improved type hints +- Modern async/await patterns + +## IDE Configuration + +### PyCharm / IntelliJ IDEA + +If your IDE shows "Python 2.7 syntax" warnings, configure it for Python 3.10+: + +1. **File → Project Structure → Project Settings → Project** + - Set Python SDK to 3.10 or higher + +2. **File → Settings → Project → Python Interpreter** + - Select Python 3.10+ interpreter + - Click gear icon → Add → System Interpreter → Browse to your Python 3.10 installation + +3. **File → Settings → Editor → Inspections → Python** + - Ensure "Python version" is set to 3.10+ + - Check "Code compatibility inspection" → Set minimum version to 3.10 + +### VS Code + +Add to `.vscode/settings.json`: +```json +{ + "python.pythonPath": "path/to/python3.10", + "python.analysis.typeCheckingMode": "basic", + "python.languageServer": "Pylance" +} +``` + +## Installation + +```bash +# Check Python version +python --version # Should be 3.10+ + +# Install dependencies +pip install -r requirements.txt + +# Install Playwright browsers +playwright install chromium +``` + +## Verifying Setup + +```bash +# Should print version 3.10.x or higher +python -c "import sys; print(sys.version)" + +# Should run without errors +python main.py --help +``` + +## Common Issues + +### "ModuleNotFoundError: No module named 'playwright'" +```bash +pip install playwright +playwright install chromium +``` + +### "Python 2.7 does not support..." 
warnings in IDE +- Your IDE is configured for Python 2.7 +- Follow IDE configuration steps above +- The code WILL work with Python 3.10+ despite warnings + +### Script exits with "requires Python 3.10 or higher" +- You're running Python 3.9 or older +- Upgrade to Python 3.10+: https://www.python.org/downloads/ + +## Version Files + +- `.python-version` - Used by pyenv and similar tools +- `requirements.txt` - Package dependencies +- Runtime checks in scripts ensure Python 3.10+ diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..02b518a --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,22 @@ +version: '3.8' + +services: + scaev-scraper: + build: + context: . + dockerfile: Dockerfile + container_name: scaev-scraper + volumes: + # Mount output directory to persist results + - ./output:/app/output + # Mount cache database to persist between runs + - ./cache:/app/cache + # environment: + # Configuration via environment variables (optional) + # Uncomment and modify as needed + # RATE_LIMIT_SECONDS: 2 + # MAX_PAGES: 5 + # DOWNLOAD_IMAGES: False + restart: unless-stopped + # Uncomment to run in test mode + # command: python src/main.py --test diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6931009 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +# Scaev Scraper Requirements +# Python 3.10+ required + +# Core dependencies +playwright>=1.40.0 +aiohttp>=3.9.0 # Optional: only needed if DOWNLOAD_IMAGES=True + +# Development/Testing +pytest>=7.4.0 # Optional: for testing +pytest-asyncio>=0.21.0 # Optional: for async tests diff --git a/script/migrate_compress_cache.py b/script/migrate_compress_cache.py new file mode 100644 index 0000000..a494f0b --- /dev/null +++ b/script/migrate_compress_cache.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 +""" +Migrate uncompressed cache entries to compressed format +This script compresses all cache entries where compressed=0 +""" + +import sqlite3 +import zlib +import time + +CACHE_DB = "/mnt/okcomputer/output/cache.db" + +def migrate_cache(): + """Compress all uncompressed cache entries""" + + with sqlite3.connect(CACHE_DB) as conn: + # Get uncompressed entries + cursor = conn.execute( + "SELECT url, content FROM cache WHERE compressed = 0 OR compressed IS NULL" + ) + uncompressed = cursor.fetchall() + + if not uncompressed: + print("✓ No uncompressed entries found. All cache is already compressed!") + return + + print(f"Found {len(uncompressed)} uncompressed cache entries") + print("Starting compression...") + + total_original_size = 0 + total_compressed_size = 0 + compressed_count = 0 + + for url, content in uncompressed: + try: + # Handle both text and bytes + if isinstance(content, str): + content_bytes = content.encode('utf-8') + else: + content_bytes = content + + original_size = len(content_bytes) + + # Compress + compressed_content = zlib.compress(content_bytes, level=9) + compressed_size = len(compressed_content) + + # Update in database + conn.execute( + "UPDATE cache SET content = ?, compressed = 1 WHERE url = ?", + (compressed_content, url) + ) + + total_original_size += original_size + total_compressed_size += compressed_size + compressed_count += 1 + + if compressed_count % 100 == 0: + conn.commit() + ratio = (1 - total_compressed_size / total_original_size) * 100 + print(f" Compressed {compressed_count}/{len(uncompressed)} entries... 
" + f"({ratio:.1f}% reduction so far)") + + except Exception as e: + print(f" ERROR compressing {url}: {e}") + continue + + # Final commit + conn.commit() + + # Calculate final statistics + ratio = (1 - total_compressed_size / total_original_size) * 100 if total_original_size > 0 else 0 + size_saved_mb = (total_original_size - total_compressed_size) / (1024 * 1024) + + print("\n" + "="*60) + print("MIGRATION COMPLETE") + print("="*60) + print(f"Entries compressed: {compressed_count}") + print(f"Original size: {total_original_size / (1024*1024):.2f} MB") + print(f"Compressed size: {total_compressed_size / (1024*1024):.2f} MB") + print(f"Space saved: {size_saved_mb:.2f} MB") + print(f"Compression ratio: {ratio:.1f}%") + print("="*60) + +def verify_migration(): + """Verify all entries are compressed""" + with sqlite3.connect(CACHE_DB) as conn: + cursor = conn.execute( + "SELECT COUNT(*) FROM cache WHERE compressed = 0 OR compressed IS NULL" + ) + uncompressed_count = cursor.fetchone()[0] + + cursor = conn.execute("SELECT COUNT(*) FROM cache WHERE compressed = 1") + compressed_count = cursor.fetchone()[0] + + print("\nVERIFICATION:") + print(f" Compressed entries: {compressed_count}") + print(f" Uncompressed entries: {uncompressed_count}") + + if uncompressed_count == 0: + print(" ✓ All cache entries are compressed!") + return True + else: + print(" ✗ Some entries are still uncompressed") + return False + +def get_db_size(): + """Get current database file size""" + import os + if os.path.exists(CACHE_DB): + size_mb = os.path.getsize(CACHE_DB) / (1024 * 1024) + return size_mb + return 0 + +if __name__ == "__main__": + print("Cache Compression Migration Tool") + print("="*60) + + # Show initial DB size + initial_size = get_db_size() + print(f"Initial database size: {initial_size:.2f} MB\n") + + # Run migration + start_time = time.time() + migrate_cache() + elapsed = time.time() - start_time + + print(f"\nTime taken: {elapsed:.2f} seconds") + + # Verify + verify_migration() + + # Show final DB size + final_size = get_db_size() + print(f"\nFinal database size: {final_size:.2f} MB") + print(f"Database size reduced by: {initial_size - final_size:.2f} MB") + + print("\n✓ Migration complete! 
You can now run VACUUM to reclaim disk space:") + print(" sqlite3 /mnt/okcomputer/output/cache.db 'VACUUM;'") diff --git a/src/cache.py b/src/cache.py new file mode 100644 index 0000000..948311d --- /dev/null +++ b/src/cache.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +""" +Cache Manager module for SQLite-based caching and data storage +""" + +import sqlite3 +import time +import zlib +from datetime import datetime +from typing import Dict, List, Optional + +import config + +class CacheManager: + """Manages page caching and data storage using SQLite""" + + def __init__(self, db_path: str = None): + self.db_path = db_path or config.CACHE_DB + self._init_db() + + def _init_db(self): + """Initialize cache and data storage database""" + with sqlite3.connect(self.db_path) as conn: + conn.execute(""" + CREATE TABLE IF NOT EXISTS cache ( + url TEXT PRIMARY KEY, + content BLOB, + timestamp REAL, + status_code INTEGER + ) + """) + conn.execute(""" + CREATE INDEX IF NOT EXISTS idx_timestamp ON cache(timestamp) + """) + conn.execute(""" + CREATE TABLE IF NOT EXISTS auctions ( + auction_id TEXT PRIMARY KEY, + url TEXT UNIQUE, + title TEXT, + location TEXT, + lots_count INTEGER, + first_lot_closing_time TEXT, + scraped_at TEXT + ) + """) + conn.execute(""" + CREATE TABLE IF NOT EXISTS lots ( + lot_id TEXT PRIMARY KEY, + auction_id TEXT, + url TEXT UNIQUE, + title TEXT, + current_bid TEXT, + bid_count INTEGER, + closing_time TEXT, + viewing_time TEXT, + pickup_date TEXT, + location TEXT, + description TEXT, + category TEXT, + scraped_at TEXT, + FOREIGN KEY (auction_id) REFERENCES auctions(auction_id) + ) + """) + conn.execute(""" + CREATE TABLE IF NOT EXISTS images ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + lot_id TEXT, + url TEXT, + local_path TEXT, + downloaded INTEGER DEFAULT 0, + FOREIGN KEY (lot_id) REFERENCES lots(lot_id) + ) + """) + conn.commit() + + def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]: + """Get cached page if it exists and is not too old""" + with sqlite3.connect(self.db_path) as conn: + cursor = conn.execute( + "SELECT content, timestamp, status_code FROM cache WHERE url = ?", + (url,) + ) + row = cursor.fetchone() + + if row: + content, timestamp, status_code = row + age_hours = (time.time() - timestamp) / 3600 + + if age_hours <= max_age_hours: + try: + content = zlib.decompress(content).decode('utf-8') + except Exception as e: + print(f" ⚠️ Failed to decompress cache for {url}: {e}") + return None + + return { + 'content': content, + 'timestamp': timestamp, + 'status_code': status_code, + 'cached': True + } + return None + + def set(self, url: str, content: str, status_code: int = 200): + """Cache a page with compression""" + with sqlite3.connect(self.db_path) as conn: + compressed_content = zlib.compress(content.encode('utf-8'), level=9) + original_size = len(content.encode('utf-8')) + compressed_size = len(compressed_content) + ratio = (1 - compressed_size / original_size) * 100 if original_size > 0 else 0 + + conn.execute( + "INSERT OR REPLACE INTO cache (url, content, timestamp, status_code) VALUES (?, ?, ?, ?)", + (url, compressed_content, time.time(), status_code) + ) + conn.commit() + print(f" → Cached: {url} (compressed {ratio:.1f}%)") + + def clear_old(self, max_age_hours: int = 168): + """Clear old cache entries to prevent database bloat""" + cutoff_time = time.time() - (max_age_hours * 3600) + with sqlite3.connect(self.db_path) as conn: + deleted = conn.execute("DELETE FROM cache WHERE timestamp < ?", (cutoff_time,)).rowcount + conn.commit() + if 
deleted > 0: + print(f" → Cleared {deleted} old cache entries") + + def save_auction(self, auction_data: Dict): + """Save auction data to database""" + with sqlite3.connect(self.db_path) as conn: + conn.execute(""" + INSERT OR REPLACE INTO auctions + (auction_id, url, title, location, lots_count, first_lot_closing_time, scraped_at) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, ( + auction_data['auction_id'], + auction_data['url'], + auction_data['title'], + auction_data['location'], + auction_data.get('lots_count', 0), + auction_data.get('first_lot_closing_time', ''), + auction_data['scraped_at'] + )) + conn.commit() + + def save_lot(self, lot_data: Dict): + """Save lot data to database""" + with sqlite3.connect(self.db_path) as conn: + conn.execute(""" + INSERT OR REPLACE INTO lots + (lot_id, auction_id, url, title, current_bid, bid_count, closing_time, + viewing_time, pickup_date, location, description, category, scraped_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, ( + lot_data['lot_id'], + lot_data.get('auction_id', ''), + lot_data['url'], + lot_data['title'], + lot_data.get('current_bid', ''), + lot_data.get('bid_count', 0), + lot_data.get('closing_time', ''), + lot_data.get('viewing_time', ''), + lot_data.get('pickup_date', ''), + lot_data.get('location', ''), + lot_data.get('description', ''), + lot_data.get('category', ''), + lot_data['scraped_at'] + )) + conn.commit() + + def save_images(self, lot_id: str, image_urls: List[str]): + """Save image URLs for a lot""" + with sqlite3.connect(self.db_path) as conn: + for url in image_urls: + conn.execute(""" + INSERT OR IGNORE INTO images (lot_id, url) VALUES (?, ?) + """, (lot_id, url)) + conn.commit() \ No newline at end of file diff --git a/src/config.py b/src/config.py new file mode 100644 index 0000000..07c4786 --- /dev/null +++ b/src/config.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +""" +Configuration module for Scaev Auctions Scraper +""" + +import sys +from pathlib import Path + +# Require Python 3.10+ +if sys.version_info < (3, 10): + print("ERROR: This script requires Python 3.10 or higher") + print(f"Current version: {sys.version}") + sys.exit(1) + +# ==================== CONFIGURATION ==================== +BASE_URL = "https://www.troostwijkauctions.com" +CACHE_DB = "/mnt/okcomputer/output/cache.db" +OUTPUT_DIR = "/mnt/okcomputer/output" +IMAGES_DIR = "/mnt/okcomputer/output/images" +RATE_LIMIT_SECONDS = 0.5 # EXACTLY 0.5 seconds between requests +MAX_PAGES = 50 # Number of listing pages to crawl +DOWNLOAD_IMAGES = False # Set to True to download images + +# Setup directories +Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True) +Path(IMAGES_DIR).mkdir(parents=True, exist_ok=True) \ No newline at end of file diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..1eb53ac --- /dev/null +++ b/src/main.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +""" +Troostwijk Auctions Scraper - Main Entry Point +Focuses on extracting auction lots with caching and rate limiting +""" + +import sys +import asyncio +import json +import csv +import sqlite3 +from datetime import datetime +from pathlib import Path + +import config +from cache import CacheManager +from scraper import TroostwijkScraper + +def main(): + """Main execution""" + # Check for test mode + if len(sys.argv) > 1 and sys.argv[1] == "--test": + # Import test function only when needed to avoid circular imports + from test import test_extraction + test_url = sys.argv[2] if len(sys.argv) > 2 else None + if test_url: + test_extraction(test_url) + else: + 
test_extraction() + return + + print("Troostwijk Auctions Scraper") + print("=" * 60) + print(f"Rate limit: {config.RATE_LIMIT_SECONDS} seconds BETWEEN EVERY REQUEST") + print(f"Cache database: {config.CACHE_DB}") + print(f"Output directory: {config.OUTPUT_DIR}") + print(f"Max listing pages: {config.MAX_PAGES}") + print("=" * 60) + + scraper = TroostwijkScraper() + + try: + # Clear old cache (older than 7 days) - KEEP DATABASE CLEAN + scraper.cache.clear_old(max_age_hours=168) + + # Run the crawler + results = asyncio.run(scraper.crawl_auctions(max_pages=config.MAX_PAGES)) + + # Export results to files + print("\n" + "="*60) + print("EXPORTING RESULTS TO FILES") + print("="*60) + + files = scraper.export_to_files() + + print("\n" + "="*60) + print("CRAWLING COMPLETED SUCCESSFULLY") + print("="*60) + print(f"Total pages scraped: {len(results)}") + print(f"\nAuctions JSON: {files['auctions_json']}") + print(f"Auctions CSV: {files['auctions_csv']}") + print(f"Lots JSON: {files['lots_json']}") + print(f"Lots CSV: {files['lots_csv']}") + + # Count auctions vs lots + auctions = [r for r in results if r.get('type') == 'auction'] + lots = [r for r in results if r.get('type') == 'lot'] + print(f"\n Auctions: {len(auctions)}") + print(f" Lots: {len(lots)}") + + except KeyboardInterrupt: + print("\nScraping interrupted by user - partial results saved in output directory") + except Exception as e: + print(f"\nERROR during scraping: {e}") + import traceback + traceback.print_exc() + +if __name__ == "__main__": + from cache import CacheManager + from scraper import TroostwijkScraper + main() \ No newline at end of file diff --git a/src/parse.py b/src/parse.py new file mode 100644 index 0000000..5b3e83c --- /dev/null +++ b/src/parse.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python3 +""" +Parser module for extracting data from HTML/JSON content +""" +import json +import re +import html +from datetime import datetime +from urllib.parse import urljoin, urlparse +from typing import Dict, List, Optional + +from config import BASE_URL + + +class DataParser: + """Handles all data extraction from HTML/JSON content""" + + @staticmethod + def extract_lot_id(url: str) -> str: + """Extract lot ID from URL""" + path = urlparse(url).path + match = re.search(r'/lots/(\d+)', path) + if match: + return match.group(1) + match = re.search(r'/a/.*?([A-Z]\d+-\d+)', path) + if match: + return match.group(1) + return path.split('/')[-1] if path else "" + + @staticmethod + def clean_text(text: str) -> str: + """Clean extracted text""" + text = html.unescape(text) + text = re.sub(r'\s+', ' ', text) + return text.strip() + + @staticmethod + def format_timestamp(timestamp) -> str: + """Convert Unix timestamp to readable date""" + try: + if isinstance(timestamp, (int, float)) and timestamp > 0: + return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S') + return str(timestamp) if timestamp else '' + except: + return str(timestamp) if timestamp else '' + + @staticmethod + def format_currency(amount) -> str: + """Format currency amount""" + if isinstance(amount, (int, float)): + return f"€{amount:,.2f}" if amount > 0 else "€0" + return str(amount) if amount else "€0" + + def parse_page(self, content: str, url: str) -> Optional[Dict]: + """Parse page and determine if it's an auction or lot""" + next_data = self._extract_nextjs_data(content, url) + if next_data: + return next_data + + content = re.sub(r'\s+', ' ', content) + return { + 'type': 'lot', + 'url': url, + 'lot_id': self.extract_lot_id(url), + 'title': 
self._extract_meta_content(content, 'og:title'), + 'current_bid': self._extract_current_bid(content), + 'bid_count': self._extract_bid_count(content), + 'closing_time': self._extract_end_date(content), + 'location': self._extract_location(content), + 'description': self._extract_description(content), + 'category': self._extract_category(content), + 'images': self._extract_images(content), + 'scraped_at': datetime.now().isoformat() + } + + def _extract_nextjs_data(self, content: str, url: str) -> Optional[Dict]: + """Extract data from Next.js __NEXT_DATA__ JSON""" + try: + match = re.search(r']*id="__NEXT_DATA__"[^>]*>(.+?)', content, re.DOTALL) + if not match: + return None + + data = json.loads(match.group(1)) + page_props = data.get('props', {}).get('pageProps', {}) + + if 'lot' in page_props: + return self._parse_lot_json(page_props.get('lot', {}), url) + if 'auction' in page_props: + return self._parse_auction_json(page_props.get('auction', {}), url) + return None + + except Exception as e: + print(f" → Error parsing __NEXT_DATA__: {e}") + return None + + def _parse_lot_json(self, lot_data: Dict, url: str) -> Dict: + """Parse lot data from JSON""" + location_data = lot_data.get('location', {}) + city = location_data.get('city', '') + country = location_data.get('countryCode', '').upper() + location = f"{city}, {country}" if city and country else (city or country) + + current_bid = lot_data.get('currentBid') or lot_data.get('highestBid') or lot_data.get('startingBid') + if current_bid is None or current_bid == 0: + bidding = lot_data.get('bidding', {}) + current_bid = bidding.get('currentBid') or bidding.get('amount') + + current_bid_str = self.format_currency(current_bid) if current_bid and current_bid > 0 else "No bids" + + bid_count = lot_data.get('bidCount', 0) + if bid_count == 0: + bid_count = lot_data.get('bidding', {}).get('bidCount', 0) + + description = lot_data.get('description', {}) + if isinstance(description, dict): + description = description.get('description', '') + else: + description = str(description) + + category = lot_data.get('category', {}) + category_name = category.get('name', '') if isinstance(category, dict) else '' + + return { + 'type': 'lot', + 'lot_id': lot_data.get('displayId', ''), + 'auction_id': lot_data.get('auctionId', ''), + 'url': url, + 'title': lot_data.get('title', ''), + 'current_bid': current_bid_str, + 'bid_count': bid_count, + 'closing_time': self.format_timestamp(lot_data.get('endDate', '')), + 'viewing_time': self._extract_viewing_time(lot_data), + 'pickup_date': self._extract_pickup_date(lot_data), + 'location': location, + 'description': description, + 'category': category_name, + 'images': self._extract_images_from_json(lot_data), + 'scraped_at': datetime.now().isoformat() + } + + def _parse_auction_json(self, auction_data: Dict, url: str) -> Dict: + """Parse auction data from JSON""" + is_auction = 'lots' in auction_data and isinstance(auction_data['lots'], list) + is_lot = 'lotNumber' in auction_data or 'currentBid' in auction_data + + if is_auction: + lots = auction_data.get('lots', []) + first_lot_closing = None + if lots: + first_lot_closing = self.format_timestamp(lots[0].get('endDate', '')) + + return { + 'type': 'auction', + 'auction_id': auction_data.get('displayId', ''), + 'url': url, + 'title': auction_data.get('name', ''), + 'location': self._extract_location_from_json(auction_data), + 'lots_count': len(lots), + 'first_lot_closing_time': first_lot_closing or self.format_timestamp(auction_data.get('minEndDate', '')), + 
'scraped_at': datetime.now().isoformat(), + 'lots': lots + } + elif is_lot: + return self._parse_lot_json(auction_data, url) + return None + + def _extract_viewing_time(self, auction_data: Dict) -> str: + """Extract viewing time from auction data""" + viewing_days = auction_data.get('viewingDays', []) + if viewing_days: + first = viewing_days[0] + start = self.format_timestamp(first.get('startDate', '')) + end = self.format_timestamp(first.get('endDate', '')) + if start and end: + return f"{start} - {end}" + return start or end + return '' + + def _extract_pickup_date(self, auction_data: Dict) -> str: + """Extract pickup date from auction data""" + collection_days = auction_data.get('collectionDays', []) + if collection_days: + first = collection_days[0] + start = self.format_timestamp(first.get('startDate', '')) + end = self.format_timestamp(first.get('endDate', '')) + if start and end: + return f"{start} - {end}" + return start or end + return '' + + def _extract_images_from_json(self, auction_data: Dict) -> List[str]: + """Extract all image URLs from auction data""" + images = [] + if auction_data.get('image', {}).get('url'): + images.append(auction_data['image']['url']) + if isinstance(auction_data.get('images'), list): + for img in auction_data['images']: + if isinstance(img, dict) and img.get('url'): + images.append(img['url']) + elif isinstance(img, str): + images.append(img) + return images + + def _extract_location_from_json(self, auction_data: Dict) -> str: + """Extract location from auction JSON data""" + for days in [auction_data.get('viewingDays', []), auction_data.get('collectionDays', [])]: + if days: + first_location = days[0] + city = first_location.get('city', '') + country = first_location.get('countryCode', '').upper() + if city: + return f"{city}, {country}" if country else city + return '' + + def _extract_meta_content(self, content: str, property_name: str) -> str: + """Extract content from meta tags""" + pattern = rf']*property=["\']{property_name}["\'][^>]*content=["\']([^"\']+)["\']' + match = re.search(pattern, content, re.IGNORECASE) + return self.clean_text(match.group(1)) if match else "" + + def _extract_current_bid(self, content: str) -> str: + """Extract current bid amount""" + patterns = [ + r'"currentBid"\s*:\s*"([^"]+)"', + r'"currentBid"\s*:\s*(\d+(?:\.\d+)?)', + r'(?:Current bid|Huidig bod)[:\s]*\s*(€[\d,.\s]+)', + r'(?:Current bid|Huidig bod)[:\s]+(€[\d,.\s]+)', + ] + for pattern in patterns: + match = re.search(pattern, content, re.IGNORECASE) + if match: + bid = match.group(1).strip() + if bid and bid.lower() not in ['huidig bod', 'current bid']: + if not bid.startswith('€'): + bid = f"€{bid}" + return bid + return "€0" + + def _extract_bid_count(self, content: str) -> int: + """Extract number of bids""" + match = re.search(r'(\d+)\s*bids?', content, re.IGNORECASE) + if match: + try: + return int(match.group(1)) + except: + pass + return 0 + + def _extract_end_date(self, content: str) -> str: + """Extract auction end date""" + patterns = [ + r'Ends?[:\s]+([A-Za-z0-9,:\s]+)', + r'endTime["\']:\s*["\']([^"\']+)["\']', + ] + for pattern in patterns: + match = re.search(pattern, content, re.IGNORECASE) + if match: + return match.group(1).strip() + return "" + + def _extract_location(self, content: str) -> str: + """Extract location""" + patterns = [ + r'(?:Location|Locatie)[:\s]*\s*([A-Za-zÀ-ÿ0-9\s,.-]+?)(?:<|$)', + r'(?:Location|Locatie)[:\s]+([A-Za-zÀ-ÿ0-9\s,.-]+?)(?: 2: + return location + return "" + + def _extract_description(self, content: str) -> 
str: + """Extract description""" + pattern = r']*name=["\']description["\'][^>]*content=["\']([^"\']+)["\']' + match = re.search(pattern, content, re.IGNORECASE | re.DOTALL) + return self.clean_text(match.group(1))[:500] if match else "" + + def _extract_category(self, content: str) -> str: + """Extract category from breadcrumb or meta tags""" + pattern = r'class="breadcrumb[^"]*".*?>([A-Za-z\s]+)' + match = re.search(pattern, content, re.IGNORECASE) + if match: + return self.clean_text(match.group(1)) + return self._extract_meta_content(content, 'category') + + def _extract_images(self, content: str) -> List[str]: + """Extract image URLs""" + pattern = r']*src=["\']([^"\']+\.jpe?g|[^"\']+\.png)["\'][^>]*>' + matches = re.findall(pattern, content, re.IGNORECASE) + + images = [] + for match in matches: + if any(skip in match.lower() for skip in ['logo', 'icon', 'placeholder', 'banner']): + continue + full_url = urljoin(BASE_URL, match) + images.append(full_url) + + return images[:5] # Limit to 5 images \ No newline at end of file diff --git a/src/scraper.py b/src/scraper.py new file mode 100644 index 0000000..af39ba2 --- /dev/null +++ b/src/scraper.py @@ -0,0 +1,279 @@ +#!/usr/bin/env python3 +""" +Core scraper module for Scaev Auctions +""" +import sqlite3 +import asyncio +import time +import random +import json +import re +from pathlib import Path +from typing import Dict, List, Optional, Set +from urllib.parse import urljoin + +from playwright.async_api import async_playwright, Page + +from config import ( + BASE_URL, RATE_LIMIT_SECONDS, MAX_PAGES, DOWNLOAD_IMAGES, IMAGES_DIR +) +from cache import CacheManager +from parse import DataParser + +class TroostwijkScraper: + """Main scraper class for Troostwijk Auctions""" + + def __init__(self): + self.base_url = BASE_URL + self.cache = CacheManager() + self.parser = DataParser() + self.visited_lots: Set[str] = set() + self.last_request_time = 0 + self.download_images = DOWNLOAD_IMAGES + + async def _download_image(self, url: str, lot_id: str, index: int) -> Optional[str]: + """Download an image and save it locally""" + if not self.download_images: + return None + + try: + import aiohttp + lot_dir = Path(IMAGES_DIR) / lot_id + lot_dir.mkdir(exist_ok=True) + + ext = url.split('.')[-1].split('?')[0] + if ext not in ['jpg', 'jpeg', 'png', 'gif', 'webp']: + ext = 'jpg' + + filepath = lot_dir / f"{index:03d}.{ext}" + if filepath.exists(): + return str(filepath) + + await self._rate_limit() + + async with aiohttp.ClientSession() as session: + async with session.get(url, timeout=30) as response: + if response.status == 200: + content = await response.read() + with open(filepath, 'wb') as f: + f.write(content) + + with sqlite3.connect(self.cache.db_path) as conn: + conn.execute("UPDATE images\n" + "SET local_path = ?, downloaded = 1\n" + "WHERE lot_id = ? 
AND url = ?\n" + "", (str(filepath), lot_id, url)) + conn.commit() + return str(filepath) + + except Exception as e: + print(f" ERROR downloading image: {e}") + return None + + async def _rate_limit(self): + """ENSURE EXACTLY 0.5s BETWEEN REQUESTS""" + current_time = time.time() + time_since_last = current_time - self.last_request_time + + if time_since_last < RATE_LIMIT_SECONDS: + await asyncio.sleep(RATE_LIMIT_SECONDS - time_since_last) + + self.last_request_time = time.time() + + async def _get_page(self, page: Page, url: str, use_cache: bool = True) -> Optional[str]: + """Get page content with caching and strict rate limiting""" + if use_cache: + cached = self.cache.get(url) + if cached: + print(f" CACHE HIT: {url}") + return cached['content'] + + await self._rate_limit() + + try: + print(f" FETCHING: {url}") + await page.goto(url, wait_until='networkidle', timeout=30000) + await asyncio.sleep(random.uniform(0.3, 0.7)) + content = await page.content() + self.cache.set(url, content, 200) + return content + + except Exception as e: + print(f" ERROR: {e}") + self.cache.set(url, "", 500) + return None + + def _extract_auction_urls_from_listing(self, content: str) -> List[str]: + """Extract auction URLs from listing page""" + pattern = r'href=["\']([/]a/[^"\']+)["\']' + matches = re.findall(pattern, content, re.IGNORECASE) + return list(set(urljoin(self.base_url, match) for match in matches)) + + def _extract_lot_urls_from_auction(self, content: str, auction_url: str) -> List[str]: + """Extract lot URLs from an auction page""" + # Try Next.js data first + try: + match = re.search(r']*id="__NEXT_DATA__"[^>]*>(.+?)', content, re.DOTALL) + if match: + data = json.loads(match.group(1)) + lots = data.get('props', {}).get('pageProps', {}).get('auction', {}).get('lots', []) + if lots: + return list(set(f"{self.base_url}/l/{lot.get('urlSlug', '')}" + for lot in lots if lot.get('urlSlug'))) + except: + pass + + # Fallback to HTML parsing + pattern = r'href=["\']([/]l/[^"\']+)["\']' + matches = re.findall(pattern, content, re.IGNORECASE) + return list(set(urljoin(self.base_url, match) for match in matches)) + + async def crawl_listing_page(self, page: Page, page_num: int) -> List[str]: + """Crawl a single listing page and return auction URLs""" + url = f"{self.base_url}/auctions?page={page_num}" + print(f"\n{'='*60}") + print(f"LISTING PAGE {page_num}: {url}") + print(f"{'='*60}") + + content = await self._get_page(page, url) + if not content: + return [] + + auction_urls = self._extract_auction_urls_from_listing(content) + print(f"→ Found {len(auction_urls)} auction URLs") + return auction_urls + + async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]: + """Crawl an auction page and extract lot URLs""" + content = await self._get_page(page, auction_url) + if not content: + return [] + + page_data = self.parser.parse_page(content, auction_url) + if page_data and page_data.get('type') == 'auction': + self.cache.save_auction(page_data) + print(f" → Auction: {page_data.get('title', '')[:50]}... 
({page_data.get('lots_count', 0)} lots)") + + return self._extract_lot_urls_from_auction(content, auction_url) + + async def crawl_page(self, page: Page, url: str) -> Optional[Dict]: + """Crawl a page (auction or lot)""" + if url in self.visited_lots: + print(f" → Skipping (already visited): {url}") + return None + + page_id = self.parser.extract_lot_id(url) + print(f"\n[PAGE {page_id}]") + + content = await self._get_page(page, url) + if not content: + return None + + page_data = self.parser.parse_page(content, url) + if not page_data: + return None + + self.visited_lots.add(url) + + if page_data.get('type') == 'auction': + print(f" → Type: AUCTION") + print(f" → Title: {page_data.get('title', 'N/A')[:60]}...") + print(f" → Location: {page_data.get('location', 'N/A')}") + print(f" → Lots: {page_data.get('lots_count', 0)}") + self.cache.save_auction(page_data) + + elif page_data.get('type') == 'lot': + print(f" → Type: LOT") + print(f" → Title: {page_data.get('title', 'N/A')[:60]}...") + print(f" → Bid: {page_data.get('current_bid', 'N/A')}") + print(f" → Location: {page_data.get('location', 'N/A')}") + self.cache.save_lot(page_data) + + images = page_data.get('images', []) + if images: + self.cache.save_images(page_data['lot_id'], images) + print(f" → Images: {len(images)}") + + if self.download_images: + for i, img_url in enumerate(images): + local_path = await self._download_image(img_url, page_data['lot_id'], i) + if local_path: + print(f" ✓ Downloaded: {Path(local_path).name}") + + return page_data + + async def crawl_auctions(self, max_pages: int = MAX_PAGES) -> List[Dict]: + """Main crawl function""" + async with async_playwright() as p: + print("Launching browser...") + browser = await p.chromium.launch( + headless=True, + args=[ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-blink-features=AutomationControlled' + ] + ) + + page = await browser.new_page( + viewport={'width': 1920, 'height': 1080}, + user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' + ) + + await page.set_extra_http_headers({ + 'Accept-Language': 'en-US,en;q=0.9', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' + }) + + all_auction_urls = [] + all_lot_urls = [] + + # Phase 1: Collect auction URLs + print("\n" + "="*60) + print("PHASE 1: COLLECTING AUCTION URLs FROM LISTING PAGES") + print("="*60) + + for page_num in range(1, max_pages + 1): + auction_urls = await self.crawl_listing_page(page, page_num) + if not auction_urls: + print(f"No auctions found on page {page_num}, stopping") + break + all_auction_urls.extend(auction_urls) + print(f" → Total auctions collected so far: {len(all_auction_urls)}") + + all_auction_urls = list(set(all_auction_urls)) + print(f"\n{'='*60}") + print(f"PHASE 1 COMPLETE: {len(all_auction_urls)} UNIQUE AUCTIONS") + print(f"{'='*60}") + + # Phase 2: Extract lot URLs from each auction + print("\n" + "="*60) + print("PHASE 2: EXTRACTING LOT URLs FROM AUCTIONS") + print("="*60) + + for i, auction_url in enumerate(all_auction_urls): + print(f"\n[{i+1:>3}/{len(all_auction_urls)}] {self.parser.extract_lot_id(auction_url)}") + lot_urls = await self.crawl_auction_for_lots(page, auction_url) + if lot_urls: + all_lot_urls.extend(lot_urls) + print(f" → Found {len(lot_urls)} lots") + + all_lot_urls = list(set(all_lot_urls)) + print(f"\n{'='*60}") + print(f"PHASE 2 COMPLETE: {len(all_lot_urls)} UNIQUE LOTS") + print(f"{'='*60}") + + # Phase 3: Scrape each lot page + 
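+            # Note: each lot below flows through crawl_page() -> _get_page(), so cached HTML is reused when fresh and a cache miss waits out the RATE_LIMIT_SECONDS delay before the Playwright fetch.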
print("\n" + "="*60) + print("PHASE 3: SCRAPING INDIVIDUAL LOT PAGES") + print("="*60) + + results = [] + for i, lot_url in enumerate(all_lot_urls): + print(f"\n[{i+1:>3}/{len(all_lot_urls)}] ", end="") + page_data = await self.crawl_page(page, lot_url) + if page_data: + results.append(page_data) + + await browser.close() + return results \ No newline at end of file diff --git a/src/test.py b/src/test.py new file mode 100644 index 0000000..bbb813c --- /dev/null +++ b/src/test.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +""" +Test module for debugging extraction patterns +""" + +import sys +import sqlite3 +import time +import re +import json +from datetime import datetime +from pathlib import Path +from typing import Optional + +import config +from cache import CacheManager +from scraper import TroostwijkScraper + + +def test_extraction( + test_url: str = "https://www.troostwijkauctions.com/a/machines-en-toebehoren-%28hout-en-kunststofverwerking-handlingapparatuur-bouwmachines-landbouwindustrie%29-oost-europa-december-A7-35847"): + """Test extraction on a specific cached URL to debug patterns""" + scraper = TroostwijkScraper() + + # Try to get from cache + cached = scraper.cache.get(test_url) + if not cached: + print(f"ERROR: URL not found in cache: {test_url}") + print(f"\nAvailable cached URLs:") + with sqlite3.connect(config.CACHE_DB) as conn: + cursor = conn.execute("SELECT url FROM cache ORDER BY timestamp DESC LIMIT 10") + for row in cursor.fetchall(): + print(f" - {row[0]}") + return + + content = cached['content'] + print(f"\n{'=' * 60}") + print(f"TESTING EXTRACTION FROM: {test_url}") + print(f"{'=' * 60}") + print(f"Content length: {len(content)} chars") + print(f"Cache age: {(time.time() - cached['timestamp']) / 3600:.1f} hours") + + # Test each extraction method + page_data = scraper._parse_page(content, test_url) + + print(f"\n{'=' * 60}") + print("EXTRACTED DATA:") + print(f"{'=' * 60}") + + if not page_data: + print("ERROR: No data extracted!") + return + + print(f"Page Type: {page_data.get('type', 'UNKNOWN')}") + print() + + for key, value in page_data.items(): + if key == 'images': + print(f"{key:.<20}: {len(value)} images") + for img in value[:3]: + print(f"{'':.<20} - {img}") + elif key == 'lots': + print(f"{key:.<20}: {len(value)} lots in auction") + else: + display_value = str(value)[:100] if value else "(empty)" + # Handle Unicode characters that Windows console can't display + try: + print(f"{key:.<20}: {display_value}") + except UnicodeEncodeError: + safe_value = display_value.encode('ascii', 'replace').decode('ascii') + print(f"{key:.<20}: {safe_value}") + + # Validation checks + print(f"\n{'=' * 60}") + print("VALIDATION CHECKS:") + print(f"{'=' * 60}") + + issues = [] + + if page_data.get('type') == 'lot': + if page_data.get('current_bid') in ['Huidig bod', 'Current bid', '€0', '']: + issues.append("[!] Current bid not extracted correctly") + else: + print("[OK] Current bid looks valid:", page_data.get('current_bid')) + + if page_data.get('location') in ['Locatie', 'Location', '']: + issues.append("[!] Location not extracted correctly") + else: + print("[OK] Location looks valid:", page_data.get('location')) + + if page_data.get('title') in ['', '...']: + issues.append("[!] 
Title not extracted correctly") + else: + print("[OK] Title looks valid:", page_data.get('title', '')[:50]) + + if issues: + print(f"\n[ISSUES FOUND]") + for issue in issues: + print(f" {issue}") + else: + print(f"\n[ALL FIELDS EXTRACTED SUCCESSFULLY!]") + + # Debug: Show raw HTML snippets for problematic fields + print(f"\n{'=' * 60}") + print("DEBUG: RAW HTML SNIPPETS") + print(f"{'=' * 60}") + + # Look for bid-related content + print(f"\n1. Bid patterns in content:") + bid_matches = re.findall(r'.{0,50}(€[\d,.\s]+).{0,50}', content[:10000]) + for i, match in enumerate(bid_matches[:5], 1): + print(f" {i}. {match}") + + # Look for location content + print(f"\n2. Location patterns in content:") + loc_matches = re.findall(r'.{0,30}(Locatie|Location).{0,100}', content, re.IGNORECASE) + for i, match in enumerate(loc_matches[:5], 1): + print(f" {i}. ...{match}...") + + # Look for JSON data + print(f"\n3. JSON/Script data containing auction info:") + json_patterns = [ + r'"currentBid"[^,}]+', + r'"location"[^,}]+', + r'"price"[^,}]+', + r'"addressLocality"[^,}]+' + ] + for pattern in json_patterns: + matches = re.findall(pattern, content[:50000], re.IGNORECASE) + if matches: + print(f" {pattern}: {matches[:3]}") + + # Look for script tags with structured data + script_matches = re.findall(r']*type=["\']application/ld\+json["\'][^>]*>(.*?)', content, re.DOTALL) + if script_matches: + print(f"\n4. Structured data (JSON-LD) found:") + for i, script in enumerate(script_matches[:2], 1): + try: + data = json.loads(script) + print(f" Script {i}: {json.dumps(data, indent=6)[:500]}...") + except: + print(f" Script {i}: {script[:300]}...") diff --git a/test/test_scraper.py b/test/test_scraper.py new file mode 100644 index 0000000..a3dbeef --- /dev/null +++ b/test/test_scraper.py @@ -0,0 +1,335 @@ +#!/usr/bin/env python3 +""" +Test suite for Troostwijk Scraper +Tests both auction and lot parsing with cached data + +Requires Python 3.10+ +""" + +import sys + +# Require Python 3.10+ +if sys.version_info < (3, 10): + print("ERROR: This script requires Python 3.10 or higher") + print(f"Current version: {sys.version}") + sys.exit(1) + +import asyncio +import json +import sqlite3 +from datetime import datetime +from pathlib import Path + +# Add parent directory to path +sys.path.insert(0, str(Path(__file__).parent)) + +from main import TroostwijkScraper, CacheManager, CACHE_DB + +# Test URLs - these will use cached data to avoid overloading the server +TEST_AUCTIONS = [ + "https://www.troostwijkauctions.com/a/online-auction-cnc-lathes-machining-centres-precision-measurement-romania-A7-39813", + "https://www.troostwijkauctions.com/a/faillissement-bab-shortlease-i-ii-b-v-%E2%80%93-2024-big-ass-energieopslagsystemen-A1-39557", + "https://www.troostwijkauctions.com/a/industriele-goederen-uit-diverse-bedrijfsbeeindigingen-A1-38675", +] + +TEST_LOTS = [ + "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5", + "https://www.troostwijkauctions.com/l/tos-sui-50-1000-universele-draaibank-A7-39568-9", + "https://www.troostwijkauctions.com/l/rolcontainer-%25282x%2529-A1-40191-101", +] + +class TestResult: + def __init__(self, url, success, message, data=None): + self.url = url + self.success = success + self.message = message + self.data = data + +class ScraperTester: + def __init__(self): + self.scraper = TroostwijkScraper() + self.results = [] + + def check_cache_exists(self, url): + """Check if URL is cached""" + cached = self.scraper.cache.get(url, max_age_hours=999999) # Get even old 
cache + return cached is not None + + def test_auction_parsing(self, url): + """Test auction page parsing""" + print(f"\n{'='*70}") + print(f"Testing Auction: {url}") + print('='*70) + + # Check cache + if not self.check_cache_exists(url): + return TestResult( + url, + False, + "❌ NOT IN CACHE - Please run scraper first to cache this URL", + None + ) + + # Get cached content + cached = self.scraper.cache.get(url, max_age_hours=999999) + content = cached['content'] + + print(f"✓ Cache hit (age: {(datetime.now().timestamp() - cached['timestamp']) / 3600:.1f} hours)") + + # Parse + try: + data = self.scraper._parse_page(content, url) + + if not data: + return TestResult(url, False, "❌ Parsing returned None", None) + + if data.get('type') != 'auction': + return TestResult( + url, + False, + f"❌ Expected type='auction', got '{data.get('type')}'", + data + ) + + # Validate required fields + issues = [] + required_fields = { + 'auction_id': str, + 'title': str, + 'location': str, + 'lots_count': int, + 'first_lot_closing_time': str, + } + + for field, expected_type in required_fields.items(): + value = data.get(field) + if value is None or value == '': + issues.append(f" ❌ {field}: MISSING or EMPTY") + elif not isinstance(value, expected_type): + issues.append(f" ❌ {field}: Wrong type (expected {expected_type.__name__}, got {type(value).__name__})") + else: + # Pretty print value + display_value = str(value)[:60] + print(f" ✓ {field}: {display_value}") + + if issues: + return TestResult(url, False, "\n".join(issues), data) + + print(f" ✓ lots_count: {data.get('lots_count')}") + + return TestResult(url, True, "✅ All auction fields validated successfully", data) + + except Exception as e: + return TestResult(url, False, f"❌ Exception during parsing: {e}", None) + + def test_lot_parsing(self, url): + """Test lot page parsing""" + print(f"\n{'='*70}") + print(f"Testing Lot: {url}") + print('='*70) + + # Check cache + if not self.check_cache_exists(url): + return TestResult( + url, + False, + "❌ NOT IN CACHE - Please run scraper first to cache this URL", + None + ) + + # Get cached content + cached = self.scraper.cache.get(url, max_age_hours=999999) + content = cached['content'] + + print(f"✓ Cache hit (age: {(datetime.now().timestamp() - cached['timestamp']) / 3600:.1f} hours)") + + # Parse + try: + data = self.scraper._parse_page(content, url) + + if not data: + return TestResult(url, False, "❌ Parsing returned None", None) + + if data.get('type') != 'lot': + return TestResult( + url, + False, + f"❌ Expected type='lot', got '{data.get('type')}'", + data + ) + + # Validate required fields + issues = [] + required_fields = { + 'lot_id': (str, lambda x: x and len(x) > 0), + 'title': (str, lambda x: x and len(x) > 3 and x not in ['...', 'N/A']), + 'location': (str, lambda x: x and len(x) > 2 and x not in ['Locatie', 'Location']), + 'current_bid': (str, lambda x: x and x not in ['€Huidig ​​bod', 'Huidig bod']), + 'closing_time': (str, lambda x: True), # Can be empty + 'images': (list, lambda x: True), # Can be empty list + } + + for field, (expected_type, validator) in required_fields.items(): + value = data.get(field) + + if value is None: + issues.append(f" ❌ {field}: MISSING (None)") + elif not isinstance(value, expected_type): + issues.append(f" ❌ {field}: Wrong type (expected {expected_type.__name__}, got {type(value).__name__})") + elif not validator(value): + issues.append(f" ❌ {field}: Invalid value: '{value}'") + else: + # Pretty print value + if field == 'images': + print(f" ✓ {field}: 
{len(value)} images") + for i, img in enumerate(value[:3], 1): + print(f" {i}. {img[:60]}...") + else: + display_value = str(value)[:60] + print(f" ✓ {field}: {display_value}") + + # Additional checks + if data.get('bid_count') is not None: + print(f" ✓ bid_count: {data.get('bid_count')}") + + if data.get('viewing_time'): + print(f" ✓ viewing_time: {data.get('viewing_time')}") + + if data.get('pickup_date'): + print(f" ✓ pickup_date: {data.get('pickup_date')}") + + if issues: + return TestResult(url, False, "\n".join(issues), data) + + return TestResult(url, True, "✅ All lot fields validated successfully", data) + + except Exception as e: + import traceback + return TestResult(url, False, f"❌ Exception during parsing: {e}\n{traceback.format_exc()}", None) + + def run_all_tests(self): + """Run all tests""" + print("\n" + "="*70) + print("TROOSTWIJK SCRAPER TEST SUITE") + print("="*70) + print("\nThis test suite uses CACHED data only - no live requests to server") + print("="*70) + + # Test auctions + print("\n" + "="*70) + print("TESTING AUCTIONS") + print("="*70) + + for url in TEST_AUCTIONS: + result = self.test_auction_parsing(url) + self.results.append(result) + + # Test lots + print("\n" + "="*70) + print("TESTING LOTS") + print("="*70) + + for url in TEST_LOTS: + result = self.test_lot_parsing(url) + self.results.append(result) + + # Summary + self.print_summary() + + def print_summary(self): + """Print test summary""" + print("\n" + "="*70) + print("TEST SUMMARY") + print("="*70) + + passed = sum(1 for r in self.results if r.success) + failed = sum(1 for r in self.results if not r.success) + total = len(self.results) + + print(f"\nTotal tests: {total}") + print(f"Passed: {passed} ✓") + print(f"Failed: {failed} ✗") + print(f"Success rate: {passed/total*100:.1f}%") + + if failed > 0: + print("\n" + "="*70) + print("FAILED TESTS:") + print("="*70) + for result in self.results: + if not result.success: + print(f"\n{result.url}") + print(result.message) + if result.data: + print("\nParsed data:") + for key, value in result.data.items(): + if key != 'lots': # Don't print full lots array + print(f" {key}: {str(value)[:80]}") + + print("\n" + "="*70) + + return failed == 0 + +def check_cache_status(): + """Check cache compression status""" + print("\n" + "="*70) + print("CACHE STATUS CHECK") + print("="*70) + + try: + with sqlite3.connect(CACHE_DB) as conn: + # Total entries + cursor = conn.execute("SELECT COUNT(*) FROM cache") + total = cursor.fetchone()[0] + + # Compressed vs uncompressed + cursor = conn.execute("SELECT COUNT(*) FROM cache WHERE compressed = 1") + compressed = cursor.fetchone()[0] + + cursor = conn.execute("SELECT COUNT(*) FROM cache WHERE compressed = 0 OR compressed IS NULL") + uncompressed = cursor.fetchone()[0] + + print(f"Total cache entries: {total}") + print(f"Compressed: {compressed} ({compressed/total*100:.1f}%)") + print(f"Uncompressed: {uncompressed} ({uncompressed/total*100:.1f}%)") + + if uncompressed > 0: + print(f"\n⚠️ Warning: {uncompressed} entries are still uncompressed") + print(" Run: python migrate_compress_cache.py") + else: + print("\n✓ All cache entries are compressed!") + + # Check test URLs + print(f"\n{'='*70}") + print("TEST URL CACHE STATUS:") + print('='*70) + + all_test_urls = TEST_AUCTIONS + TEST_LOTS + cached_count = 0 + + for url in all_test_urls: + cursor = conn.execute("SELECT url FROM cache WHERE url = ?", (url,)) + if cursor.fetchone(): + print(f"✓ {url[:60]}...") + cached_count += 1 + else: + print(f"✗ {url[:60]}... 
(NOT CACHED)") + + print(f"\n{cached_count}/{len(all_test_urls)} test URLs are cached") + + if cached_count < len(all_test_urls): + print("\n⚠️ Some test URLs are not cached. Tests for those URLs will fail.") + print(" Run the main scraper to cache these URLs first.") + + except Exception as e: + print(f"Error checking cache status: {e}") + +if __name__ == "__main__": + # Check cache status first + check_cache_status() + + # Run tests + tester = ScraperTester() + success = tester.run_all_tests() + + # Exit with appropriate code + sys.exit(0 if success else 1) diff --git a/wiki/ARCHITECTURE.md b/wiki/ARCHITECTURE.md new file mode 100644 index 0000000..d77c123 --- /dev/null +++ b/wiki/ARCHITECTURE.md @@ -0,0 +1,326 @@ +# Scaev - Architecture & Data Flow + +## System Overview + +The scraper follows a **3-phase hierarchical crawling pattern** to extract auction and lot data from Troostwijk Auctions website. + +## Architecture Diagram + +```mariadb +┌─────────────────────────────────────────────────────────────────┐ +│ TROOSTWIJK SCRAPER │ +└─────────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────────┐ +│ PHASE 1: COLLECT AUCTION URLs │ +│ ┌──────────────┐ ┌──────────────┐ │ +│ │ Listing Page │────────▶│ Extract /a/ │ │ +│ │ /auctions? │ │ auction URLs │ │ +│ │ page=1..N │ └──────────────┘ │ +│ └──────────────┘ │ │ +│ ▼ │ +│ [ List of Auction URLs ] │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ PHASE 2: EXTRACT LOT URLs FROM AUCTIONS │ +│ ┌──────────────┐ ┌──────────────┐ │ +│ │ Auction Page │────────▶│ Parse │ │ +│ │ /a/... │ │ __NEXT_DATA__│ │ +│ └──────────────┘ │ JSON │ │ +│ │ └──────────────┘ │ +│ │ │ │ +│ ▼ ▼ │ +│ ┌──────────────┐ ┌──────────────┐ │ +│ │ Save Auction │ │ Extract /l/ │ │ +│ │ Metadata │ │ lot URLs │ │ +│ │ to DB │ └──────────────┘ │ +│ └──────────────┘ │ │ +│ ▼ │ +│ [ List of Lot URLs ] │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ PHASE 3: SCRAPE LOT DETAILS │ +│ ┌──────────────┐ ┌──────────────┐ │ +│ │ Lot Page │────────▶│ Parse │ │ +│ │ /l/... │ │ __NEXT_DATA__│ │ +│ └──────────────┘ │ JSON │ │ +│ └──────────────┘ │ +│ │ │ +│ ┌─────────────────────────┴─────────────────┐ │ +│ ▼ ▼ │ +│ ┌──────────────┐ ┌──────────────┐ │ +│ │ Save Lot │ │ Save Images │ │ +│ │ Details │ │ URLs to DB │ │ +│ │ to DB │ └──────────────┘ │ +│ └──────────────┘ │ │ +│ ▼ │ +│ [Optional Download] │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Database Schema + +```mariadb +┌──────────────────────────────────────────────────────────────────┐ +│ CACHE TABLE (HTML Storage with Compression) │ +├──────────────────────────────────────────────────────────────────┤ +│ cache │ +│ ├── url (TEXT, PRIMARY KEY) │ +│ ├── content (BLOB) -- Compressed HTML (zlib) │ +│ ├── timestamp (REAL) │ +│ ├── status_code (INTEGER) │ +│ └── compressed (INTEGER) -- 1=compressed, 0=plain │ +└──────────────────────────────────────────────────────────────────┘ + +┌──────────────────────────────────────────────────────────────────┐ +│ AUCTIONS TABLE │ +├──────────────────────────────────────────────────────────────────┤ +│ auctions │ +│ ├── auction_id (TEXT, PRIMARY KEY) -- e.g. "A7-39813" │ +│ ├── url (TEXT, UNIQUE) │ +│ ├── title (TEXT) │ +│ ├── location (TEXT) -- e.g. 
"Cluj-Napoca, RO" │ +│ ├── lots_count (INTEGER) │ +│ ├── first_lot_closing_time (TEXT) │ +│ └── scraped_at (TEXT) │ +└──────────────────────────────────────────────────────────────────┘ + +┌──────────────────────────────────────────────────────────────────┐ +│ LOTS TABLE │ +├──────────────────────────────────────────────────────────────────┤ +│ lots │ +│ ├── lot_id (TEXT, PRIMARY KEY) -- e.g. "A1-28505-5" │ +│ ├── auction_id (TEXT) -- FK to auctions │ +│ ├── url (TEXT, UNIQUE) │ +│ ├── title (TEXT) │ +│ ├── current_bid (TEXT) -- "€123.45" or "No bids" │ +│ ├── bid_count (INTEGER) │ +│ ├── closing_time (TEXT) │ +│ ├── viewing_time (TEXT) │ +│ ├── pickup_date (TEXT) │ +│ ├── location (TEXT) -- e.g. "Dongen, NL" │ +│ ├── description (TEXT) │ +│ ├── category (TEXT) │ +│ └── scraped_at (TEXT) │ +│ FOREIGN KEY (auction_id) → auctions(auction_id) │ +└──────────────────────────────────────────────────────────────────┘ + +┌──────────────────────────────────────────────────────────────────┐ +│ IMAGES TABLE (Image URLs & Download Status) │ +├──────────────────────────────────────────────────────────────────┤ +│ images ◀── THIS TABLE HOLDS IMAGE LINKS│ +│ ├── id (INTEGER, PRIMARY KEY AUTOINCREMENT) │ +│ ├── lot_id (TEXT) -- FK to lots │ +│ ├── url (TEXT) -- Image URL │ +│ ├── local_path (TEXT) -- Path after download │ +│ └── downloaded (INTEGER) -- 0=pending, 1=downloaded │ +│ FOREIGN KEY (lot_id) → lots(lot_id) │ +└──────────────────────────────────────────────────────────────────┘ +``` + +## Sequence Diagram + +``` +User Scraper Playwright Cache DB Data Tables + │ │ │ │ │ + │ Run │ │ │ │ + ├──────────────▶│ │ │ │ + │ │ │ │ │ + │ │ Phase 1: Listing Pages │ │ + │ ├───────────────▶│ │ │ + │ │ goto() │ │ │ + │ │◀───────────────┤ │ │ + │ │ HTML │ │ │ + │ ├───────────────────────────────▶│ │ + │ │ compress & cache │ │ + │ │ │ │ │ + │ │ Phase 2: Auction Pages │ │ + │ ├───────────────▶│ │ │ + │ │◀───────────────┤ │ │ + │ │ HTML │ │ │ + │ │ │ │ │ + │ │ Parse __NEXT_DATA__ JSON │ │ + │ │────────────────────────────────────────────────▶│ + │ │ │ │ INSERT auctions + │ │ │ │ │ + │ │ Phase 3: Lot Pages │ │ + │ ├───────────────▶│ │ │ + │ │◀───────────────┤ │ │ + │ │ HTML │ │ │ + │ │ │ │ │ + │ │ Parse __NEXT_DATA__ JSON │ │ + │ │────────────────────────────────────────────────▶│ + │ │ │ │ INSERT lots │ + │ │────────────────────────────────────────────────▶│ + │ │ │ │ INSERT images│ + │ │ │ │ │ + │ │ Export to CSV/JSON │ │ + │ │◀────────────────────────────────────────────────┤ + │ │ Query all data │ │ + │◀──────────────┤ │ │ │ + │ Results │ │ │ │ +``` + +## Data Flow Details + +### 1. **Page Retrieval & Caching** +``` +Request URL + │ + ├──▶ Check cache DB (with timestamp validation) + │ │ + │ ├─[HIT]──▶ Decompress (if compressed=1) + │ │ └──▶ Return HTML + │ │ + │ └─[MISS]─▶ Fetch via Playwright + │ │ + │ ├──▶ Compress HTML (zlib level 9) + │ │ ~70-90% size reduction + │ │ + │ └──▶ Store in cache DB (compressed=1) + │ + └──▶ Return HTML for parsing +``` + +### 2. **JSON Parsing Strategy** +``` +HTML Content + │ + └──▶ Extract