commit e69563d4d6e827e504c17423caa95cb2415282c3 Author: Tour Date: Tue Dec 9 08:04:16 2025 +0100 Initial diff --git a/.aiignore b/.aiignore new file mode 100644 index 0000000..71ddf39 --- /dev/null +++ b/.aiignore @@ -0,0 +1,12 @@ +# An .aiignore file follows the same syntax as a .gitignore file. +# .gitignore documentation: https://git-scm.com/docs/gitignore + +# you can ignore files +.DS_Store +*.log +*.tmp + +# or folders +dist/ +build/ +out/ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..119e892 --- /dev/null +++ b/.gitignore @@ -0,0 +1,144 @@ +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +.idea/ + +# Project specific - Scaev +output/ +*.db +*.csv +*.json +!requirements.txt + +# Playwright +.playwright/ + +# macOS +.DS_Store diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..c8cfe39 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.10 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..3ec2661 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,50 @@ +# Use Python 3.10+ base image +FROM python:3.11-slim + +# Install system dependencies required for Playwright +RUN apt-get update && apt-get install -y \ + wget \ + gnupg \ + ca-certificates \ + fonts-liberation \ + libnss3 \ + libnspr4 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libcups2 \ + libdrm2 \ + libxkbcommon0 \ + libxcomposite1 \ + libxdamage1 \ + libxfixes3 \ + libxrandr2 \ + libgbm1 \ + libasound2 \ + libpango-1.0-0 \ + libcairo2 \ + && rm -rf /var/lib/apt/lists/* + +# Set working directory +WORKDIR /app + +# Copy requirements first for better caching +COPY requirements.txt . + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Install Playwright browsers +RUN playwright install chromium +RUN playwright install-deps chromium + +# Copy the rest of the application +COPY . . 
+ +# Create output directory +RUN mkdir -p output + +# Set Python path to include both project root and src directory +ENV PYTHONPATH=/app:/app/src + +# Run the scraper +CMD ["python", "src/main.py"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..016d033 --- /dev/null +++ b/README.md @@ -0,0 +1,85 @@ +# Setup & IDE Configuration + +## Python Version Requirement + +This project **requires Python 3.10 or higher**. + +The code uses Python 3.10+ features including: +- Structural pattern matching +- Union type syntax (`X | Y`) +- Improved type hints +- Modern async/await patterns + +## IDE Configuration + +### PyCharm / IntelliJ IDEA + +If your IDE shows "Python 2.7 syntax" warnings, configure it for Python 3.10+: + +1. **File → Project Structure → Project Settings → Project** + - Set Python SDK to 3.10 or higher + +2. **File → Settings → Project → Python Interpreter** + - Select Python 3.10+ interpreter + - Click gear icon → Add → System Interpreter → Browse to your Python 3.10 installation + +3. **File → Settings → Editor → Inspections → Python** + - Ensure "Python version" is set to 3.10+ + - Check "Code compatibility inspection" → Set minimum version to 3.10 + +### VS Code + +Add to `.vscode/settings.json`: +```json +{ + "python.pythonPath": "path/to/python3.10", + "python.analysis.typeCheckingMode": "basic", + "python.languageServer": "Pylance" +} +``` + +## Installation + +```bash +# Check Python version +python --version # Should be 3.10+ + +# Install dependencies +pip install -r requirements.txt + +# Install Playwright browsers +playwright install chromium +``` + +## Verifying Setup + +```bash +# Should print version 3.10.x or higher +python -c "import sys; print(sys.version)" + +# Should run without errors +python main.py --help +``` + +## Common Issues + +### "ModuleNotFoundError: No module named 'playwright'" +```bash +pip install playwright +playwright install chromium +``` + +### "Python 2.7 does not support..." 
warnings in IDE +- Your IDE is configured for Python 2.7 +- Follow IDE configuration steps above +- The code WILL work with Python 3.10+ despite warnings + +### Script exits with "requires Python 3.10 or higher" +- You're running Python 3.9 or older +- Upgrade to Python 3.10+: https://www.python.org/downloads/ + +## Version Files + +- `.python-version` - Used by pyenv and similar tools +- `requirements.txt` - Package dependencies +- Runtime checks in scripts ensure Python 3.10+ diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..6f9734f --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,42 @@ +services: + scaev: + build: + context: /opt/apps/scaev + dockerfile: Dockerfile + container_name: scaev + restart: unless-stopped + networks: + scaev_mobile_net: + ipv4_address: 172.30.0.10 + traefik_net: + environment: + RATE_LIMIT_SECONDS: "0.5" + MAX_PAGES: "500" + DOWNLOAD_IMAGES: "True" + volumes: + - shared-auction-data:/mnt/okcomputer/output + labels: + - "traefik.enable=true" + - "traefik.http.routers.scaev.rule=Host(`scaev.appmodel.nl`)" + - "traefik.http.routers.scaev.entrypoints=websecure" + - "traefik.http.routers.scaev.tls=true" + - "traefik.http.routers.scaev.tls.certresolver=letsencrypt" + - "traefik.http.services.scaev.loadbalancer.server.port=8000" + + +networks: + scaev_mobile_net: + driver: bridge + driver_opts: + com.docker.network.bridge.name: br-scaev-mobile + ipam: + config: + - subnet: 172.30.0.0/24 + gateway: 172.30.0.1 + traefik_net: + external: true + name: traefik_net + +volumes: + shared-auction-data: + external: true \ No newline at end of file diff --git a/docs/API_INTELLIGENCE_FINDINGS.md b/docs/API_INTELLIGENCE_FINDINGS.md new file mode 100644 index 0000000..012f285 --- /dev/null +++ b/docs/API_INTELLIGENCE_FINDINGS.md @@ -0,0 +1,240 @@ +# API Intelligence Findings + +## GraphQL API - Available Fields for Intelligence + +### Key Discovery: Additional Fields Available + +From GraphQL schema introspection on `Lot` type: + +#### **Already Captured ✓** +- `currentBidAmount` (Money) - Current bid +- `initialAmount` (Money) - Starting bid +- `nextMinimalBid` (Money) - Minimum bid +- `bidsCount` (Int) - Bid count +- `startDate` / `endDate` (TbaDate) - Timing +- `minimumBidAmountMet` (MinimumBidAmountMet) - Status +- `attributes` - Brand/model extraction +- `title`, `description`, `images` + +#### **NEW - Available but NOT Captured:** + +1. **followersCount** (Int) - **CRITICAL for intelligence!** + - This is the "watch count" we thought was missing + - Indicates bidder interest level + - **ACTION: Add to schema and extraction** + +2. **biddingStatus** (BiddingStatus) - Lot bidding state + - More detailed than minimumBidAmountMet + - **ACTION: Investigate enum values** + +3. **estimatedFullPrice** (EstimatedFullPrice) - **Found it!** + - Available via `LotDetails.estimatedFullPrice` + - May contain estimated min/max values + - **ACTION: Test extraction** + +4. **nextBidStepInCents** (Long) - Exact bid increment + - More precise than our calculated bid_increment + - **ACTION: Replace calculated field** + +5. **condition** (String) - Direct condition field + - Cleaner than attribute extraction + - **ACTION: Use as primary source** + +6. **categoryInformation** (LotCategoryInformation) - Category data + - Structured category info + - **ACTION: Extract category path** + +7. **location** (LotLocation) - Lot location details + - City, country, possibly address + - **ACTION: Add to schema** + +8. 
**remarks** (String) - Additional notes + - May contain pickup/viewing text + - **ACTION: Check for viewing/pickup extraction** + +9. **appearance** (String) - Condition appearance + - Visual condition notes + - **ACTION: Combine with condition_description** + +10. **packaging** (String) - Packaging details + - Relevant for shipping intelligence + +11. **quantity** (Long) - Lot quantity + - Important for bulk lots + +12. **vat** (BigDecimal) - VAT percentage + - For total cost calculations + +13. **buyerPremiumPercentage** (BigDecimal) - Buyer premium + - For total cost calculations + +14. **videos** - Video URLs (if available) + - **ACTION: Add video support** + +15. **documents** - Document URLs (if available) + - May contain specs/manuals + +## Bid History API - Fields + +### Currently Captured ✓ +- `buyerId` (UUID) - Anonymized bidder +- `buyerNumber` (Int) - Bidder number +- `currentBid.cents` / `currency` - Bid amount +- `autoBid` (Boolean) - Autobid flag +- `createdAt` (Timestamp) - Bid time + +### Additional Available: +- `negotiated` (Boolean) - Was bid negotiated + - **ACTION: Add to bid_history table** + +## Auction API - Not Available +- Attempted `auctionDetails` query - **does not exist** +- Auction data must be scraped from listing pages + +## Priority Actions for Intelligence + +### HIGH PRIORITY (Immediate): +1. ✅ Add `followersCount` field (watch count) +2. ✅ Add `estimatedFullPrice` extraction +3. ✅ Use `nextBidStepInCents` instead of calculated increment +4. ✅ Add `condition` as primary condition source +5. ✅ Add `categoryInformation` extraction +6. ✅ Add `location` details +7. ✅ Add `negotiated` to bid_history table + +### MEDIUM PRIORITY: +8. Extract `remarks` for viewing/pickup text +9. Add `appearance` and `packaging` fields +10. Add `quantity` field +11. Add `vat` and `buyerPremiumPercentage` for cost calculations +12. Add `biddingStatus` enum extraction + +### LOW PRIORITY: +13. Add video URL support +14. 
Add document URL support + +## Updated Schema Requirements + +### lots table - NEW columns: +```sql +ALTER TABLE lots ADD COLUMN followers_count INTEGER DEFAULT 0; +ALTER TABLE lots ADD COLUMN estimated_min_price REAL; +ALTER TABLE lots ADD COLUMN estimated_max_price REAL; +ALTER TABLE lots ADD COLUMN location_city TEXT; +ALTER TABLE lots ADD COLUMN location_country TEXT; +ALTER TABLE lots ADD COLUMN lot_condition TEXT; -- Direct from API +ALTER TABLE lots ADD COLUMN appearance TEXT; +ALTER TABLE lots ADD COLUMN packaging TEXT; +ALTER TABLE lots ADD COLUMN quantity INTEGER DEFAULT 1; +ALTER TABLE lots ADD COLUMN vat_percentage REAL; +ALTER TABLE lots ADD COLUMN buyer_premium_percentage REAL; +ALTER TABLE lots ADD COLUMN remarks TEXT; +ALTER TABLE lots ADD COLUMN bidding_status TEXT; +ALTER TABLE lots ADD COLUMN videos_json TEXT; -- Store as JSON array +ALTER TABLE lots ADD COLUMN documents_json TEXT; -- Store as JSON array +``` + +### bid_history table - NEW column: +```sql +ALTER TABLE bid_history ADD COLUMN negotiated INTEGER DEFAULT 0; +``` + +## Intelligence Use Cases + +### With followers_count: +- Predict lot popularity and final price +- Identify hot items early +- Calculate interest-to-bid conversion rate + +### With estimated prices: +- Compare final price to estimate +- Identify bargains (final < estimate) +- Calculate auction house accuracy + +### With nextBidStepInCents: +- Show exact next bid amount +- Calculate optimal bidding strategy + +### With location: +- Filter by proximity +- Calculate pickup logistics + +### With vat/buyer_premium: +- Calculate true total cost +- Compare all-in prices + +### With condition/appearance: +- Better condition scoring +- Identify restoration projects + +## Updated GraphQL Query + +```graphql +query EnhancedLotQuery($lotDisplayId: String!, $locale: String!, $platform: Platform!) { + lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) { + estimatedFullPrice { + min { cents currency } + max { cents currency } + } + lot { + id + displayId + title + description { text } + currentBidAmount { cents currency } + initialAmount { cents currency } + nextMinimalBid { cents currency } + nextBidStepInCents + bidsCount + followersCount + startDate + endDate + minimumBidAmountMet + biddingStatus + condition + appearance + packaging + quantity + vat + buyerPremiumPercentage + remarks + auctionId + location { + city + countryCode + addressLine1 + addressLine2 + } + categoryInformation { + id + name + path + } + images { + url + thumbnailUrl + } + videos { + url + thumbnailUrl + } + documents { + url + name + } + attributes { + name + value + } + } + } +} +``` + +## Summary + +**NEW fields found:** 15+ additional intelligence fields available +**Most critical:** `followersCount` (watch count), `estimatedFullPrice`, `nextBidStepInCents` +**Data quality impact:** Estimated 80%+ increase in intelligence value + +These fields will significantly enhance prediction and analysis capabilities. diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..de0fdb0 --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,531 @@ +# Scaev - Architecture & Data Flow + +## System Overview + +The scraper follows a **3-phase hierarchical crawling pattern** to extract auction and lot data from Troostwijk Auctions website. 
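+
+A minimal sketch of those three phases is shown below, using the `TroostwijkScraper` methods defined in `src/scraper.py` (`crawl_listing_page`, `crawl_auction_for_lots`, `crawl_page`). It assumes `src/` is on `PYTHONPATH` (as the Dockerfile configures) and omits the prioritization and offline handling that the real entry point, `crawl_auctions()`, layers on top of this loop.
+
+```python
+import asyncio
+from playwright.async_api import async_playwright
+
+from scraper import TroostwijkScraper  # importable because PYTHONPATH includes /app/src
+
+
+async def three_phase_crawl(max_pages: int = 2) -> list[dict]:
+    scraper = TroostwijkScraper()
+    results: list[dict] = []
+
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        page = await browser.new_page()
+
+        # Phase 1: collect auction URLs from the paginated listing
+        auction_urls: list[str] = []
+        for page_num in range(1, max_pages + 1):
+            auction_urls += await scraper.crawl_listing_page(page, page_num)
+
+        # Phase 2: save auction metadata and extract lot URLs from each auction page
+        lot_urls: list[str] = []
+        for auction_url in set(auction_urls):
+            lot_urls += await scraper.crawl_auction_for_lots(page, auction_url)
+
+        # Phase 3: scrape each lot page and enrich it via the GraphQL / bid-history APIs
+        for lot_url in set(lot_urls):
+            lot_data = await scraper.crawl_page(page, lot_url)
+            if lot_data:
+                results.append(lot_data)
+
+        await browser.close()
+
+    return results
+
+
+if __name__ == "__main__":
+    asyncio.run(three_phase_crawl())
+```
+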
+ +## Architecture Diagram + +```mariadb +┌─────────────────────────────────────────────────────────────────┐ +│ TROOSTWIJK SCRAPER │ +└─────────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────────┐ +│ PHASE 1: COLLECT AUCTION URLs │ +│ ┌──────────────┐ ┌──────────────┐ │ +│ │ Listing Page │────────▶│ Extract /a/ │ │ +│ │ /auctions? │ │ auction URLs │ │ +│ │ page=1..N │ └──────────────┘ │ +│ └──────────────┘ │ │ +│ ▼ │ +│ [ List of Auction URLs ] │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ PHASE 2: EXTRACT LOT URLs FROM AUCTIONS │ +│ ┌──────────────┐ ┌──────────────┐ │ +│ │ Auction Page │────────▶│ Parse │ │ +│ │ /a/... │ │ __NEXT_DATA__│ │ +│ └──────────────┘ │ JSON │ │ +│ │ └──────────────┘ │ +│ │ │ │ +│ ▼ ▼ │ +│ ┌──────────────┐ ┌──────────────┐ │ +│ │ Save Auction │ │ Extract /l/ │ │ +│ │ Metadata │ │ lot URLs │ │ +│ │ to DB │ └──────────────┘ │ +│ └──────────────┘ │ │ +│ ▼ │ +│ [ List of Lot URLs ] │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ PHASE 3: SCRAPE LOT DETAILS + API ENRICHMENT │ +│ ┌──────────────┐ ┌──────────────┐ │ +│ │ Lot Page │────────▶│ Parse │ │ +│ │ /l/... │ │ __NEXT_DATA__│ │ +│ └──────────────┘ │ JSON │ │ +│ └──────────────┘ │ +│ │ │ +│ ┌─────────────────────────┼─────────────────┐ │ +│ ▼ ▼ ▼ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ GraphQL API │ │ Bid History │ │ Save Images │ │ +│ │ (Bidding + │ │ REST API │ │ URLs to DB │ │ +│ │ Enrichment) │ │ (per lot) │ └──────────────┘ │ +│ └──────────────┘ └──────────────┘ │ │ +│ │ │ ▼ │ +│ └──────────┬────────────┘ [Optional Download │ +│ ▼ Concurrent per Lot] │ +│ ┌──────────────┐ │ +│ │ Save to DB: │ │ +│ │ - Lot data │ │ +│ │ - Bid data │ │ +│ │ - Enrichment │ │ +│ └──────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Database Schema + +```mariadb +┌──────────────────────────────────────────────────────────────────┐ +│ CACHE TABLE (HTML Storage with Compression) │ +├──────────────────────────────────────────────────────────────────┤ +│ cache │ +│ ├── url (TEXT, PRIMARY KEY) │ +│ ├── content (BLOB) -- Compressed HTML (zlib) │ +│ ├── timestamp (REAL) │ +│ ├── status_code (INTEGER) │ +│ └── compressed (INTEGER) -- 1=compressed, 0=plain │ +└──────────────────────────────────────────────────────────────────┘ + +┌──────────────────────────────────────────────────────────────────┐ +│ AUCTIONS TABLE │ +├──────────────────────────────────────────────────────────────────┤ +│ auctions │ +│ ├── auction_id (TEXT, PRIMARY KEY) -- e.g. "A7-39813" │ +│ ├── url (TEXT, UNIQUE) │ +│ ├── title (TEXT) │ +│ ├── location (TEXT) -- e.g. "Cluj-Napoca, RO" │ +│ ├── lots_count (INTEGER) │ +│ ├── first_lot_closing_time (TEXT) │ +│ └── scraped_at (TEXT) │ +└──────────────────────────────────────────────────────────────────┘ + +┌──────────────────────────────────────────────────────────────────┐ +│ LOTS TABLE (Core + Enriched Intelligence) │ +├──────────────────────────────────────────────────────────────────┤ +│ lots │ +│ ├── lot_id (TEXT, PRIMARY KEY) -- e.g. 
"A1-28505-5" │ +│ ├── auction_id (TEXT) -- FK to auctions │ +│ ├── url (TEXT, UNIQUE) │ +│ ├── title (TEXT) │ +│ │ │ +│ ├─ BIDDING DATA (GraphQL API) ──────────────────────────────────┤ +│ ├── current_bid (TEXT) -- Current bid amount │ +│ ├── starting_bid (TEXT) -- Initial/opening bid │ +│ ├── minimum_bid (TEXT) -- Next minimum bid │ +│ ├── bid_count (INTEGER) -- Number of bids │ +│ ├── bid_increment (REAL) -- Bid step size │ +│ ├── closing_time (TEXT) -- Lot end date │ +│ ├── status (TEXT) -- Minimum bid status │ +│ │ │ +│ ├─ BID INTELLIGENCE (Calculated from bid_history) ──────────────┤ +│ ├── first_bid_time (TEXT) -- First bid timestamp │ +│ ├── last_bid_time (TEXT) -- Latest bid timestamp │ +│ ├── bid_velocity (REAL) -- Bids per hour │ +│ │ │ +│ ├─ VALUATION & ATTRIBUTES (from __NEXT_DATA__) ─────────────────┤ +│ ├── brand (TEXT) -- Brand from attributes │ +│ ├── model (TEXT) -- Model from attributes │ +│ ├── manufacturer (TEXT) -- Manufacturer name │ +│ ├── year_manufactured (INTEGER) -- Year extracted │ +│ ├── condition_score (REAL) -- 0-10 condition rating │ +│ ├── condition_description (TEXT) -- Condition text │ +│ ├── serial_number (TEXT) -- Serial/VIN number │ +│ ├── damage_description (TEXT) -- Damage notes │ +│ ├── attributes_json (TEXT) -- Full attributes JSON │ +│ │ │ +│ ├─ LEGACY/OTHER ─────────────────────────────────────────────────┤ +│ ├── viewing_time (TEXT) -- Viewing schedule │ +│ ├── pickup_date (TEXT) -- Pickup schedule │ +│ ├── location (TEXT) -- e.g. "Dongen, NL" │ +│ ├── description (TEXT) -- Lot description │ +│ ├── category (TEXT) -- Lot category │ +│ ├── sale_id (INTEGER) -- Legacy field │ +│ ├── type (TEXT) -- Legacy field │ +│ ├── year (INTEGER) -- Legacy field │ +│ ├── currency (TEXT) -- Currency code │ +│ ├── closing_notified (INTEGER) -- Notification flag │ +│ └── scraped_at (TEXT) -- Scrape timestamp │ +│ FOREIGN KEY (auction_id) → auctions(auction_id) │ +└──────────────────────────────────────────────────────────────────┘ + +┌──────────────────────────────────────────────────────────────────┐ +│ IMAGES TABLE (Image URLs & Download Status) │ +├──────────────────────────────────────────────────────────────────┤ +│ images ◀── THIS TABLE HOLDS IMAGE LINKS│ +│ ├── id (INTEGER, PRIMARY KEY AUTOINCREMENT) │ +│ ├── lot_id (TEXT) -- FK to lots │ +│ ├── url (TEXT) -- Image URL │ +│ ├── local_path (TEXT) -- Path after download │ +│ └── downloaded (INTEGER) -- 0=pending, 1=downloaded │ +│ FOREIGN KEY (lot_id) → lots(lot_id) │ +│ UNIQUE INDEX idx_unique_lot_url ON (lot_id, url) │ +└──────────────────────────────────────────────────────────────────┘ + +┌──────────────────────────────────────────────────────────────────┐ +│ BID_HISTORY TABLE (Complete Bid Tracking for Intelligence) │ +├──────────────────────────────────────────────────────────────────┤ +│ bid_history ◀── REST API: /bidding-history │ +│ ├── id (INTEGER, PRIMARY KEY AUTOINCREMENT) │ +│ ├── lot_id (TEXT) -- FK to lots │ +│ ├── bid_amount (REAL) -- Bid in EUR │ +│ ├── bid_time (TEXT) -- ISO 8601 timestamp │ +│ ├── is_autobid (INTEGER) -- 0=manual, 1=autobid │ +│ ├── bidder_id (TEXT) -- Anonymized bidder UUID │ +│ ├── bidder_number (INTEGER) -- Bidder display number │ +│ └── created_at (TEXT) -- Record creation timestamp │ +│ FOREIGN KEY (lot_id) → lots(lot_id) │ +│ INDEX idx_bid_history_lot ON (lot_id) │ +│ INDEX idx_bid_history_time ON (bid_time) │ +└──────────────────────────────────────────────────────────────────┘ +``` + +## Sequence Diagram + +``` +User Scraper Playwright Cache DB Data Tables + │ │ │ 
│ │ + │ Run │ │ │ │ + ├──────────────▶│ │ │ │ + │ │ │ │ │ + │ │ Phase 1: Listing Pages │ │ + │ ├───────────────▶│ │ │ + │ │ goto() │ │ │ + │ │◀───────────────┤ │ │ + │ │ HTML │ │ │ + │ ├───────────────────────────────▶│ │ + │ │ compress & cache │ │ + │ │ │ │ │ + │ │ Phase 2: Auction Pages │ │ + │ ├───────────────▶│ │ │ + │ │◀───────────────┤ │ │ + │ │ HTML │ │ │ + │ │ │ │ │ + │ │ Parse __NEXT_DATA__ JSON │ │ + │ │────────────────────────────────────────────────▶│ + │ │ │ │ INSERT auctions + │ │ │ │ │ + │ │ Phase 3: Lot Pages │ │ + │ ├───────────────▶│ │ │ + │ │◀───────────────┤ │ │ + │ │ HTML │ │ │ + │ │ │ │ │ + │ │ Parse __NEXT_DATA__ JSON │ │ + │ │────────────────────────────────────────────────▶│ + │ │ │ │ INSERT lots │ + │ │────────────────────────────────────────────────▶│ + │ │ │ │ INSERT images│ + │ │ │ │ │ + │ │ Export to CSV/JSON │ │ + │ │◀────────────────────────────────────────────────┤ + │ │ Query all data │ │ + │◀──────────────┤ │ │ │ + │ Results │ │ │ │ +``` + +## Data Flow Details + +### 1. **Page Retrieval & Caching** +``` +Request URL + │ + ├──▶ Check cache DB (with timestamp validation) + │ │ + │ ├─[HIT]──▶ Decompress (if compressed=1) + │ │ └──▶ Return HTML + │ │ + │ └─[MISS]─▶ Fetch via Playwright + │ │ + │ ├──▶ Compress HTML (zlib level 9) + │ │ ~70-90% size reduction + │ │ + │ └──▶ Store in cache DB (compressed=1) + │ + └──▶ Return HTML for parsing +``` + +### 2. **JSON Parsing Strategy** +``` +HTML Content + │ + └──▶ Extract ', content, re.DOTALL) + if not match: + return None + + data = json.loads(match.group(1)) + page_props = data.get('props', {}).get('pageProps', {}) + + if 'lot' in page_props: + # Pass both lot and auction data (auction is included in lot pages) + return self._parse_lot_json(page_props.get('lot', {}), url, page_props.get('auction')) + if 'auction' in page_props: + return self._parse_auction_json(page_props.get('auction', {}), url) + return None + + except Exception as e: + print(f" → Error parsing __NEXT_DATA__: {e}") + return None + + def _parse_lot_json(self, lot_data: Dict, url: str, auction_data: Optional[Dict] = None) -> Dict: + """Parse lot data from JSON + + Args: + lot_data: Lot object from __NEXT_DATA__ + url: Page URL + auction_data: Optional auction object (included in lot pages) + """ + location_data = lot_data.get('location', {}) + city = location_data.get('city', '') + country = location_data.get('countryCode', '').upper() + location = f"{city}, {country}" if city and country else (city or country) + + current_bid = lot_data.get('currentBid') or lot_data.get('highestBid') or lot_data.get('startingBid') + if current_bid is None or current_bid == 0: + bidding = lot_data.get('bidding', {}) + current_bid = bidding.get('currentBid') or bidding.get('amount') + + current_bid_str = self.format_currency(current_bid) if current_bid and current_bid > 0 else "No bids" + + bid_count = lot_data.get('bidCount', 0) + if bid_count == 0: + bid_count = lot_data.get('bidding', {}).get('bidCount', 0) + + description = lot_data.get('description', {}) + if isinstance(description, dict): + description = description.get('description', '') + else: + description = str(description) + + category = lot_data.get('category', {}) + category_name = category.get('name', '') if isinstance(category, dict) else '' + + # Get auction displayId from auction data if available (lot pages include auction) + # Otherwise fall back to the UUID auctionId + auction_id = lot_data.get('auctionId', '') + if auction_data and auction_data.get('displayId'): + auction_id = 
auction_data.get('displayId') + + return { + 'type': 'lot', + 'lot_id': lot_data.get('displayId', ''), + 'auction_id': auction_id, + 'url': url, + 'title': lot_data.get('title', ''), + 'current_bid': current_bid_str, + 'bid_count': bid_count, + 'closing_time': self.format_timestamp(lot_data.get('endDate', '')), + 'viewing_time': self._extract_viewing_time(lot_data), + 'pickup_date': self._extract_pickup_date(lot_data), + 'location': location, + 'description': description, + 'category': category_name, + 'images': self._extract_images_from_json(lot_data), + 'scraped_at': datetime.now().isoformat() + } + + def _parse_auction_json(self, auction_data: Dict, url: str) -> Dict: + """Parse auction data from JSON""" + is_auction = 'lots' in auction_data and isinstance(auction_data['lots'], list) + is_lot = 'lotNumber' in auction_data or 'currentBid' in auction_data + + if is_auction: + lots = auction_data.get('lots', []) + first_lot_closing = None + if lots: + first_lot_closing = self.format_timestamp(lots[0].get('endDate', '')) + + return { + 'type': 'auction', + 'auction_id': auction_data.get('displayId', ''), + 'url': url, + 'title': auction_data.get('name', ''), + 'location': self._extract_location_from_json(auction_data), + 'lots_count': len(lots), + 'first_lot_closing_time': first_lot_closing or self.format_timestamp(auction_data.get('minEndDate', '')), + 'scraped_at': datetime.now().isoformat(), + 'lots': lots + } + elif is_lot: + return self._parse_lot_json(auction_data, url) + return None + + def _extract_viewing_time(self, auction_data: Dict) -> str: + """Extract viewing time from auction data""" + viewing_days = auction_data.get('viewingDays', []) + if viewing_days: + first = viewing_days[0] + start = self.format_timestamp(first.get('startDate', '')) + end = self.format_timestamp(first.get('endDate', '')) + if start and end: + return f"{start} - {end}" + return start or end + return '' + + def _extract_pickup_date(self, auction_data: Dict) -> str: + """Extract pickup date from auction data""" + collection_days = auction_data.get('collectionDays', []) + if collection_days: + first = collection_days[0] + start = self.format_timestamp(first.get('startDate', '')) + end = self.format_timestamp(first.get('endDate', '')) + if start and end: + return f"{start} - {end}" + return start or end + return '' + + def _extract_images_from_json(self, auction_data: Dict) -> List[str]: + """Extract all image URLs from auction data""" + images = [] + if auction_data.get('image', {}).get('url'): + images.append(auction_data['image']['url']) + if isinstance(auction_data.get('images'), list): + for img in auction_data['images']: + if isinstance(img, dict) and img.get('url'): + images.append(img['url']) + elif isinstance(img, str): + images.append(img) + return images + + def _extract_location_from_json(self, auction_data: Dict) -> str: + """Extract location from auction JSON data""" + for days in [auction_data.get('viewingDays', []), auction_data.get('collectionDays', [])]: + if days: + first_location = days[0] + city = first_location.get('city', '') + country = first_location.get('countryCode', '').upper() + if city: + return f"{city}, {country}" if country else city + return '' + + def _extract_meta_content(self, content: str, property_name: str) -> str: + """Extract content from meta tags""" + pattern = rf']*property=["\']{property_name}["\'][^>]*content=["\']([^"\']+)["\']' + match = re.search(pattern, content, re.IGNORECASE) + return self.clean_text(match.group(1)) if match else "" + + def 
_extract_current_bid(self, content: str) -> str: + """Extract current bid amount""" + patterns = [ + r'"currentBid"\s*:\s*"([^"]+)"', + r'"currentBid"\s*:\s*(\d+(?:\.\d+)?)', + r'(?:Current bid|Huidig bod)[:\s]*\s*(€[\d,.\s]+)', + r'(?:Current bid|Huidig bod)[:\s]+(€[\d,.\s]+)', + ] + + # Invalid bid texts that should be treated as "no bids" + invalid_bid_texts = [ + 'huidig bod', + 'current bid', + '€huidig bod', + '€huidig ​​bod', # With zero-width spaces + 'huidig ​​bod', + ] + + for pattern in patterns: + match = re.search(pattern, content, re.IGNORECASE) + if match: + bid = match.group(1).strip() + # Remove zero-width spaces and other unicode whitespace + bid = re.sub(r'[\u200b\u200c\u200d\u00a0]+', ' ', bid).strip() + + # Check if it's a valid bid + if bid: + # Reject invalid bid texts + bid_lower = bid.lower().replace(' ', '').replace('€', '') + if bid_lower not in [t.lower().replace(' ', '').replace('€', '') for t in invalid_bid_texts]: + if not bid.startswith('€'): + bid = f"€{bid}" + return bid + + return "No bids" + + def _extract_bid_count(self, content: str) -> int: + """Extract number of bids""" + match = re.search(r'(\d+)\s*bids?', content, re.IGNORECASE) + if match: + try: + return int(match.group(1)) + except: + pass + return 0 + + def _extract_end_date(self, content: str) -> str: + """Extract auction end date""" + patterns = [ + r'Ends?[:\s]+([A-Za-z0-9,:\s]+)', + r'endTime["\']:\s*["\']([^"\']+)["\']', + ] + for pattern in patterns: + match = re.search(pattern, content, re.IGNORECASE) + if match: + return match.group(1).strip() + return "" + + def _extract_location(self, content: str) -> str: + """Extract location""" + patterns = [ + r'(?:Location|Locatie)[:\s]*\s*([A-Za-zÀ-ÿ0-9\s,.-]+?)(?:<|$)', + r'(?:Location|Locatie)[:\s]+([A-Za-zÀ-ÿ0-9\s,.-]+?)(?: 2: + return location + return "" + + def _extract_description(self, content: str) -> str: + """Extract description""" + pattern = r']*name=["\']description["\'][^>]*content=["\']([^"\']+)["\']' + match = re.search(pattern, content, re.IGNORECASE | re.DOTALL) + return self.clean_text(match.group(1))[:500] if match else "" + + def _extract_category(self, content: str) -> str: + """Extract category from breadcrumb or meta tags""" + pattern = r'class="breadcrumb[^"]*".*?>([A-Za-z\s]+)' + match = re.search(pattern, content, re.IGNORECASE) + if match: + return self.clean_text(match.group(1)) + return self._extract_meta_content(content, 'category') + + def _extract_images(self, content: str) -> List[str]: + """Extract image URLs""" + pattern = r']*src=["\']([^"\']+\.jpe?g|[^"\']+\.png)["\'][^>]*>' + matches = re.findall(pattern, content, re.IGNORECASE) + + images = [] + for match in matches: + if any(skip in match.lower() for skip in ['logo', 'icon', 'placeholder', 'banner']): + continue + full_url = urljoin(BASE_URL, match) + images.append(full_url) + + return images[:5] # Limit to 5 images \ No newline at end of file diff --git a/src/priority.py b/src/priority.py new file mode 100644 index 0000000..0398db6 --- /dev/null +++ b/src/priority.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +""" +Priority calculation for intelligent scraping +""" + +import time +from datetime import datetime +from typing import Optional, Tuple + + +def parse_closing_time(closing_time_str: Optional[str]) -> Optional[int]: + """Parse closing time string to unix timestamp""" + if not closing_time_str: + return None + + try: + # Try various date formats + formats = [ + '%Y-%m-%d %H:%M:%S', + '%Y-%m-%dT%H:%M:%S', + '%Y-%m-%d %H:%M', + '%d-%m-%Y %H:%M', + ] 
+ + for fmt in formats: + try: + dt = datetime.strptime(closing_time_str, fmt) + return int(dt.timestamp()) + except: + continue + + return None + except: + return None + + +def calculate_ttl(closing_timestamp: int, current_time: Optional[int] = None) -> int: + """ + Calculate Time-To-Live (TTL) for cache based on time until closing + + Strategy: + - Closing in > 7 days: Scrape once per day (TTL = 24 hours) + - Closing in 3-7 days: Scrape every 12 hours + - Closing in 1-3 days: Scrape every 6 hours + - Closing in 12-24 hours: Scrape every 3 hours + - Closing in 6-12 hours: Scrape every 2 hours + - Closing in 1-6 hours: Scrape every 30 minutes + - Closing in < 1 hour: Scrape every 10 minutes + - Already closed: TTL = infinite (no need to rescrape) + """ + if current_time is None: + current_time = int(time.time()) + + time_until_close = closing_timestamp - current_time + + # Already closed - very low priority + if time_until_close <= 0: + return 999999999 # Effectively infinite TTL + + # Convert to hours + hours_until_close = time_until_close / 3600 + + if hours_until_close > 168: # > 7 days + return 24 * 3600 # 24 hours + elif hours_until_close > 72: # 3-7 days + return 12 * 3600 # 12 hours + elif hours_until_close > 24: # 1-3 days + return 6 * 3600 # 6 hours + elif hours_until_close > 12: # 12-24 hours + return 3 * 3600 # 3 hours + elif hours_until_close > 6: # 6-12 hours + return 2 * 3600 # 2 hours + elif hours_until_close > 1: # 1-6 hours + return 30 * 60 # 30 minutes + else: # < 1 hour - URGENT! + return 10 * 60 # 10 minutes + + +def calculate_priority( + closing_time_str: Optional[str], + scraped_at: Optional[int], + current_time: Optional[int] = None +) -> Tuple[int, int]: + """ + Calculate scrape priority and next_scrape_at timestamp + + Returns: + (priority, next_scrape_at) + + Priority Scale: + 10000+ = Never scraped (highest priority) + 9000+ = Closing within 1 hour + 8000+ = Closing within 6 hours + 7000+ = Closing within 24 hours + 6000+ = Closing within 3 days + 5000+ = Closing within 7 days + 1000+ = Due for re-scrape (TTL expired) + 0-999 = Recently scraped, not due yet + -1000 = Already closed + """ + if current_time is None: + current_time = int(time.time()) + + # Never scraped = highest priority + if scraped_at is None or scraped_at == 0: + closing_timestamp = parse_closing_time(closing_time_str) + if closing_timestamp: + ttl = calculate_ttl(closing_timestamp, current_time) + next_scrape = current_time # Scrape immediately + time_until_close = closing_timestamp - current_time + + # Boost priority based on urgency + if time_until_close <= 0: + return (10000, next_scrape) # Closed but never scraped + elif time_until_close < 3600: + return (19000, next_scrape) # < 1 hour - CRITICAL + elif time_until_close < 6 * 3600: + return (18000, next_scrape) # < 6 hours + elif time_until_close < 24 * 3600: + return (17000, next_scrape) # < 24 hours + elif time_until_close < 3 * 24 * 3600: + return (16000, next_scrape) # < 3 days + else: + return (15000, next_scrape) # > 3 days but never scraped + else: + return (15000, current_time) # No closing time, high priority anyway + + # Already scraped - calculate based on TTL + closing_timestamp = parse_closing_time(closing_time_str) + + if not closing_timestamp: + # No closing time - scrape once per day + ttl = 24 * 3600 + next_scrape = scraped_at + ttl + time_until_rescrape = next_scrape - current_time + + if time_until_rescrape <= 0: + return (1000, current_time) # Due for rescrape + else: + return (500, next_scrape) # Not due yet + + # Has 
closing time - intelligent TTL + time_until_close = closing_timestamp - current_time + + # Already closed + if time_until_close <= 0: + return (-1000, 999999999) # Very low priority, never rescrape + + # Calculate TTL and next scrape time + ttl = calculate_ttl(closing_timestamp, current_time) + next_scrape = scraped_at + ttl + time_until_rescrape = next_scrape - current_time + + # Priority based on urgency and TTL + if time_until_rescrape <= 0: + # Due for rescrape - urgency-based priority + if time_until_close < 3600: + return (9000, current_time) # < 1 hour - URGENT + elif time_until_close < 6 * 3600: + return (8000, current_time) # < 6 hours + elif time_until_close < 24 * 3600: + return (7000, current_time) # < 24 hours + elif time_until_close < 3 * 24 * 3600: + return (6000, current_time) # < 3 days + elif time_until_close < 7 * 24 * 3600: + return (5000, current_time) # < 7 days + else: + return (1000, current_time) # > 7 days, but due + else: + # Not due yet - low priority + return (min(999, int(time_until_close / 3600)), next_scrape) diff --git a/src/scraper.py b/src/scraper.py new file mode 100644 index 0000000..da61af8 --- /dev/null +++ b/src/scraper.py @@ -0,0 +1,991 @@ +#!/usr/bin/env python3 +""" +Core scaev module for Scaev Auctions +""" +import os +import sqlite3 +import asyncio +import time +import random +import json +import re +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Optional, Set, Tuple +from urllib.parse import urljoin + +from playwright.async_api import async_playwright, Page + +from config import ( + BASE_URL, RATE_LIMIT_SECONDS, MAX_PAGES, DOWNLOAD_IMAGES, IMAGES_DIR, OFFLINE +) +from cache import CacheManager +from parse import DataParser +from graphql_client import ( + fetch_lot_bidding_data, format_bid_data, + fetch_auction_data, format_auction_data, + extract_attributes_from_lot_json, + extract_enriched_attributes +) +from bid_history_client import fetch_bid_history, parse_bid_history +from priority import calculate_priority, parse_closing_time + +class TroostwijkScraper: + """Main scraper class for Troostwijk Auctions""" + + def __init__(self): + self.base_url = BASE_URL + self.cache = CacheManager() + self.parser = DataParser() + self.visited_lots: Set[str] = set() + self.last_request_time = 0 + self.download_images = DOWNLOAD_IMAGES + self.intercepted_api_data: Dict[str, str] = {} # Store intercepted GraphQL responses by lot_id + self.offline = OFFLINE + + async def _download_image(self, session: 'aiohttp.ClientSession', url: str, lot_id: str, index: int) -> Optional[str]: + """Download an image and save it locally (without rate limiting - concurrent within lot)""" + if not self.download_images: + return None + + try: + lot_dir = Path(IMAGES_DIR) / lot_id + lot_dir.mkdir(parents=True, exist_ok=True) + + ext = url.split('.')[-1].split('?')[0] + if ext not in ['jpg', 'jpeg', 'png', 'gif', 'webp']: + ext = 'jpg' + + filepath = lot_dir / f"{index:03d}.{ext}" + if filepath.exists(): + return str(filepath) + + async with session.get(url, timeout=30) as response: + if response.status == 200: + content = await response.read() + with open(filepath, 'wb') as f: + f.write(content) + + with sqlite3.connect(self.cache.db_path) as conn: + conn.execute("UPDATE images\n" + "SET local_path = ?, downloaded = 1\n" + "WHERE lot_id = ? 
AND url = ?\n" + "", (str(filepath), lot_id, url)) + conn.commit() + return str(filepath) + + except Exception as e: + print(f" ERROR downloading image: {e}") + return None + + async def _rate_limit(self): + """ENSURE EXACTLY 0.5s BETWEEN REQUESTS""" + current_time = time.time() + time_since_last = current_time - self.last_request_time + + if time_since_last < RATE_LIMIT_SECONDS: + await asyncio.sleep(RATE_LIMIT_SECONDS - time_since_last) + + self.last_request_time = time.time() + + async def _get_page(self, page: Page, url: str, use_cache: bool = True, fast_mode: bool = False) -> Optional[Dict]: + """Get page content with caching and strict rate limiting + + Args: + fast_mode: If True, use 'domcontentloaded' instead of 'networkidle' for faster loading + (useful for auction listing pages where we just need HTML structure) + + Returns: + Dict with 'content' and 'from_cache' keys + """ + if use_cache: + cache_start = time.time() + cached = self.cache.get(url) + if cached: + cache_time = (time.time() - cache_start) * 1000 + print(f" CACHE HIT: {url} ({cache_time:.0f}ms)") + return {'content': cached['content'], 'from_cache': True} + + # In OFFLINE mode we never fetch from network + if self.offline: + print(f" OFFLINE: cache miss for {url} — skipping fetch") + return None + + await self._rate_limit() + + try: + fetch_start = time.time() + print(f" FETCHING: {url}") + + # Use faster loading strategy for auction pages (we only need HTML, not all assets) + wait_strategy = 'domcontentloaded' if fast_mode else 'networkidle' + await page.goto(url, wait_until=wait_strategy, timeout=30000) + goto_time = time.time() - fetch_start + + # Shorter delay for fast mode + delay = random.uniform(0.1, 0.3) if fast_mode else random.uniform(0.3, 0.7) + await asyncio.sleep(delay) + + content = await page.content() + total_time = time.time() - fetch_start + self.cache.set(url, content, 200) + print(f" [Timing: goto={goto_time:.2f}s, total={total_time:.2f}s, mode={wait_strategy}]") + return {'content': content, 'from_cache': False} + + except Exception as e: + print(f" ERROR: {e}") + self.cache.set(url, "", 500) + return None + + def _extract_auction_urls_from_listing(self, content: str) -> List[str]: + """Extract auction URLs from listing page""" + pattern = r'href=["\']([/]a/[^"\']+)["\']' + matches = re.findall(pattern, content, re.IGNORECASE) + return list(set(urljoin(self.base_url, match) for match in matches)) + + def _extract_lot_urls_from_auction(self, content: str, auction_url: str) -> List[str]: + """Extract lot URLs from an auction page""" + # Try Next.js data first + try: + match = re.search(r']*id="__NEXT_DATA__"[^>]*>(.+?)', content, re.DOTALL) + if match: + data = json.loads(match.group(1)) + lots = data.get('props', {}).get('pageProps', {}).get('auction', {}).get('lots', []) + if lots: + return list(set(f"{self.base_url}/l/{lot.get('urlSlug', '')}" + for lot in lots if lot.get('urlSlug'))) + except: + pass + + # Fallback to HTML parsing + pattern = r'href=["\']([/]l/[^"\']+)["\']' + matches = re.findall(pattern, content, re.IGNORECASE) + return list(set(urljoin(self.base_url, match) for match in matches)) + + async def crawl_listing_page(self, page: Page, page_num: int) -> List[str]: + """Crawl a single listing page and return auction URLs""" + url = f"{self.base_url}/auctions?page={page_num}" + print(f"\n{'='*60}") + print(f"LISTING PAGE {page_num}: {url}") + print(f"{'='*60}") + + # Use fast mode - we only need HTML structure for link extraction + result = await self._get_page(page, url, 
fast_mode=True) + if not result: + return [] + + auction_urls = self._extract_auction_urls_from_listing(result['content']) + print(f"→ Found {len(auction_urls)} auction URLs") + return auction_urls + + async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]: + """Crawl an auction page and extract lot URLs""" + # Use fast mode for auction pages - we only need the HTML structure, not all assets + result = await self._get_page(page, auction_url, fast_mode=True) + if not result: + return [] + + content = result['content'] + parse_start = time.time() + page_data = self.parser.parse_page(content, auction_url) + parse_time = (time.time() - parse_start) * 1000 + + if page_data and page_data.get('type') == 'auction': + save_start = time.time() + self.cache.save_auction(page_data) + save_time = (time.time() - save_start) * 1000 + print(f" → Auction: {page_data.get('title', '')[:50]}... ({page_data.get('lots_count', 0)} lots)") + print(f" [Parse: {parse_time:.0f}ms, Save: {save_time:.0f}ms]") + + extract_start = time.time() + lot_urls = self._extract_lot_urls_from_auction(content, auction_url) + extract_time = (time.time() - extract_start) * 1000 + print(f" [Extract lots: {extract_time:.0f}ms]") + return lot_urls + + async def crawl_page(self, page: Page, url: str) -> Optional[Dict]: + """Crawl a page (auction or lot)""" + if url in self.visited_lots: + print(f" → Skipping (already visited): {url}") + return None + + page_id = self.parser.extract_lot_id(url) + print(f"\n[PAGE {page_id}]") + + result = await self._get_page(page, url) + if not result: + # OFFLINE fallback: try to construct page data directly from DB + if self.offline: + import sqlite3 + conn = sqlite3.connect(self.cache.db_path) + cur = conn.cursor() + # Try lot first + cur.execute("SELECT * FROM lots WHERE url = ?", (url,)) + lot_row = cur.fetchone() + if lot_row: + # Build a dict using column names + col_names = [d[0] for d in cur.description] + lot_dict = dict(zip(col_names, lot_row)) + conn.close() + page_data = { + 'type': 'lot', + 'lot_id': lot_dict.get('lot_id'), + 'auction_id': lot_dict.get('auction_id'), + 'url': lot_dict.get('url') or url, + 'title': lot_dict.get('title') or '', + 'current_bid': lot_dict.get('current_bid') or '', + 'bid_count': lot_dict.get('bid_count') or 0, + 'closing_time': lot_dict.get('closing_time') or '', + 'viewing_time': lot_dict.get('viewing_time') or '', + 'pickup_date': lot_dict.get('pickup_date') or '', + 'location': lot_dict.get('location') or '', + 'description': lot_dict.get('description') or '', + 'category': lot_dict.get('category') or '', + 'status': lot_dict.get('status') or '', + 'brand': lot_dict.get('brand') or '', + 'model': lot_dict.get('model') or '', + 'attributes_json': lot_dict.get('attributes_json') or '', + 'first_bid_time': lot_dict.get('first_bid_time'), + 'last_bid_time': lot_dict.get('last_bid_time'), + 'bid_velocity': lot_dict.get('bid_velocity'), + 'followers_count': lot_dict.get('followers_count') or 0, + 'estimated_min_price': lot_dict.get('estimated_min_price'), + 'estimated_max_price': lot_dict.get('estimated_max_price'), + 'lot_condition': lot_dict.get('lot_condition') or '', + 'appearance': lot_dict.get('appearance') or '', + 'scraped_at': lot_dict.get('scraped_at') or '', + } + print(" OFFLINE: using DB record for lot") + self.visited_lots.add(url) + return page_data + + # Try auction by URL + cur.execute("SELECT * FROM auctions WHERE url = ?", (url,)) + auc_row = cur.fetchone() + if auc_row: + col_names = [d[0] for d in cur.description] + 
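+                    # Build a dict using column names, mirroring the lot branch above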
auc_dict = dict(zip(col_names, auc_row)) + conn.close() + page_data = { + 'type': 'auction', + 'auction_id': auc_dict.get('auction_id'), + 'url': auc_dict.get('url') or url, + 'title': auc_dict.get('title') or '', + 'location': auc_dict.get('location') or '', + 'lots_count': auc_dict.get('lots_count') or 0, + 'first_lot_closing_time': auc_dict.get('first_lot_closing_time') or '', + 'scraped_at': auc_dict.get('scraped_at') or '', + } + print(" OFFLINE: using DB record for auction") + self.visited_lots.add(url) + return page_data + + conn.close() + return None + + content = result['content'] + from_cache = result['from_cache'] + page_data = self.parser.parse_page(content, url) + if not page_data: + return None + + self.visited_lots.add(url) + + if page_data.get('type') == 'auction': + print(f" Type: AUCTION") + print(f" Title: {page_data.get('title', 'N/A')[:60]}...") + print(f" Location: {page_data.get('location', 'N/A')}") + print(f" Lots: {page_data.get('lots_count', 0)}") + self.cache.save_auction(page_data) + + elif page_data.get('type') == 'lot': + print(f" Type: LOT") + print(f" Title: {page_data.get('title', 'N/A')[:60]}...") + + # Extract ALL data from __NEXT_DATA__ lot object + import json + import re + lot_json = None + lot_uuid = None + + match = re.search(r']*id="__NEXT_DATA__"[^>]*>(.+?)', content, re.DOTALL) + if match: + try: + data = json.loads(match.group(1)) + lot_json = data.get('props', {}).get('pageProps', {}).get('lot', {}) + if lot_json: + # Basic attributes + attrs = extract_attributes_from_lot_json(lot_json) + page_data.update(attrs) + + # Enriched attributes (year, condition, etc.) + enriched = extract_enriched_attributes(lot_json, page_data) + page_data.update(enriched) + + # Get lot UUID for bid history + lot_uuid = lot_json.get('id') + except: + pass + + # Fetch all API data concurrently (or use intercepted/cached data) + lot_id = page_data.get('lot_id') + auction_id = page_data.get('auction_id') + import sqlite3 + + # Step 1: Check if we intercepted API data during page load + intercepted_data = None + if lot_id in self.intercepted_api_data: + print(f" Using intercepted API data (free!)") + try: + intercepted_json = self.intercepted_api_data[lot_id] + intercepted_data = json.loads(intercepted_json) + # Store the raw JSON for future offline use + page_data['api_data_json'] = intercepted_json + # Extract lot data from intercepted response + if 'data' in intercepted_data and 'lot' in intercepted_data['data']: + lot_api_data = intercepted_data['data']['lot'] + # Format it as if it came from our fetch_lot_bidding_data + bidding_data = {'lot': lot_api_data} + from_cache = False # We have fresh data + except Exception as e: + print(f" Error parsing intercepted data: {e}") + intercepted_data = None + + if intercepted_data: + # We got free API data from interception - skip the fetch logic + pass + elif from_cache: + # Check if we have cached API data in database + conn = sqlite3.connect(self.cache.db_path) + cursor = conn.cursor() + cursor.execute(""" + SELECT followers_count, estimated_min_price, current_bid, bid_count, closing_time, status + FROM lots WHERE lot_id = ? 
+ """, (lot_id,)) + existing = cursor.fetchone() + conn.close() + + # Data quality check: Must have followers_count AND closing_time to be considered "complete" + # This prevents using stale records like old "0 bids" entries + is_complete = (existing and + existing[0] is not None and # followers_count exists + existing[4] is not None and # closing_time exists + existing[4] != '') # closing_time is not empty + + if is_complete: + print(f" Using cached API data") + page_data['followers_count'] = existing[0] + page_data['estimated_min_price'] = existing[1] + page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids') + page_data['bid_count'] = existing[3] or 0 + page_data['closing_time'] = existing[4] # Add closing_time + page_data['status'] = existing[5] or '' # Add status + bidding_data = None + bid_history_data = None + else: + print(f" Fetching lot data from API (concurrent)...") + # Make concurrent API calls + api_tasks = [fetch_lot_bidding_data(lot_id)] + if auction_id: + api_tasks.append(fetch_auction_data(auction_id)) + results = await asyncio.gather(*api_tasks, return_exceptions=True) + bidding_data = results[0] if results and not isinstance(results[0], Exception) else None + bid_history_data = None # Will fetch after we have lot_uuid + else: + # Fresh page fetch - make concurrent API calls for all data + if not self.offline: + print(f" Fetching lot data from API (concurrent)...") + api_tasks = [fetch_lot_bidding_data(lot_id)] + task_map = {'bidding': 0} # Track which index corresponds to which task + + # Add auction data fetch if we need viewing/pickup times + if auction_id: + conn = sqlite3.connect(self.cache.db_path) + cursor = conn.cursor() + cursor.execute(""" + SELECT viewing_time, pickup_date FROM lots WHERE lot_id = ? 
+ """, (lot_id,)) + times = cursor.fetchone() + conn.close() + has_times = times and (times[0] or times[1]) + + if not has_times: + task_map['auction'] = len(api_tasks) + api_tasks.append(fetch_auction_data(auction_id)) + + # Add bid history fetch if we have lot_uuid and expect bids + if lot_uuid: + task_map['bid_history'] = len(api_tasks) + api_tasks.append(fetch_bid_history(lot_uuid)) + + # Execute all API calls concurrently + results = await asyncio.gather(*api_tasks, return_exceptions=True) + bidding_data = results[task_map['bidding']] if results and not isinstance(results[task_map['bidding']], Exception) else None + + # Store raw API JSON for offline replay + if bidding_data: + page_data['api_data_json'] = json.dumps(bidding_data) + + # Process auction data if it was fetched + if 'auction' in task_map and len(results) > task_map['auction']: + auction_data = results[task_map['auction']] + if not isinstance(auction_data, Exception) and auction_data: + auction_times = format_auction_data(auction_data) + page_data.update(auction_times) + + # Process bid history if it was fetched + bid_history_data = None + if 'bid_history' in task_map and len(results) > task_map['bid_history']: + bid_history_data = results[task_map['bid_history']] + if isinstance(bid_history_data, Exception): + bid_history_data = None + + if bidding_data: + formatted_data = format_bid_data(bidding_data) + + # Merge data intelligently - don't overwrite existing fields + # Parser (from __NEXT_DATA__) has: description, category, images + # API has: current_bid, bid_count, closing_time, status, followers, estimates + # Keep parser data, enhance with API data + for key, value in formatted_data.items(): + # Only update if current value is missing/empty + current_value = page_data.get(key) + if current_value is None or current_value == '' or current_value == 0 or current_value == 'No bids': + page_data[key] = value + # Special case: always update bid_count if API has higher value + elif key == 'bid_count' and isinstance(value, int) and value > current_value: + page_data[key] = value + + # Enhanced logging with new intelligence fields + print(f" Bid: {page_data.get('current_bid', 'N/A')}") + print(f" Status: {page_data.get('status', 'N/A')}") + + # NEW: Show followers count (watch count) + followers = page_data.get('followers_count', 0) + if followers > 0: + print(f" Followers: {followers} watching") + + # NEW: Show estimated prices for value assessment + est_min = page_data.get('estimated_min_price') + est_max = page_data.get('estimated_max_price') + if est_min or est_max: + if est_min and est_max: + print(f" Estimate: EUR {est_min:.2f} - EUR {est_max:.2f}") + + # Calculate and show value gap for bargain detection + current_bid_str = page_data.get('current_bid', '') + if 'EUR' in current_bid_str and 'No bids' not in current_bid_str: + try: + current_bid_val = float(current_bid_str.replace('EUR ', '').replace(',', '')) + value_gap = est_min - current_bid_val + if value_gap > 0: + gap_pct = (value_gap / est_min) * 100 + if gap_pct > 20: + print(f" >> BARGAIN: {gap_pct:.0f}% below estimate!") + else: + print(f" Value gap: {gap_pct:.0f}% below estimate") + except: + pass + elif est_min: + print(f" Estimate: From EUR {est_min:.2f}") + elif est_max: + print(f" Estimate: Up to EUR {est_max:.2f}") + + # NEW: Show condition information + condition = page_data.get('lot_condition') + if condition: + print(f" Condition: {condition}") + + # Show manufacturer/brand if available + brand = page_data.get('brand') or page_data.get('manufacturer') 
+ model = page_data.get('model') + year = page_data.get('year_manufactured') + if brand or model or year: + parts = [] + if year: + parts.append(str(year)) + if brand: + parts.append(brand) + if model: + parts.append(model) + print(f" Item: {' '.join(parts)}") + + # Extract bid increment from nextBidStepInCents + lot_details_lot = bidding_data.get('lot', {}) + next_step_cents = lot_details_lot.get('nextBidStepInCents') + if next_step_cents: + page_data['bid_increment'] = next_step_cents / 100.0 + + # Get lot UUID if not already extracted + if not lot_uuid: + lot_uuid = lot_details_lot.get('id') + + # Process bid history if we fetched it concurrently + if not from_cache and 'bid_history_data' in locals() and bid_history_data and page_data.get('bid_count', 0) > 0: + bid_data = parse_bid_history(bid_history_data, lot_id) + page_data.update(bid_data) + print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour") + self.cache.save_bid_history(lot_id, bid_data['bid_records']) + elif not from_cache and lot_uuid and page_data.get('bid_count', 0) > 0: + # Fallback: fetch bid history if we didn't get it in the concurrent batch + # (This happens when lot_uuid wasn't available before the first API call) + print(f" Fetching bid history...") + bid_history = await fetch_bid_history(lot_uuid) + if bid_history: + bid_data = parse_bid_history(bid_history, lot_id) + page_data.update(bid_data) + print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour") + self.cache.save_bid_history(lot_id, bid_data['bid_records']) + elif from_cache and page_data.get('bid_count', 0) > 0: + # Check if cached bid history exists + conn = sqlite3.connect(self.cache.db_path) + cursor = conn.cursor() + cursor.execute(""" + SELECT COUNT(*) FROM bid_history WHERE lot_id = ? + """, (lot_id,)) + has_history = cursor.fetchone()[0] > 0 + conn.close() + if has_history: + print(f" Bid history cached") + else: + print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)") + + print(f" Location: {page_data.get('location', 'N/A')}") + + # Calculate and store priority for next scrape + current_time = int(time.time()) + priority, next_scrape = calculate_priority( + page_data.get('closing_time'), + current_time, # Just scraped now + current_time + ) + page_data['scrape_priority'] = priority + page_data['next_scrape_at'] = next_scrape + + self.cache.save_lot(page_data) + + images = page_data.get('images', []) + if images: + self.cache.save_images(page_data['lot_id'], images) + print(f" Images: {len(images)}") + + if self.download_images: + # Check which images are already downloaded + import sqlite3 + conn = sqlite3.connect(self.cache.db_path) + cursor = conn.cursor() + cursor.execute(""" + SELECT url FROM images + WHERE lot_id = ? 
AND downloaded = 1 + """, (page_data['lot_id'],)) + already_downloaded = {row[0] for row in cursor.fetchall()} + conn.close() + + # Only download missing images + images_to_download = [ + (i, img_url) for i, img_url in enumerate(images) + if img_url not in already_downloaded + ] + + if images_to_download: + import aiohttp + async with aiohttp.ClientSession() as session: + total = len(images_to_download) + + async def dl(i, img_url): + path = await self._download_image(session, img_url, page_data['lot_id'], i) + return i, img_url, path + + tasks = [ + asyncio.create_task(dl(i, img_url)) + for i, img_url in images_to_download + ] + + completed = 0 + succeeded: List[int] = [] + # In-place progress + print(f" Downloading images: 0/{total}", end="\r", flush=True) + for coro in asyncio.as_completed(tasks): + try: + i, img_url, path = await coro + if path: + succeeded.append(i) + except Exception: + pass + finally: + completed += 1 + print(f" Downloading images: {completed}/{total}", end="\r", flush=True) + + # Ensure next prints start on a new line + print() + print(f" Downloaded: {len(succeeded)}/{total} new images") + if succeeded: + succeeded.sort() + # Show which indexes were downloaded + idx_preview = ", ".join(str(x) for x in succeeded[:20]) + more = "" if len(succeeded) <= 20 else f" (+{len(succeeded)-20} more)" + print(f" Indexes: {idx_preview}{more}") + else: + print(f" All {len(images)} images already cached") + + return page_data + + def _prioritize_lots(self, lot_urls: List[str]) -> List[Tuple[int, str, str]]: + """ + Prioritize lots based on closing time and scrape history + + Returns list of (priority, url, description) tuples sorted by priority (highest first) + """ + import sqlite3 + + prioritized = [] + current_time = int(time.time()) + + conn = sqlite3.connect(self.cache.db_path) + cursor = conn.cursor() + + for url in lot_urls: + # Extract lot_id from URL + lot_id = self.parser.extract_lot_id(url) + + # Try to get existing data from database + cursor.execute(""" + SELECT closing_time, scraped_at, scrape_priority, next_scrape_at + FROM lots WHERE lot_id = ? OR url = ? 
+ """, (lot_id, url)) + + row = cursor.fetchone() + + if row: + closing_time, scraped_at, existing_priority, next_scrape_at = row + + # Parse scraped_at (it might be a string timestamp) + if isinstance(scraped_at, str): + try: + scraped_at = int(datetime.strptime(scraped_at, '%Y-%m-%d %H:%M:%S').timestamp()) + except: + scraped_at = None + else: + closing_time = None + scraped_at = None + + # Calculate priority + priority, next_scrape = calculate_priority(closing_time, scraped_at, current_time) + + # Create description + if scraped_at is None: + desc = "Never scraped" + elif priority >= 15000: + desc = "Never scraped (high urgency)" + elif priority >= 9000: + desc = "URGENT: <1hr to close" + elif priority >= 8000: + desc = "High: <6hr to close" + elif priority >= 7000: + desc = "Medium: <24hr to close" + elif priority >= 5000: + desc = "Normal: <7d to close" + elif priority >= 1000: + desc = "Due for rescrape" + elif priority < 0: + desc = "Already closed" + else: + desc = f"Recently scraped" + + prioritized.append((priority, url, desc)) + + conn.close() + + # Sort by priority (highest first) + prioritized.sort(key=lambda x: x[0], reverse=True) + + return prioritized + + async def crawl_auctions(self, max_pages: int = MAX_PAGES) -> List[Dict]: + """Main crawl function""" + if self.offline: + print("Launching OFFLINE crawl (no network requests)") + # Gather URLs from database + import sqlite3 + conn = sqlite3.connect(self.cache.db_path) + cur = conn.cursor() + cur.execute("SELECT DISTINCT url FROM auctions") + auction_urls = [r[0] for r in cur.fetchall() if r and r[0]] + cur.execute("SELECT DISTINCT url FROM lots") + lot_urls = [r[0] for r in cur.fetchall() if r and r[0]] + conn.close() + + print(f" OFFLINE: {len(auction_urls)} auctions and {len(lot_urls)} lots in DB") + + results: List[Dict] = [] + # Optionally process auctions (parse cached HTML if exists or DB fallback) + for i, auc_url in enumerate(auction_urls): + print(f"\n[AUC {i+1:>3}/{len(auction_urls)}] ", end="") + page_data = await self.crawl_page(page=None, url=auc_url) + if page_data: + results.append(page_data) + + print("\n" + "="*60) + print("PHASE OFFLINE: PROCESSING LOT PAGES FROM DB/CACHE") + print("="*60) + for i, lot_url in enumerate(lot_urls): + print(f"\n[LOT {i+1:>3}/{len(lot_urls)}] ", end="") + page_data = await self.crawl_page(page=None, url=lot_url) + if page_data: + results.append(page_data) + + return results + + async with async_playwright() as p: + print("Launching browser...") + browser = await p.chromium.launch( + headless=True, + args=[ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-blink-features=AutomationControlled' + ] + ) + + page = await browser.new_page( + viewport={'width': 1920, 'height': 1080}, + user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' + ) + + await page.set_extra_http_headers({ + 'Accept-Language': 'en-US,en;q=0.9', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' + }) + + # Set up COMPREHENSIVE resource interception (cache EVERYTHING) + resource_stats = {'cached': 0, 'fetched': 0, 'failed': 0} + request_bodies = {} # Store POST request bodies by URL for cache key generation + + async def handle_request(request): + """Intercept requests to capture POST bodies for GraphQL""" + try: + if request.method == 'POST' and 'graphql' in request.url: + # Store the POST body + post_data = request.post_data + if post_data: + # Create hash of POST body for cache key + 
import hashlib + body_hash = hashlib.md5(post_data.encode() if isinstance(post_data, str) else post_data).hexdigest()[:16] + cache_key = f"{request.url}#{body_hash}" + request_bodies[request.url] = (cache_key, post_data) + except: + pass + + page.on('request', handle_request) + + async def handle_response(response): + """Intercept ALL resources and cache them""" + try: + url = response.url + status = response.status + + # Get content type + headers = await response.all_headers() + content_type = headers.get('content-type', '').split(';')[0].strip() + + # Determine if we should cache this resource + cacheable_types = [ + 'text/html', 'text/css', 'text/javascript', 'application/javascript', + 'application/json', 'application/x-javascript', 'image/', 'font/', + 'application/font', 'video/', 'audio/', 'application/xml', 'text/xml', + 'image/svg+xml' + ] + + should_cache = any(content_type.startswith(ct) for ct in cacheable_types) + + if should_cache and status == 200: + try: + body = await response.body() + + # Determine cache key (use composite key for GraphQL POST requests) + cache_key = None + if 'graphql' in url and url in request_bodies: + cache_key, post_data = request_bodies[url] + + # Save to resource cache + self.cache.save_resource( + url=url, + content=body, + content_type=content_type, + status_code=status, + headers=headers, + cache_key=cache_key + ) + resource_stats['cached'] += 1 + + # Special handling for GraphQL responses + if 'graphql' in url and 'application/json' in content_type: + try: + body_text = body.decode('utf-8') + data = json.loads(body_text) + + # Check if this is a lot details query + if 'data' in data and 'lot' in data.get('data', {}): + lot_data = data['data']['lot'] + lot_slug = lot_data.get('urlSlug', '') + if lot_slug: + self.intercepted_api_data[lot_slug] = body_text + print(f" >> Intercepted GraphQL for: {lot_slug}") + except: + pass + + except Exception as e: + resource_stats['failed'] += 1 + else: + resource_stats['fetched'] += 1 + + except Exception as e: + # Silent fail - interception is opportunistic + pass + + page.on('response', handle_response) + + all_auction_urls = [] + all_lot_urls = [] + + # Phase 1: Collect auction URLs + print("\n" + "="*60) + print("PHASE 1: COLLECTING AUCTION URLs FROM LISTING PAGES") + print("="*60) + + for page_num in range(1, max_pages + 1): + auction_urls = await self.crawl_listing_page(page, page_num) + if not auction_urls: + print(f"No auctions found on page {page_num}, stopping") + break + all_auction_urls.extend(auction_urls) + print(f" → Total auctions collected so far: {len(all_auction_urls)}") + + all_auction_urls = list(set(all_auction_urls)) + print(f"\n{'='*60}") + print(f"PHASE 1 COMPLETE: {len(all_auction_urls)} UNIQUE AUCTIONS") + print(f"{'='*60}") + + # Phase 2: Extract lot URLs from each auction + print("\n" + "="*60) + print("PHASE 2: EXTRACTING LOT URLs FROM AUCTIONS") + print("="*60) + + phase2_start = time.time() + for i, auction_url in enumerate(all_auction_urls): + auction_start = time.time() + auction_id = self.parser.extract_lot_id(auction_url) + print(f"\n[{i+1:>3}/{len(all_auction_urls)}] {auction_id}") + lot_urls = await self.crawl_auction_for_lots(page, auction_url) + auction_elapsed = time.time() - auction_start + if lot_urls: + all_lot_urls.extend(lot_urls) + print(f" → Found {len(lot_urls)} lots (took {auction_elapsed:.2f}s)") + else: + print(f" → No lots found (took {auction_elapsed:.2f}s)") + + # Progress estimation + avg_time = (time.time() - phase2_start) / (i + 1) + remaining = 
len(all_auction_urls) - (i + 1) + eta_seconds = avg_time * remaining + eta_minutes = eta_seconds / 60 + print(f" → Progress: {len(all_lot_urls)} lots total | ETA: {eta_minutes:.1f} min ({avg_time:.2f}s/auction)") + + all_lot_urls = list(set(all_lot_urls)) + print(f"\n{'='*60}") + print(f"PHASE 2 COMPLETE: {len(all_lot_urls)} UNIQUE LOTS") + print(f"{'='*60}") + + # Phase 2.5: Sort lots by priority (closing time + TTL) + print("\n" + "="*60) + print("PHASE 2.5: CALCULATING SCRAPE PRIORITIES") + print("="*60) + + sorted_lots = self._prioritize_lots(all_lot_urls) + print(f" > Sorted {len(sorted_lots)} lots by priority") + print(f" > Highest priority: {sorted_lots[0][2] if sorted_lots else 'N/A'}") + print(f" > Lowest priority: {sorted_lots[-1][2] if sorted_lots else 'N/A'}") + + # Phase 3: Scrape each lot page (in priority order) + print("\n" + "="*60) + print("PHASE 3: SCRAPING LOTS (PRIORITY ORDER)") + print("="*60) + + results = [] + for i, (priority, lot_url, priority_desc) in enumerate(sorted_lots): + print(f"\n[{i+1:>3}/{len(sorted_lots)}] [P:{priority}] ", end="") + page_data = await self.crawl_page(page, lot_url) + if page_data: + results.append(page_data) + + await browser.close() + + # Print resource caching statistics + print(f"\n{'='*60}") + print(f"RESOURCE CACHE STATISTICS") + print(f"{'='*60}") + print(f" Cached: {resource_stats['cached']} resources") + print(f" Fetched (not cached): {resource_stats['fetched']}") + print(f" Failed: {resource_stats['failed']}") + print(f"{'='*60}") + + return results + + def export_to_files(self) -> Dict[str, str]: + """Export database to CSV/JSON files""" + import sqlite3 + import json + import csv + from datetime import datetime + + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + output_dir = os.path.dirname(self.cache.db_path) + + conn = sqlite3.connect(self.cache.db_path) + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + files = {} + + # Export auctions + cursor.execute("SELECT * FROM auctions") + auctions = [dict(row) for row in cursor.fetchall()] + + auctions_csv = os.path.join(output_dir, f'auctions_{timestamp}.csv') + auctions_json = os.path.join(output_dir, f'auctions_{timestamp}.json') + + if auctions: + with open(auctions_csv, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=auctions[0].keys()) + writer.writeheader() + writer.writerows(auctions) + + with open(auctions_json, 'w', encoding='utf-8') as f: + json.dump(auctions, f, indent=2, ensure_ascii=False) + + files['auctions_csv'] = auctions_csv + files['auctions_json'] = auctions_json + print(f" Exported {len(auctions)} auctions") + + # Export lots + cursor.execute("SELECT * FROM lots") + lots = [dict(row) for row in cursor.fetchall()] + + lots_csv = os.path.join(output_dir, f'lots_{timestamp}.csv') + lots_json = os.path.join(output_dir, f'lots_{timestamp}.json') + + if lots: + with open(lots_csv, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=lots[0].keys()) + writer.writeheader() + writer.writerows(lots) + + with open(lots_json, 'w', encoding='utf-8') as f: + json.dump(lots, f, indent=2, ensure_ascii=False) + + files['lots_csv'] = lots_csv + files['lots_json'] = lots_json + print(f" Exported {len(lots)} lots") + + conn.close() + return files \ No newline at end of file diff --git a/src/test.py b/src/test.py new file mode 100644 index 0000000..bbb813c --- /dev/null +++ b/src/test.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +""" +Test module for debugging extraction patterns +""" + +import sys 
+import sqlite3 +import time +import re +import json +from datetime import datetime +from pathlib import Path +from typing import Optional + +import config +from cache import CacheManager +from scraper import TroostwijkScraper + + +def test_extraction( + test_url: str = "https://www.troostwijkauctions.com/a/machines-en-toebehoren-%28hout-en-kunststofverwerking-handlingapparatuur-bouwmachines-landbouwindustrie%29-oost-europa-december-A7-35847"): + """Test extraction on a specific cached URL to debug patterns""" + scraper = TroostwijkScraper() + + # Try to get from cache + cached = scraper.cache.get(test_url) + if not cached: + print(f"ERROR: URL not found in cache: {test_url}") + print(f"\nAvailable cached URLs:") + with sqlite3.connect(config.CACHE_DB) as conn: + cursor = conn.execute("SELECT url FROM cache ORDER BY timestamp DESC LIMIT 10") + for row in cursor.fetchall(): + print(f" - {row[0]}") + return + + content = cached['content'] + print(f"\n{'=' * 60}") + print(f"TESTING EXTRACTION FROM: {test_url}") + print(f"{'=' * 60}") + print(f"Content length: {len(content)} chars") + print(f"Cache age: {(time.time() - cached['timestamp']) / 3600:.1f} hours") + + # Test each extraction method + page_data = scraper._parse_page(content, test_url) + + print(f"\n{'=' * 60}") + print("EXTRACTED DATA:") + print(f"{'=' * 60}") + + if not page_data: + print("ERROR: No data extracted!") + return + + print(f"Page Type: {page_data.get('type', 'UNKNOWN')}") + print() + + for key, value in page_data.items(): + if key == 'images': + print(f"{key:.<20}: {len(value)} images") + for img in value[:3]: + print(f"{'':.<20} - {img}") + elif key == 'lots': + print(f"{key:.<20}: {len(value)} lots in auction") + else: + display_value = str(value)[:100] if value else "(empty)" + # Handle Unicode characters that Windows console can't display + try: + print(f"{key:.<20}: {display_value}") + except UnicodeEncodeError: + safe_value = display_value.encode('ascii', 'replace').decode('ascii') + print(f"{key:.<20}: {safe_value}") + + # Validation checks + print(f"\n{'=' * 60}") + print("VALIDATION CHECKS:") + print(f"{'=' * 60}") + + issues = [] + + if page_data.get('type') == 'lot': + if page_data.get('current_bid') in ['Huidig bod', 'Current bid', '€0', '']: + issues.append("[!] Current bid not extracted correctly") + else: + print("[OK] Current bid looks valid:", page_data.get('current_bid')) + + if page_data.get('location') in ['Locatie', 'Location', '']: + issues.append("[!] Location not extracted correctly") + else: + print("[OK] Location looks valid:", page_data.get('location')) + + if page_data.get('title') in ['', '...']: + issues.append("[!] Title not extracted correctly") + else: + print("[OK] Title looks valid:", page_data.get('title', '')[:50]) + + if issues: + print(f"\n[ISSUES FOUND]") + for issue in issues: + print(f" {issue}") + else: + print(f"\n[ALL FIELDS EXTRACTED SUCCESSFULLY!]") + + # Debug: Show raw HTML snippets for problematic fields + print(f"\n{'=' * 60}") + print("DEBUG: RAW HTML SNIPPETS") + print(f"{'=' * 60}") + + # Look for bid-related content + print(f"\n1. Bid patterns in content:") + bid_matches = re.findall(r'.{0,50}(€[\d,.\s]+).{0,50}', content[:10000]) + for i, match in enumerate(bid_matches[:5], 1): + print(f" {i}. {match}") + + # Look for location content + print(f"\n2. Location patterns in content:") + loc_matches = re.findall(r'.{0,30}(Locatie|Location).{0,100}', content, re.IGNORECASE) + for i, match in enumerate(loc_matches[:5], 1): + print(f" {i}. 
...{match}...")
+
+    # Look for JSON data
+    print(f"\n3. JSON/Script data containing auction info:")
+    json_patterns = [
+        r'"currentBid"[^,}]+',
+        r'"location"[^,}]+',
+        r'"price"[^,}]+',
+        r'"addressLocality"[^,}]+'
+    ]
+    for pattern in json_patterns:
+        matches = re.findall(pattern, content[:50000], re.IGNORECASE)
+        if matches:
+            print(f"   {pattern}: {matches[:3]}")
+
+    # Look for script tags with structured data
+    script_matches = re.findall(r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', content, re.DOTALL)
+    if script_matches:
+        print(f"\n4. Structured data (JSON-LD) found:")
+        for i, script in enumerate(script_matches[:2], 1):
+            try:
+                data = json.loads(script)
+                print(f"   Script {i}: {json.dumps(data, indent=6)[:500]}...")
+            except json.JSONDecodeError:
+                print(f"   Script {i}: {script[:300]}...")
diff --git a/test/test_cache_behavior.py b/test/test_cache_behavior.py
new file mode 100644
index 0000000..62c1e18
--- /dev/null
+++ b/test/test_cache_behavior.py
@@ -0,0 +1,303 @@
+#!/usr/bin/env python3
+"""
+Test cache behavior - verify page is only fetched once and data persists offline
+"""
+
+import sys
+import os
+import asyncio
+import sqlite3
+import time
+from pathlib import Path
+
+# Add src to path
+sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
+
+from cache import CacheManager
+from scraper import TroostwijkScraper
+import config
+
+
+class TestCacheBehavior:
+    """Test suite for cache and offline functionality"""
+
+    def __init__(self):
+        self.test_db = "test_cache.db"
+        self.original_db = config.CACHE_DB
+        self.cache = None
+        self.scraper = None
+
+    def setup(self):
+        """Setup test environment"""
+        print("\n" + "="*60)
+        print("TEST SETUP")
+        print("="*60)
+
+        # Use test database
+        config.CACHE_DB = self.test_db
+
+        # Ensure offline mode is disabled for tests
+        config.OFFLINE = False
+
+        # Clean up old test database
+        if os.path.exists(self.test_db):
+            os.remove(self.test_db)
+            print(f"  * Removed old test database")
+
+        # Initialize cache and scraper
+        self.cache = CacheManager()
+        self.scraper = TroostwijkScraper()
+        self.scraper.offline = False  # Explicitly disable offline mode
+
+        print(f"  * Created test database: {self.test_db}")
+        print(f"  * Initialized cache and scraper")
+        print(f"  * Offline mode: DISABLED")
+
+    def teardown(self):
+        """Cleanup test environment"""
+        print("\n" + "="*60)
+        print("TEST TEARDOWN")
+        print("="*60)
+
+        # Restore original database path
+        config.CACHE_DB = self.original_db
+
+        # Keep test database for inspection
+        print(f"  * Test database preserved: {self.test_db}")
+        print(f"  * Restored original database path")
+
+    async def test_page_fetched_once(self):
+        """Test that a page is only fetched from network once"""
+        print("\n" + "="*60)
+        print("TEST 1: Page Fetched Only Once")
+        print("="*60)
+
+        # Pick a real lot URL to test with
+        test_url = "https://www.troostwijkauctions.com/l/bmw-x5-xdrive40d-high-executive-m-sport-a8-286pk-2019-A1-26955-7"
+
+        print(f"\nTest URL: {test_url}")
+
+        # First visit - should fetch from network
+        print("\n--- FIRST VISIT (should fetch from network) ---")
+        start_time = time.time()
+
+        async with asyncio.timeout(60):  # 60 second timeout
+            page_data_1 = await self._scrape_single_page(test_url)
+
+        first_visit_time = time.time() - start_time
+
+        if not page_data_1:
+            print("  [FAIL] First visit returned no data")
+            return False
+
+        print(f"  [OK] First visit completed in {first_visit_time:.2f}s")
+        print(f"  [OK] Got lot data: {page_data_1.get('title', 'N/A')[:60]}...")
+
+        # Check closing time was captured
+        
closing_time_1 = page_data_1.get('closing_time') + print(f" [OK] Closing time: {closing_time_1}") + + # Second visit - should use cache + print("\n--- SECOND VISIT (should use cache) ---") + start_time = time.time() + + async with asyncio.timeout(30): # Should be much faster + page_data_2 = await self._scrape_single_page(test_url) + + second_visit_time = time.time() - start_time + + if not page_data_2: + print(" [FAIL] Second visit returned no data") + return False + + print(f" [OK] Second visit completed in {second_visit_time:.2f}s") + + # Verify data matches + if page_data_1.get('lot_id') != page_data_2.get('lot_id'): + print(f" [FAIL] Lot IDs don't match") + return False + + closing_time_2 = page_data_2.get('closing_time') + print(f" [OK] Closing time: {closing_time_2}") + + if closing_time_1 != closing_time_2: + print(f" [FAIL] Closing times don't match!") + print(f" First: {closing_time_1}") + print(f" Second: {closing_time_2}") + return False + + # Verify second visit was significantly faster (used cache) + if second_visit_time >= first_visit_time * 0.5: + print(f" [WARN] Second visit not significantly faster") + print(f" First: {first_visit_time:.2f}s") + print(f" Second: {second_visit_time:.2f}s") + else: + print(f" [OK] Second visit was {(first_visit_time / second_visit_time):.1f}x faster (cache working!)") + + # Verify resource cache has entries + conn = sqlite3.connect(self.test_db) + cursor = conn.execute("SELECT COUNT(*) FROM resource_cache") + resource_count = cursor.fetchone()[0] + conn.close() + + print(f" [OK] Cached {resource_count} resources") + + print("\n[PASS] TEST 1 PASSED: Page fetched only once, data persists") + return True + + async def test_offline_mode(self): + """Test that offline mode works with cached data""" + print("\n" + "="*60) + print("TEST 2: Offline Mode with Cached Data") + print("="*60) + + # Use the same URL from test 1 (should be cached) + test_url = "https://www.troostwijkauctions.com/l/bmw-x5-xdrive40d-high-executive-m-sport-a8-286pk-2019-A1-26955-7" + + # Enable offline mode + original_offline = config.OFFLINE + config.OFFLINE = True + self.scraper.offline = True + + print(f"\nTest URL: {test_url}") + print(" * Offline mode: ENABLED") + + try: + # Try to scrape in offline mode + print("\n--- OFFLINE SCRAPE (should use DB/cache only) ---") + start_time = time.time() + + async with asyncio.timeout(30): + page_data = await self._scrape_single_page(test_url) + + offline_time = time.time() - start_time + + if not page_data: + print(" [FAIL] Offline mode returned no data") + return False + + print(f" [OK] Offline scrape completed in {offline_time:.2f}s") + print(f" [OK] Got lot data: {page_data.get('title', 'N/A')[:60]}...") + + # Check closing time is available + closing_time = page_data.get('closing_time') + if not closing_time: + print(f" [FAIL] No closing time in offline mode") + return False + + print(f" [OK] Closing time preserved: {closing_time}") + + # Verify essential fields are present + essential_fields = ['lot_id', 'title', 'url', 'location'] + missing_fields = [f for f in essential_fields if not page_data.get(f)] + + if missing_fields: + print(f" [FAIL] Missing essential fields: {missing_fields}") + return False + + print(f" [OK] All essential fields present") + + # Check database has the lot + conn = sqlite3.connect(self.test_db) + cursor = conn.execute("SELECT closing_time FROM lots WHERE url = ?", (test_url,)) + row = cursor.fetchone() + conn.close() + + if not row: + print(f" [FAIL] Lot not found in database") + return False + + 
db_closing_time = row[0] + print(f" [OK] Database has closing time: {db_closing_time}") + + if db_closing_time != closing_time: + print(f" [FAIL] Closing time mismatch") + print(f" Scraped: {closing_time}") + print(f" Database: {db_closing_time}") + return False + + print("\n[PASS] TEST 2 PASSED: Offline mode works, closing time preserved") + return True + + finally: + # Restore offline mode + config.OFFLINE = original_offline + self.scraper.offline = original_offline + + async def _scrape_single_page(self, url): + """Helper to scrape a single page""" + from playwright.async_api import async_playwright + + if config.OFFLINE or self.scraper.offline: + # Offline mode - use crawl_page directly + return await self.scraper.crawl_page(page=None, url=url) + + # Online mode - need browser + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + page = await browser.new_page() + + try: + result = await self.scraper.crawl_page(page, url) + return result + finally: + await browser.close() + + async def run_all_tests(self): + """Run all tests""" + print("\n" + "="*70) + print("CACHE BEHAVIOR TEST SUITE") + print("="*70) + + self.setup() + + results = [] + + try: + # Test 1: Page fetched once + result1 = await self.test_page_fetched_once() + results.append(("Page Fetched Once", result1)) + + # Test 2: Offline mode + result2 = await self.test_offline_mode() + results.append(("Offline Mode", result2)) + + except Exception as e: + print(f"\n[ERROR] TEST SUITE ERROR: {e}") + import traceback + traceback.print_exc() + + finally: + self.teardown() + + # Print summary + print("\n" + "="*70) + print("TEST SUMMARY") + print("="*70) + + all_passed = True + for test_name, passed in results: + status = "[PASS]" if passed else "[FAIL]" + print(f" {status}: {test_name}") + if not passed: + all_passed = False + + print("="*70) + + if all_passed: + print("\n*** ALL TESTS PASSED! 
***") + return 0 + else: + print("\n*** SOME TESTS FAILED ***") + return 1 + + +async def main(): + """Run tests""" + tester = TestCacheBehavior() + exit_code = await tester.run_all_tests() + sys.exit(exit_code) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/test/test_description_simple.py b/test/test_description_simple.py new file mode 100644 index 0000000..f167a79 --- /dev/null +++ b/test/test_description_simple.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +import sys +import os +parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) +sys.path.insert(0, parent_dir) +sys.path.insert(0, os.path.join(parent_dir, 'src')) + +import asyncio +from scraper import TroostwijkScraper +import config +import os + +async def test(): + # Force online mode + os.environ['SCAEV_OFFLINE'] = '0' + config.OFFLINE = False + + scraper = TroostwijkScraper() + scraper.offline = False + + from playwright.async_api import async_playwright + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + context = await browser.new_context() + page = await context.new_page() + + url = "https://www.troostwijkauctions.com/l/used-dometic-seastar-tfxchx8641p-top-mount-engine-control-liver-A1-39684-12" + + # Add debug logging to parser + original_parse = scraper.parser.parse_page + def debug_parse(content, url): + result = original_parse(content, url) + if result: + print(f"PARSER OUTPUT:") + print(f" description: {result.get('description', 'NONE')[:100] if result.get('description') else 'EMPTY'}") + print(f" closing_time: {result.get('closing_time', 'NONE')}") + print(f" bid_count: {result.get('bid_count', 'NONE')}") + return result + scraper.parser.parse_page = debug_parse + + page_data = await scraper.crawl_page(page, url) + + await browser.close() + + print(f"\nFINAL page_data:") + print(f" description: {page_data.get('description', 'NONE')[:100] if page_data and page_data.get('description') else 'EMPTY'}") + print(f" closing_time: {page_data.get('closing_time', 'NONE') if page_data else 'NONE'}") + print(f" bid_count: {page_data.get('bid_count', 'NONE') if page_data else 'NONE'}") + print(f" status: {page_data.get('status', 'NONE') if page_data else 'NONE'}") + +asyncio.run(test()) diff --git a/test/test_graphql_403.py b/test/test_graphql_403.py new file mode 100644 index 0000000..55790c2 --- /dev/null +++ b/test/test_graphql_403.py @@ -0,0 +1,85 @@ +import asyncio +import types +import sys +from pathlib import Path +import pytest + + +@pytest.mark.asyncio +async def test_fetch_lot_bidding_data_403(monkeypatch): + """ + Simulate a 403 from the GraphQL endpoint and verify: + - Function returns None (graceful handling) + - It attempts a retry and logs a clear 403 message + """ + # Load modules directly from src using importlib to avoid path issues + project_root = Path(__file__).resolve().parents[1] + src_path = project_root / 'src' + import importlib.util + + def _load_module(name, file_path): + spec = importlib.util.spec_from_file_location(name, str(file_path)) + module = importlib.util.module_from_spec(spec) + sys.modules[name] = module + spec.loader.exec_module(module) # type: ignore + return module + + # Load config first because graphql_client imports it by module name + config = _load_module('config', src_path / 'config.py') + graphql_client = _load_module('graphql_client', src_path / 'graphql_client.py') + monkeypatch.setattr(config, "OFFLINE", False, raising=False) + + log_messages = [] + + def fake_print(*args, **kwargs): + msg = " 
".join(str(a) for a in args) + log_messages.append(msg) + + import builtins + monkeypatch.setattr(builtins, "print", fake_print) + + class MockResponse: + def __init__(self, status=403, text_body="Forbidden"): + self.status = status + self._text_body = text_body + + async def json(self): + return {} + + async def text(self): + return self._text_body + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return False + + class MockSession: + def __init__(self, *args, **kwargs): + pass + + def post(self, *args, **kwargs): + # Always return 403 + return MockResponse(403, "Forbidden by WAF") + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return False + + # Patch aiohttp.ClientSession to our mock + import types as _types + dummy_aiohttp = _types.SimpleNamespace() + dummy_aiohttp.ClientSession = MockSession + # Ensure that an `import aiohttp` inside the function resolves to our dummy + monkeypatch.setitem(sys.modules, 'aiohttp', dummy_aiohttp) + + result = await graphql_client.fetch_lot_bidding_data("A1-40179-35") + + # Should gracefully return None + assert result is None + + # Should have logged a 403 at least once + assert any("GraphQL API error: 403" in m for m in log_messages) diff --git a/test/test_missing_fields.py b/test/test_missing_fields.py new file mode 100644 index 0000000..14c417a --- /dev/null +++ b/test/test_missing_fields.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +""" +Test to validate that all expected fields are populated after scraping +""" +import sys +import os +import asyncio +import sqlite3 + +# Add parent and src directory to path +parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) +sys.path.insert(0, parent_dir) +sys.path.insert(0, os.path.join(parent_dir, 'src')) + +# Force online mode before importing +os.environ['SCAEV_OFFLINE'] = '0' + +from scraper import TroostwijkScraper +import config + + +async def test_lot_has_all_fields(): + """Test that a lot page has all expected fields populated""" + + print("\n" + "="*60) + print("TEST: Lot has all required fields") + print("="*60) + + # Use the example lot from user + test_url = "https://www.troostwijkauctions.com/l/radaway-idea-black-dwj-doucheopstelling-A1-39956-18" + + # Ensure we're not in offline mode + config.OFFLINE = False + + scraper = TroostwijkScraper() + scraper.offline = False + + print(f"\n[1] Scraping: {test_url}") + + # Start playwright and scrape + from playwright.async_api import async_playwright + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + context = await browser.new_context() + page = await context.new_page() + + page_data = await scraper.crawl_page(page, test_url) + + await browser.close() + + if not page_data: + print(" [FAIL] No data returned") + return False + + print(f"\n[2] Validating fields...") + + # Fields that MUST have values (critical for auction functionality) + required_fields = { + 'closing_time': 'Closing time', + 'current_bid': 'Current bid', + 'bid_count': 'Bid count', + 'status': 'Status', + } + + # Fields that SHOULD have values but may legitimately be empty + optional_fields = { + 'description': 'Description', + } + + missing_fields = [] + empty_fields = [] + optional_missing = [] + + # Check required fields + for field, label in required_fields.items(): + value = page_data.get(field) + + if value is None: + missing_fields.append(label) + print(f" [FAIL] {label}: MISSING (None)") + elif value == '' or value == 0 or 
value == 'No bids': + # Special case: 'No bids' is only acceptable if bid_count is 0 + if field == 'current_bid' and page_data.get('bid_count', 0) == 0: + print(f" [PASS] {label}: '{value}' (acceptable - no bids)") + else: + empty_fields.append(label) + print(f" [FAIL] {label}: EMPTY ('{value}')") + else: + print(f" [PASS] {label}: {value}") + + # Check optional fields (warn but don't fail) + for field, label in optional_fields.items(): + value = page_data.get(field) + if value is None or value == '': + optional_missing.append(label) + print(f" [WARN] {label}: EMPTY (may be legitimate)") + else: + print(f" [PASS] {label}: {value[:50]}...") + + # Check database + print(f"\n[3] Checking database entry...") + conn = sqlite3.connect(scraper.cache.db_path) + cursor = conn.cursor() + cursor.execute(""" + SELECT closing_time, current_bid, bid_count, description, status + FROM lots WHERE url = ? + """, (test_url,)) + row = cursor.fetchone() + conn.close() + + if row: + db_closing, db_bid, db_count, db_desc, db_status = row + print(f" DB closing_time: {db_closing or 'EMPTY'}") + print(f" DB current_bid: {db_bid or 'EMPTY'}") + print(f" DB bid_count: {db_count}") + print(f" DB description: {db_desc[:50] if db_desc else 'EMPTY'}...") + print(f" DB status: {db_status or 'EMPTY'}") + + # Verify DB matches page_data + if db_closing != page_data.get('closing_time'): + print(f" [WARN] DB closing_time doesn't match page_data") + if db_count != page_data.get('bid_count'): + print(f" [WARN] DB bid_count doesn't match page_data") + else: + print(f" [WARN] No database entry found") + + print(f"\n" + "="*60) + if missing_fields or empty_fields: + print(f"[FAIL] Missing fields: {', '.join(missing_fields)}") + print(f"[FAIL] Empty fields: {', '.join(empty_fields)}") + if optional_missing: + print(f"[WARN] Optional missing: {', '.join(optional_missing)}") + return False + else: + print("[PASS] All required fields are populated") + if optional_missing: + print(f"[WARN] Optional missing: {', '.join(optional_missing)}") + return True + + +async def test_lot_with_description(): + """Test that a lot with description preserves it""" + + print("\n" + "="*60) + print("TEST: Lot with description") + print("="*60) + + # Use a lot known to have description + test_url = "https://www.troostwijkauctions.com/l/used-dometic-seastar-tfxchx8641p-top-mount-engine-control-liver-A1-39684-12" + + config.OFFLINE = False + + scraper = TroostwijkScraper() + scraper.offline = False + + print(f"\n[1] Scraping: {test_url}") + + from playwright.async_api import async_playwright + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + context = await browser.new_context() + page = await context.new_page() + + page_data = await scraper.crawl_page(page, test_url) + + await browser.close() + + if not page_data: + print(" [FAIL] No data returned") + return False + + print(f"\n[2] Checking description...") + description = page_data.get('description', '') + + if not description or description == '': + print(f" [FAIL] Description is empty") + return False + else: + print(f" [PASS] Description: {description[:100]}...") + return True + + +async def main(): + """Run all tests""" + print("\n" + "="*60) + print("MISSING FIELDS TEST SUITE") + print("="*60) + + test1 = await test_lot_has_all_fields() + test2 = await test_lot_with_description() + + print("\n" + "="*60) + if test1 and test2: + print("ALL TESTS PASSED") + else: + print("SOME TESTS FAILED") + if not test1: + print(" - test_lot_has_all_fields FAILED") + if not 
test2: + print(" - test_lot_with_description FAILED") + print("="*60 + "\n") + + return 0 if (test1 and test2) else 1 + + +if __name__ == '__main__': + exit_code = asyncio.run(main()) + sys.exit(exit_code) diff --git a/test/test_scraper.py b/test/test_scraper.py new file mode 100644 index 0000000..a3dbeef --- /dev/null +++ b/test/test_scraper.py @@ -0,0 +1,335 @@ +#!/usr/bin/env python3 +""" +Test suite for Troostwijk Scraper +Tests both auction and lot parsing with cached data + +Requires Python 3.10+ +""" + +import sys + +# Require Python 3.10+ +if sys.version_info < (3, 10): + print("ERROR: This script requires Python 3.10 or higher") + print(f"Current version: {sys.version}") + sys.exit(1) + +import asyncio +import json +import sqlite3 +from datetime import datetime +from pathlib import Path + +# Add parent directory to path +sys.path.insert(0, str(Path(__file__).parent)) + +from main import TroostwijkScraper, CacheManager, CACHE_DB + +# Test URLs - these will use cached data to avoid overloading the server +TEST_AUCTIONS = [ + "https://www.troostwijkauctions.com/a/online-auction-cnc-lathes-machining-centres-precision-measurement-romania-A7-39813", + "https://www.troostwijkauctions.com/a/faillissement-bab-shortlease-i-ii-b-v-%E2%80%93-2024-big-ass-energieopslagsystemen-A1-39557", + "https://www.troostwijkauctions.com/a/industriele-goederen-uit-diverse-bedrijfsbeeindigingen-A1-38675", +] + +TEST_LOTS = [ + "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5", + "https://www.troostwijkauctions.com/l/tos-sui-50-1000-universele-draaibank-A7-39568-9", + "https://www.troostwijkauctions.com/l/rolcontainer-%25282x%2529-A1-40191-101", +] + +class TestResult: + def __init__(self, url, success, message, data=None): + self.url = url + self.success = success + self.message = message + self.data = data + +class ScraperTester: + def __init__(self): + self.scraper = TroostwijkScraper() + self.results = [] + + def check_cache_exists(self, url): + """Check if URL is cached""" + cached = self.scraper.cache.get(url, max_age_hours=999999) # Get even old cache + return cached is not None + + def test_auction_parsing(self, url): + """Test auction page parsing""" + print(f"\n{'='*70}") + print(f"Testing Auction: {url}") + print('='*70) + + # Check cache + if not self.check_cache_exists(url): + return TestResult( + url, + False, + "❌ NOT IN CACHE - Please run scraper first to cache this URL", + None + ) + + # Get cached content + cached = self.scraper.cache.get(url, max_age_hours=999999) + content = cached['content'] + + print(f"✓ Cache hit (age: {(datetime.now().timestamp() - cached['timestamp']) / 3600:.1f} hours)") + + # Parse + try: + data = self.scraper._parse_page(content, url) + + if not data: + return TestResult(url, False, "❌ Parsing returned None", None) + + if data.get('type') != 'auction': + return TestResult( + url, + False, + f"❌ Expected type='auction', got '{data.get('type')}'", + data + ) + + # Validate required fields + issues = [] + required_fields = { + 'auction_id': str, + 'title': str, + 'location': str, + 'lots_count': int, + 'first_lot_closing_time': str, + } + + for field, expected_type in required_fields.items(): + value = data.get(field) + if value is None or value == '': + issues.append(f" ❌ {field}: MISSING or EMPTY") + elif not isinstance(value, expected_type): + issues.append(f" ❌ {field}: Wrong type (expected {expected_type.__name__}, got {type(value).__name__})") + else: + # Pretty print value + display_value = str(value)[:60] + print(f" ✓ 
{field}: {display_value}") + + if issues: + return TestResult(url, False, "\n".join(issues), data) + + print(f" ✓ lots_count: {data.get('lots_count')}") + + return TestResult(url, True, "✅ All auction fields validated successfully", data) + + except Exception as e: + return TestResult(url, False, f"❌ Exception during parsing: {e}", None) + + def test_lot_parsing(self, url): + """Test lot page parsing""" + print(f"\n{'='*70}") + print(f"Testing Lot: {url}") + print('='*70) + + # Check cache + if not self.check_cache_exists(url): + return TestResult( + url, + False, + "❌ NOT IN CACHE - Please run scraper first to cache this URL", + None + ) + + # Get cached content + cached = self.scraper.cache.get(url, max_age_hours=999999) + content = cached['content'] + + print(f"✓ Cache hit (age: {(datetime.now().timestamp() - cached['timestamp']) / 3600:.1f} hours)") + + # Parse + try: + data = self.scraper._parse_page(content, url) + + if not data: + return TestResult(url, False, "❌ Parsing returned None", None) + + if data.get('type') != 'lot': + return TestResult( + url, + False, + f"❌ Expected type='lot', got '{data.get('type')}'", + data + ) + + # Validate required fields + issues = [] + required_fields = { + 'lot_id': (str, lambda x: x and len(x) > 0), + 'title': (str, lambda x: x and len(x) > 3 and x not in ['...', 'N/A']), + 'location': (str, lambda x: x and len(x) > 2 and x not in ['Locatie', 'Location']), + 'current_bid': (str, lambda x: x and x not in ['€Huidig ​​bod', 'Huidig bod']), + 'closing_time': (str, lambda x: True), # Can be empty + 'images': (list, lambda x: True), # Can be empty list + } + + for field, (expected_type, validator) in required_fields.items(): + value = data.get(field) + + if value is None: + issues.append(f" ❌ {field}: MISSING (None)") + elif not isinstance(value, expected_type): + issues.append(f" ❌ {field}: Wrong type (expected {expected_type.__name__}, got {type(value).__name__})") + elif not validator(value): + issues.append(f" ❌ {field}: Invalid value: '{value}'") + else: + # Pretty print value + if field == 'images': + print(f" ✓ {field}: {len(value)} images") + for i, img in enumerate(value[:3], 1): + print(f" {i}. 
{img[:60]}...") + else: + display_value = str(value)[:60] + print(f" ✓ {field}: {display_value}") + + # Additional checks + if data.get('bid_count') is not None: + print(f" ✓ bid_count: {data.get('bid_count')}") + + if data.get('viewing_time'): + print(f" ✓ viewing_time: {data.get('viewing_time')}") + + if data.get('pickup_date'): + print(f" ✓ pickup_date: {data.get('pickup_date')}") + + if issues: + return TestResult(url, False, "\n".join(issues), data) + + return TestResult(url, True, "✅ All lot fields validated successfully", data) + + except Exception as e: + import traceback + return TestResult(url, False, f"❌ Exception during parsing: {e}\n{traceback.format_exc()}", None) + + def run_all_tests(self): + """Run all tests""" + print("\n" + "="*70) + print("TROOSTWIJK SCRAPER TEST SUITE") + print("="*70) + print("\nThis test suite uses CACHED data only - no live requests to server") + print("="*70) + + # Test auctions + print("\n" + "="*70) + print("TESTING AUCTIONS") + print("="*70) + + for url in TEST_AUCTIONS: + result = self.test_auction_parsing(url) + self.results.append(result) + + # Test lots + print("\n" + "="*70) + print("TESTING LOTS") + print("="*70) + + for url in TEST_LOTS: + result = self.test_lot_parsing(url) + self.results.append(result) + + # Summary + self.print_summary() + + def print_summary(self): + """Print test summary""" + print("\n" + "="*70) + print("TEST SUMMARY") + print("="*70) + + passed = sum(1 for r in self.results if r.success) + failed = sum(1 for r in self.results if not r.success) + total = len(self.results) + + print(f"\nTotal tests: {total}") + print(f"Passed: {passed} ✓") + print(f"Failed: {failed} ✗") + print(f"Success rate: {passed/total*100:.1f}%") + + if failed > 0: + print("\n" + "="*70) + print("FAILED TESTS:") + print("="*70) + for result in self.results: + if not result.success: + print(f"\n{result.url}") + print(result.message) + if result.data: + print("\nParsed data:") + for key, value in result.data.items(): + if key != 'lots': # Don't print full lots array + print(f" {key}: {str(value)[:80]}") + + print("\n" + "="*70) + + return failed == 0 + +def check_cache_status(): + """Check cache compression status""" + print("\n" + "="*70) + print("CACHE STATUS CHECK") + print("="*70) + + try: + with sqlite3.connect(CACHE_DB) as conn: + # Total entries + cursor = conn.execute("SELECT COUNT(*) FROM cache") + total = cursor.fetchone()[0] + + # Compressed vs uncompressed + cursor = conn.execute("SELECT COUNT(*) FROM cache WHERE compressed = 1") + compressed = cursor.fetchone()[0] + + cursor = conn.execute("SELECT COUNT(*) FROM cache WHERE compressed = 0 OR compressed IS NULL") + uncompressed = cursor.fetchone()[0] + + print(f"Total cache entries: {total}") + print(f"Compressed: {compressed} ({compressed/total*100:.1f}%)") + print(f"Uncompressed: {uncompressed} ({uncompressed/total*100:.1f}%)") + + if uncompressed > 0: + print(f"\n⚠️ Warning: {uncompressed} entries are still uncompressed") + print(" Run: python migrate_compress_cache.py") + else: + print("\n✓ All cache entries are compressed!") + + # Check test URLs + print(f"\n{'='*70}") + print("TEST URL CACHE STATUS:") + print('='*70) + + all_test_urls = TEST_AUCTIONS + TEST_LOTS + cached_count = 0 + + for url in all_test_urls: + cursor = conn.execute("SELECT url FROM cache WHERE url = ?", (url,)) + if cursor.fetchone(): + print(f"✓ {url[:60]}...") + cached_count += 1 + else: + print(f"✗ {url[:60]}... 
(NOT CACHED)") + + print(f"\n{cached_count}/{len(all_test_urls)} test URLs are cached") + + if cached_count < len(all_test_urls): + print("\n⚠️ Some test URLs are not cached. Tests for those URLs will fail.") + print(" Run the main scraper to cache these URLs first.") + + except Exception as e: + print(f"Error checking cache status: {e}") + +if __name__ == "__main__": + # Check cache status first + check_cache_status() + + # Run tests + tester = ScraperTester() + success = tester.run_all_tests() + + # Exit with appropriate code + sys.exit(0 if success else 1)
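
Note on the priority helper: the crawl phases and `_prioritize_lots` above call `calculate_priority(closing_time, scraped_at, current_time)` and persist the returned `(priority, next_scrape_at)` pair, but the helper itself is defined elsewhere in the project and is not shown in this excerpt. The following is only a minimal sketch of a compatible implementation, assuming ISO-8601 closing-time strings, epoch-second timestamps, and illustrative band values and rescrape TTL derived from the tier descriptions printed in `_prioritize_lots`; the actual module may differ.

```python
# Illustrative sketch only -- not the project's actual calculate_priority.
# Assumptions: closing_time is an ISO-8601 string (or None), scraped_at and
# current_time are epoch seconds, and the caller uses the returned tuple as
# (scrape_priority, next_scrape_at).
from datetime import datetime
from typing import Optional, Tuple


def calculate_priority(closing_time: Optional[str],
                       scraped_at: Optional[int],
                       current_time: int) -> Tuple[int, int]:
    """Return (priority, next_scrape_at); higher priority is scraped first."""
    # Never-scraped lots always sort to the front.
    if scraped_at is None:
        return 15000, current_time

    # Parse the closing time; unparseable or missing values fall through to TTL logic.
    closes_at = None
    if closing_time:
        try:
            closes_at = int(datetime.fromisoformat(closing_time).timestamp())
        except ValueError:
            closes_at = None

    if closes_at is not None:
        remaining = closes_at - current_time
        if remaining <= 0:
            return -1, current_time + 7 * 24 * 3600   # already closed
        if remaining < 3600:
            return 9000, current_time + 5 * 60        # <1hr: recheck every few minutes
        if remaining < 6 * 3600:
            return 8000, current_time + 30 * 60
        if remaining < 24 * 3600:
            return 7000, current_time + 2 * 3600
        if remaining < 7 * 24 * 3600:
            return 5000, current_time + 12 * 3600

    # No closing-time urgency: rescrape once the (assumed) 24h TTL has expired.
    ttl = 24 * 3600
    if current_time - scraped_at >= ttl:
        return 1000, current_time
    return 100, scraped_at + ttl
```

With a shape like this, `_prioritize_lots` can sort lots on the first tuple element and store the second as `next_scrape_at` for the following run, which is exactly how the values are consumed in the code above.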