# scaev/src/test.py

#!/usr/bin/env python3
"""
Test module for debugging extraction patterns
"""

import sys
import time
import re
import json
from datetime import datetime
from pathlib import Path
from typing import Optional

import config
from cache import CacheManager
from scraper import TroostwijkScraper


def _safe_print(text: str) -> None:
    """Print *text*, degrading to ASCII when the console cannot encode it.

    Some consoles (e.g. the Windows console with a legacy code page) raise
    UnicodeEncodeError for characters such as the euro sign; in that case the
    line is re-printed with unencodable characters replaced by '?'.
    """
    try:
        print(text)
    except UnicodeEncodeError:
        print(text.encode('ascii', 'replace').decode('ascii'))


def _one_line(text: str) -> str:
    """Collapse every run of whitespace in *text* into a single space."""
    return ' '.join(text.split())


def _print_banner(title: str) -> None:
    """Print a blank line, a 60-character rule, *title*, and a closing rule."""
    rule = '=' * 60
    print(f"\n{rule}")
    print(title)
    print(rule)


def _looks_unextracted(value, placeholders) -> bool:
    """Return True when *value* is missing, blank, or a known placeholder label.

    ``None`` (key absent or explicitly null) counts as not extracted, as does
    a string that is empty after stripping or equal to one of *placeholders*.
    Non-string values are accepted as extracted.
    """
    if value is None:
        return True
    if isinstance(value, str):
        stripped = value.strip()
        return not stripped or stripped in placeholders
    return False


def _print_extracted_fields(page_data: dict) -> None:
    """Print the page type followed by one line per extracted field."""
    _safe_print(f"Page Type: {page_data.get('type', 'UNKNOWN')}")
    print()

    for key, value in page_data.items():
        if key == 'images':
            # Tolerate a null image list instead of crashing on len(None).
            images = value or []
            print(f"{key:.<20}: {len(images)} images")
            for img in images[:3]:
                _safe_print(f"{'':.<20}  - {img}")
        elif key == 'lots':
            print(f"{key:.<20}: {len(value or [])} lots in auction")
        else:
            # Only None and empty strings/containers are "empty"; a numeric 0
            # or False is a real extracted value and is shown as such.
            is_empty = value is None or (
                not value and not isinstance(value, (int, float))
            )
            display_value = "(empty)" if is_empty else str(value)[:100]
            _safe_print(f"{key:.<20}: {display_value}")


def _validate_page_data(page_data: dict) -> list:
    """Check key fields, print an [OK] line per valid one, return the issues.

    ``current_bid`` and ``location`` are only checked when the page type is
    ``'lot'``; ``title`` is checked for every page type.
    """
    issues = []

    if page_data.get('type') == 'lot':
        current_bid = page_data.get('current_bid')
        if _looks_unextracted(current_bid, ('Huidig bod', 'Current bid', '€0')):
            issues.append("[!] Current bid not extracted correctly")
        else:
            _safe_print(f"[OK] Current bid looks valid: {current_bid}")

        location = page_data.get('location')
        if _looks_unextracted(location, ('Locatie', 'Location')):
            issues.append("[!] Location not extracted correctly")
        else:
            _safe_print(f"[OK] Location looks valid: {location}")

    title = page_data.get('title')
    if _looks_unextracted(title, ('...',)):
        issues.append("[!] Title not extracted correctly")
    else:
        _safe_print(f"[OK] Title looks valid: {str(title)[:50]}")

    return issues


def _print_debug_snippets(content: str) -> None:
    """Print raw snippets of *content* that help debug bid/location patterns."""
    # The patterns deliberately contain no capturing group: with one,
    # re.findall returns only the group and the surrounding context is lost.
    print("\n1. Bid patterns in content:")
    bid_matches = re.findall(r'.{0,50}€[\d,.\s]+.{0,50}', content[:10000])
    for i, match in enumerate(bid_matches[:5], 1):
        _safe_print(f"   {i}. {_one_line(match)}")

    print("\n2. Location patterns in content:")
    loc_matches = re.findall(
        r'.{0,30}(?:Locatie|Location).{0,100}', content, re.IGNORECASE
    )
    for i, match in enumerate(loc_matches[:5], 1):
        _safe_print(f"   {i}. ...{_one_line(match)}...")

    print("\n3. JSON/Script data containing auction info:")
    json_patterns = [
        r'"currentBid"[^,}]+',
        r'"location"[^,}]+',
        r'"price"[^,}]+',
        r'"addressLocality"[^,}]+',
    ]
    for pattern in json_patterns:
        matches = re.findall(pattern, content[:50000], re.IGNORECASE)
        if matches:
            _safe_print(f"   {pattern}: {matches[:3]}")

    script_matches = re.findall(
        r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
        content,
        re.DOTALL,
    )
    if script_matches:
        print("\n4. Structured data (JSON-LD) found:")
        for i, script in enumerate(script_matches[:2], 1):
            try:
                data = json.loads(script)
            except json.JSONDecodeError:
                # Not valid JSON: fall back to showing the raw script text.
                _safe_print(f"   Script {i}: {script[:300]}...")
            else:
                _safe_print(f"   Script {i}: {json.dumps(data, indent=6)[:500]}...")


def test_extraction(
        test_url: str = "https://www.troostwijkauctions.com/a/machines-en-toebehoren-%28hout-en-kunststofverwerking-handlingapparatuur-bouwmachines-landbouwindustrie%29-oost-europa-december-A7-35847"
) -> None:
    """Run the scraper's page parser on a cached URL and print a debug report.

    The page is looked up with ``scraper.cache.get(test_url)``; nothing is
    fetched here. When the URL is not cached, up to ten recently cached URLs
    are listed instead and the function returns. Otherwise the report shows
    the extracted fields, validation checks on bid/location/title, and raw
    snippets of the cached content for pattern debugging.

    Args:
        test_url: URL whose cached content should be parsed.

    Returns:
        None. All results are written to stdout.
    """
    scraper = TroostwijkScraper()

    cached = scraper.cache.get(test_url)
    if not cached:
        print(f"ERROR: URL not found in cache: {test_url}")
        print("\nAvailable cached URLs:")
        # Best-effort listing: a failure here must not hide the error above.
        try:
            for url in scraper.cache.get_recent_cached_urls(limit=10):
                print(f"  - {url}")
        except Exception as e:
            print(f"  (failed to list recent cached URLs: {e})")
        return

    content = cached['content']
    _print_banner(f"TESTING EXTRACTION FROM: {test_url}")
    print(f"Content length: {len(content)} chars")
    # NOTE(review): assumes cached['timestamp'] is in time.time() seconds.
    print(f"Cache age: {(time.time() - cached['timestamp']) / 3600:.1f} hours")

    page_data = scraper._parse_page(content, test_url)

    _print_banner("EXTRACTED DATA:")
    if not page_data:
        print("ERROR: No data extracted!")
        return
    _print_extracted_fields(page_data)

    _print_banner("VALIDATION CHECKS:")
    issues = _validate_page_data(page_data)
    if issues:
        print("\n[ISSUES FOUND]")
        for issue in issues:
            print(f"  {issue}")
    else:
        print("\n[ALL FIELDS EXTRACTED SUCCESSFULLY!]")

    _print_banner("DEBUG: RAW HTML SNIPPETS")
    _print_debug_snippets(content)