Initial

2025-12-09 08:04:16 +01:00
commit e69563d4d6
37 changed files with 7262 additions and 0 deletions
--- a/src/test.py
+++ b/src/test.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+"""
+Test module for debugging extraction patterns
+"""
+
+import sys
+import sqlite3
+import time
+import re
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+import config
+from cache import CacheManager
+from scraper import TroostwijkScraper
+
+
+def _safe_print(text: str) -> None:
+    """Print *text*, replacing characters the console cannot encode."""
+    try:
+        print(text)
+    except UnicodeEncodeError:
+        # Windows consoles often use a legacy code page; degrade to ASCII.
+        print(text.encode('ascii', 'replace').decode('ascii'))
+
+
+def test_extraction(
+        test_url: str = "https://www.troostwijkauctions.com/a/machines-en-toebehoren-%28hout-en-kunststofverwerking-handlingapparatuur-bouwmachines-landbouwindustrie%29-oost-europa-december-A7-35847"):
+    """Run the scraper's parser on one cached page and print a debug report.
+
+    The report lists the extracted fields, flags fields that are missing or
+    still hold placeholder text, and prints raw snippets (bid, location,
+    JSON, JSON-LD) from the cached content to help tune extraction patterns.
+
+    Args:
+        test_url: URL whose cached content is parsed. If it is not cached,
+            the ten most recently cached URLs are listed instead.
+    """
+    rule = '=' * 60
+    scraper = TroostwijkScraper()
+
+    cached = scraper.cache.get(test_url)
+    if not cached:
+        print(f"ERROR: URL not found in cache: {test_url}")
+        print("\nAvailable cached URLs:")
+        # sqlite3's context manager only commits/rolls back; close explicitly.
+        conn = sqlite3.connect(config.CACHE_DB)
+        try:
+            cursor = conn.execute("SELECT url FROM cache ORDER BY timestamp DESC LIMIT 10")
+            for row in cursor:
+                _safe_print(f"  - {row[0]}")
+        finally:
+            conn.close()
+        return
+
+    content = cached['content']
+    print(f"\n{rule}\nTESTING EXTRACTION FROM: {test_url}\n{rule}")
+    print(f"Content length: {len(content)} chars")
+    print(f"Cache age: {(time.time() - cached['timestamp']) / 3600:.1f} hours")
+
+    page_data = scraper._parse_page(content, test_url)
+
+    print(f"\n{rule}\nEXTRACTED DATA:\n{rule}")
+    if not page_data:
+        print("ERROR: No data extracted!")
+        return
+
+    print(f"Page Type: {page_data.get('type', 'UNKNOWN')}\n")
+    for key, value in page_data.items():
+        if key == 'images':
+            images = value or []
+            print(f"{key:.<20}: {len(images)} images")
+            for img in images[:3]:
+                _safe_print(f"{'':.<20}  - {img}")
+        elif key == 'lots':
+            print(f"{key:.<20}: {len(value or [])} lots in auction")
+        else:
+            display_value = str(value)[:100] if value else "(empty)"
+            _safe_print(f"{key:.<20}: {display_value}")
+
+    print(f"\n{rule}\nVALIDATION CHECKS:\n{rule}")
+    # (field, label, placeholder texts that mean "extraction failed")
+    checks = []
+    if page_data.get('type') == 'lot':
+        checks.append(('current_bid', 'Current bid', ('Huidig bod', 'Current bid', '€0')))
+        checks.append(('location', 'Location', ('Locatie', 'Location')))
+    checks.append(('title', 'Title', ('...',)))
+
+    issues = []
+    for field, label, placeholders in checks:
+        value = page_data.get(field)
+        # A missing/None/empty field is a failure too, not only a placeholder.
+        if not value or value in placeholders:
+            issues.append(f"[!] {label} not extracted correctly")
+        else:
+            shown = str(value)[:50] if field == 'title' else value
+            _safe_print(f"[OK] {label} looks valid: {shown}")
+
+    if issues:
+        print("\n[ISSUES FOUND]")
+        for issue in issues:
+            print(f"  {issue}")
+    else:
+        print("\n[ALL FIELDS EXTRACTED SUCCESSFULLY!]")
+
+    print(f"\n{rule}\nDEBUG: RAW HTML SNIPPETS\n{rule}")
+    # group(0) keeps the context around each hit; findall returned only the group.
+    print("\n1. Bid patterns in content:")
+    bid_matches = [m.group(0) for m in re.finditer(r'.{0,50}(€[\d,.\s]+).{0,50}', content[:10000])]
+    for i, match in enumerate(bid_matches[:5], 1):
+        _safe_print(f"   {i}. {match}")
+
+    print("\n2. Location patterns in content:")
+    loc_pattern = r'.{0,30}(Locatie|Location).{0,100}'
+    loc_matches = [m.group(0) for m in re.finditer(loc_pattern, content, re.IGNORECASE)]
+    for i, match in enumerate(loc_matches[:5], 1):
+        _safe_print(f"   {i}. ...{match}...")
+
+    print("\n3. JSON/Script data containing auction info:")
+    json_patterns = [
+        r'"currentBid"[^,}]+',
+        r'"location"[^,}]+',
+        r'"price"[^,}]+',
+        r'"addressLocality"[^,}]+',
+    ]
+    for pattern in json_patterns:
+        matches = re.findall(pattern, content[:50000], re.IGNORECASE)
+        if matches:
+            _safe_print(f"   {pattern}: {matches[:3]}")
+
+    script_matches = re.findall(r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', content, re.DOTALL)
+    if script_matches:
+        print("\n4. Structured data (JSON-LD) found:")
+        for i, script in enumerate(script_matches[:2], 1):
+            try:
+                data = json.loads(script)
+            except ValueError:  # json.JSONDecodeError: not valid JSON, show raw text
+                _safe_print(f"   Script {i}: {script[:300]}...")
+            else:
+                _safe_print(f"   Script {i}: {json.dumps(data, indent=6)[:500]}...")