This commit is contained in:
Tour
2025-12-09 08:04:16 +01:00
commit e69563d4d6
37 changed files with 7262 additions and 0 deletions

142
src/test.py Normal file
View File

@@ -0,0 +1,142 @@
#!/usr/bin/env python3
"""
Test module for debugging extraction patterns
"""
import sys
import sqlite3
import time
import re
import json
from datetime import datetime
from pathlib import Path
from typing import Optional
import config
from cache import CacheManager
from scraper import TroostwijkScraper
def test_extraction(
test_url: str = "https://www.troostwijkauctions.com/a/machines-en-toebehoren-%28hout-en-kunststofverwerking-handlingapparatuur-bouwmachines-landbouwindustrie%29-oost-europa-december-A7-35847"):
"""Test extraction on a specific cached URL to debug patterns"""
scraper = TroostwijkScraper()
# Try to get from cache
cached = scraper.cache.get(test_url)
if not cached:
print(f"ERROR: URL not found in cache: {test_url}")
print(f"\nAvailable cached URLs:")
with sqlite3.connect(config.CACHE_DB) as conn:
cursor = conn.execute("SELECT url FROM cache ORDER BY timestamp DESC LIMIT 10")
for row in cursor.fetchall():
print(f" - {row[0]}")
return
content = cached['content']
print(f"\n{'=' * 60}")
print(f"TESTING EXTRACTION FROM: {test_url}")
print(f"{'=' * 60}")
print(f"Content length: {len(content)} chars")
print(f"Cache age: {(time.time() - cached['timestamp']) / 3600:.1f} hours")
# Test each extraction method
page_data = scraper._parse_page(content, test_url)
print(f"\n{'=' * 60}")
print("EXTRACTED DATA:")
print(f"{'=' * 60}")
if not page_data:
print("ERROR: No data extracted!")
return
print(f"Page Type: {page_data.get('type', 'UNKNOWN')}")
print()
for key, value in page_data.items():
if key == 'images':
print(f"{key:.<20}: {len(value)} images")
for img in value[:3]:
print(f"{'':.<20} - {img}")
elif key == 'lots':
print(f"{key:.<20}: {len(value)} lots in auction")
else:
display_value = str(value)[:100] if value else "(empty)"
# Handle Unicode characters that Windows console can't display
try:
print(f"{key:.<20}: {display_value}")
except UnicodeEncodeError:
safe_value = display_value.encode('ascii', 'replace').decode('ascii')
print(f"{key:.<20}: {safe_value}")
# Validation checks
print(f"\n{'=' * 60}")
print("VALIDATION CHECKS:")
print(f"{'=' * 60}")
issues = []
if page_data.get('type') == 'lot':
if page_data.get('current_bid') in ['Huidig bod', 'Current bid', '€0', '']:
issues.append("[!] Current bid not extracted correctly")
else:
print("[OK] Current bid looks valid:", page_data.get('current_bid'))
if page_data.get('location') in ['Locatie', 'Location', '']:
issues.append("[!] Location not extracted correctly")
else:
print("[OK] Location looks valid:", page_data.get('location'))
if page_data.get('title') in ['', '...']:
issues.append("[!] Title not extracted correctly")
else:
print("[OK] Title looks valid:", page_data.get('title', '')[:50])
if issues:
print(f"\n[ISSUES FOUND]")
for issue in issues:
print(f" {issue}")
else:
print(f"\n[ALL FIELDS EXTRACTED SUCCESSFULLY!]")
# Debug: Show raw HTML snippets for problematic fields
print(f"\n{'=' * 60}")
print("DEBUG: RAW HTML SNIPPETS")
print(f"{'=' * 60}")
# Look for bid-related content
print(f"\n1. Bid patterns in content:")
bid_matches = re.findall(r'.{0,50}(€[\d,.\s]+).{0,50}', content[:10000])
for i, match in enumerate(bid_matches[:5], 1):
print(f" {i}. {match}")
# Look for location content
print(f"\n2. Location patterns in content:")
loc_matches = re.findall(r'.{0,30}(Locatie|Location).{0,100}', content, re.IGNORECASE)
for i, match in enumerate(loc_matches[:5], 1):
print(f" {i}. ...{match}...")
# Look for JSON data
print(f"\n3. JSON/Script data containing auction info:")
json_patterns = [
r'"currentBid"[^,}]+',
r'"location"[^,}]+',
r'"price"[^,}]+',
r'"addressLocality"[^,}]+'
]
for pattern in json_patterns:
matches = re.findall(pattern, content[:50000], re.IGNORECASE)
if matches:
print(f" {pattern}: {matches[:3]}")
# Look for script tags with structured data
script_matches = re.findall(r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', content, re.DOTALL)
if script_matches:
print(f"\n4. Structured data (JSON-LD) found:")
for i, script in enumerate(script_matches[:2], 1):
try:
data = json.loads(script)
print(f" Script {i}: {json.dumps(data, indent=6)[:500]}...")
except:
print(f" Script {i}: {script[:300]}...")