#!/usr/bin/env python3
"""
Test module for debugging extraction patterns
"""

import sys
import time
import re
import json
from datetime import datetime
from pathlib import Path
from typing import Optional

import config
from cache import CacheManager
from scraper import TroostwijkScraper


def _print_banner(title: str) -> None:
    """Print *title* framed above and below by a 60-character '=' rule."""
    rule = '=' * 60
    print(f"\n{rule}")
    print(title)
    print(rule)


def _collect_lot_issues(page_data) -> list:
    """Check the key fields of a lot page and return a list of issue messages.

    A field counts as "not extracted" when it is missing/``None`` or still
    equals one of the placeholder strings listed below. Fields that pass are
    reported on stdout with an ``[OK]`` line.
    """
    # (field key, human-readable label, values that mean "nothing extracted")
    checks = (
        ('current_bid', 'Current bid', ('Huidig bod', 'Current bid', '€0', '')),
        ('location', 'Location', ('Locatie', 'Location', '')),
        ('title', 'Title', ('', '...')),
    )
    issues = []
    for field, label, placeholders in checks:
        value = page_data.get(field)
        # None must be flagged explicitly: `None in placeholders` is False,
        # which would otherwise report a missing field as valid.
        if value is None or value in placeholders:
            issues.append(f"[!] {label} not extracted correctly")
            continue
        # Only the title is truncated; str() guards against non-string values.
        shown = str(value)[:50] if field == 'title' else value
        print(f"[OK] {label} looks valid:", shown)
    return issues


def _print_debug_snippets(content: str) -> None:
    """Print raw snippets of *content* that help debug bid/location patterns."""
    # 1. Bid-related content. findall returns only the captured amount; the
    #    surrounding .{0,50} merely anchors the match to a single line.
    print("\n1. Bid patterns in content:")
    bid_matches = re.findall(r'.{0,50}(€[\d,.\s]+).{0,50}', content[:10000])
    for i, match in enumerate(bid_matches[:5], 1):
        print(f"  {i}. {match}")

    # 2. Location content. The group is non-capturing so findall yields the
    #    whole match (keyword plus context) instead of just the keyword.
    print("\n2. Location patterns in content:")
    loc_matches = re.findall(
        r'.{0,30}(?:Locatie|Location).{0,100}', content, re.IGNORECASE
    )
    for i, match in enumerate(loc_matches[:5], 1):
        print(f"  {i}. ...{match}...")

    # 3. JSON fragments embedded in the page
    print("\n3. JSON/Script data containing auction info:")
    json_patterns = [
        r'"currentBid"[^,}]+',
        r'"location"[^,}]+',
        r'"price"[^,}]+',
        r'"addressLocality"[^,}]+',
    ]
    for pattern in json_patterns:
        matches = re.findall(pattern, content[:50000], re.IGNORECASE)
        if matches:
            print(f"  {pattern}: {matches[:3]}")

    # 4. Script tags with structured data. The pattern needs both the opening
    #    <script ...> and the closing </script>; without the terminator the
    #    lazy (.*?) would always capture an empty string.
    script_matches = re.findall(
        r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
        content,
        re.DOTALL | re.IGNORECASE,
    )
    if script_matches:
        print("\n4. Structured data (JSON-LD) found:")
        for i, script in enumerate(script_matches[:2], 1):
            try:
                data = json.loads(script)
            except ValueError:
                # json.JSONDecodeError is a ValueError: show the raw text so
                # malformed JSON-LD can still be inspected.
                print(f"  Script {i}: {script[:300]}...")
            else:
                print(f"  Script {i}: {json.dumps(data, indent=6)[:500]}...")


def test_extraction(
        test_url: str = "https://www.troostwijkauctions.com/a/machines-en-toebehoren-%28hout-en-kunststofverwerking-handlingapparatuur-bouwmachines-landbouwindustrie%29-oost-europa-december-A7-35847"):
    """Test extraction on a specific cached URL to debug patterns.

    Looks *test_url* up in the scraper's cache, runs the scraper's page parser
    on the cached content and prints the extracted fields, a few validation
    checks and raw snippets of the content for pattern debugging.

    Args:
        test_url: URL whose cached page should be parsed.

    Returns:
        None. All results are printed to stdout. The function returns early
        when the URL is not cached or when the parser yields no data.
    """
    scraper = TroostwijkScraper()

    # Try to get from cache
    cached = scraper.cache.get(test_url)
    if not cached:
        print(f"ERROR: URL not found in cache: {test_url}")
        print("\nAvailable cached URLs:")
        try:
            for url in scraper.cache.get_recent_cached_urls(limit=10):
                print(f"  - {url}")
        except Exception as e:
            # Listing is a best-effort hint only; never let it mask the
            # original "not in cache" error.
            print(f"  (failed to list recent cached URLs: {e})")
        return

    content = cached['content']
    _print_banner(f"TESTING EXTRACTION FROM: {test_url}")
    print(f"Content length: {len(content)} chars")
    print(f"Cache age: {(time.time() - cached['timestamp']) / 3600:.1f} hours")

    # Test each extraction method
    page_data = scraper._parse_page(content, test_url)

    _print_banner("EXTRACTED DATA:")
    if not page_data:
        print("ERROR: No data extracted!")
        return

    print(f"Page Type: {page_data.get('type', 'UNKNOWN')}")
    print()

    for key, value in page_data.items():
        if key == 'images':
            images = value or []  # tolerate a None/empty images field
            print(f"{key:.<20}: {len(images)} images")
            for img in images[:3]:
                print(f"{'':.<20}  - {img}")
        elif key == 'lots':
            print(f"{key:.<20}: {len(value or [])} lots in auction")
        else:
            display_value = str(value)[:100] if value else "(empty)"
            # Handle Unicode characters that Windows console can't display
            try:
                print(f"{key:.<20}: {display_value}")
            except UnicodeEncodeError:
                safe_value = display_value.encode('ascii', 'replace').decode('ascii')
                print(f"{key:.<20}: {safe_value}")

    # Validation checks (only lot pages have fields worth validating)
    _print_banner("VALIDATION CHECKS:")
    issues = []
    if page_data.get('type') == 'lot':
        issues = _collect_lot_issues(page_data)

    if issues:
        print("\n[ISSUES FOUND]")
        for issue in issues:
            print(f"  {issue}")
    else:
        print("\n[ALL FIELDS EXTRACTED SUCCESSFULLY!]")

    # Debug: Show raw HTML snippets for problematic fields
    _print_banner("DEBUG: RAW HTML SNIPPETS")
    _print_debug_snippets(content)