Initial
This commit is contained in:
142
src/test.py
Normal file
142
src/test.py
Normal file
@@ -0,0 +1,142 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test module for debugging extraction patterns
|
||||
"""
|
||||
|
||||
import sys
|
||||
import sqlite3
|
||||
import time
|
||||
import re
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import config
|
||||
from cache import CacheManager
|
||||
from scraper import TroostwijkScraper
|
||||
|
||||
|
||||
def test_extraction(
        test_url: str = "https://www.troostwijkauctions.com/a/machines-en-toebehoren-%28hout-en-kunststofverwerking-handlingapparatuur-bouwmachines-landbouwindustrie%29-oost-europa-december-A7-35847") -> None:
    """Parse one cached page with the scraper and print a debugging report.

    The page is looked up in ``scraper.cache``; nothing is fetched from the
    network here. The report has four parts: cache metadata, the fields
    returned by ``scraper._parse_page``, sanity checks on lot pages, and raw
    snippets of the page content that help when tuning extraction patterns.

    Args:
        test_url: URL of the page to inspect. If it is not in the cache, the
            ten most recently cached URLs are listed instead and the function
            returns.

    Returns:
        None. All results are written to stdout.
    """
    # Function-scope import so this block does not depend on file-level edits.
    from contextlib import closing

    scraper = TroostwijkScraper()

    # Try to get from cache
    cached = scraper.cache.get(test_url)
    if not cached:
        print(f"ERROR: URL not found in cache: {test_url}")
        print("\nAvailable cached URLs:")
        # sqlite3's own context manager only commits/rolls back the
        # transaction; closing() is what actually releases the connection.
        with closing(sqlite3.connect(config.CACHE_DB)) as conn:
            cursor = conn.execute("SELECT url FROM cache ORDER BY timestamp DESC LIMIT 10")
            for row in cursor:
                _safe_print(f" - {row[0]}")
        return

    content = cached['content']
    _print_banner(f"TESTING EXTRACTION FROM: {test_url}")
    print(f"Content length: {len(content)} chars")
    # NOTE(review): assumes cached['timestamp'] is epoch seconds -- confirm
    # against the cache implementation.
    print(f"Cache age: {(time.time() - cached['timestamp']) / 3600:.1f} hours")

    # Test each extraction method
    page_data = scraper._parse_page(content, test_url)

    _print_banner("EXTRACTED DATA:")
    if not page_data:
        print("ERROR: No data extracted!")
        return

    print(f"Page Type: {page_data.get('type', 'UNKNOWN')}")
    print()
    _print_extracted_fields(page_data)

    # Validation checks (only lot pages have fields to validate)
    _print_banner("VALIDATION CHECKS:")
    issues = _validate_lot_fields(page_data) if page_data.get('type') == 'lot' else []
    if issues:
        print("\n[ISSUES FOUND]")
        for issue in issues:
            print(f" {issue}")
    else:
        print("\n[ALL FIELDS EXTRACTED SUCCESSFULLY!]")

    # Debug: Show raw HTML snippets for problematic fields
    _print_banner("DEBUG: RAW HTML SNIPPETS")
    _print_debug_snippets(content)


def _print_banner(title):
    """Print *title* framed by two 60-character rules, after a blank line."""
    rule = '=' * 60
    print(f"\n{rule}")
    print(title)
    print(rule)


def _safe_print(text):
    """Print *text*, degrading to ASCII when the console cannot encode it."""
    try:
        print(text)
    except UnicodeEncodeError:
        # Some consoles (e.g. Windows code pages) cannot display every
        # character; replace what cannot be encoded instead of aborting.
        print(text.encode('ascii', 'replace').decode('ascii'))


def _print_extracted_fields(page_data):
    """Print every field of *page_data*, summarising the images and lots lists."""
    for key, value in page_data.items():
        if key == 'images':
            images = value or []  # a None value must not crash len()
            print(f"{key:.<20}: {len(images)} images")
            for img in images[:3]:
                _safe_print(f"{'':.<20} - {img}")
        elif key == 'lots':
            print(f"{key:.<20}: {len(value or [])} lots in auction")
        else:
            display_value = str(value)[:100] if value else "(empty)"
            _safe_print(f"{key:.<20}: {display_value}")


def _is_placeholder(value, placeholders):
    """Return True when *value* is missing or is one of the label *placeholders*."""
    if isinstance(value, str):
        value = value.strip()
    # None means the key was absent (or explicitly null): that is a failed
    # extraction, not a valid value.
    return value is None or value in placeholders


def _validate_lot_fields(page_data):
    """Check the key lot fields; print the valid ones and return issue messages."""
    checks = (
        ('current_bid', "Current bid", ('Huidig bod', 'Current bid', '€0', '')),
        ('location', "Location", ('Locatie', 'Location', '')),
        ('title', "Title", ('', '...')),
    )
    issues = []
    for key, label, placeholders in checks:
        value = page_data.get(key)
        if _is_placeholder(value, placeholders):
            issues.append(f"[!] {label} not extracted correctly")
        else:
            # Titles can be long; str() also guards against non-string values.
            shown = str(value)[:50] if key == 'title' else value
            _safe_print(f"[OK] {label} looks valid: {shown}")
    return issues


def _print_debug_snippets(content):
    """Print raw snippets of *content* around bids, locations and JSON data."""
    from itertools import islice

    # The patterns below deliberately have no capturing group, and group(0)
    # is printed, so the surrounding context is shown and not just the keyword
    # or amount. finditer + islice stops scanning after the first five hits.

    # Look for bid-related content
    print("\n1. Bid patterns in content:")
    bid_hits = re.finditer(r'.{0,50}€[\d,.\s]+.{0,50}', content[:10000])
    for i, hit in enumerate(islice(bid_hits, 5), 1):
        snippet = " ".join(hit.group(0).split())  # collapse newlines/indentation
        _safe_print(f" {i}. {snippet}")

    # Look for location content
    print("\n2. Location patterns in content:")
    loc_hits = re.finditer(r'.{0,30}(?:Locatie|Location).{0,100}', content, re.IGNORECASE)
    for i, hit in enumerate(islice(loc_hits, 5), 1):
        snippet = " ".join(hit.group(0).split())
        _safe_print(f" {i}. ...{snippet}...")

    # Look for JSON data
    print("\n3. JSON/Script data containing auction info:")
    json_patterns = [
        r'"currentBid"[^,}]+',
        r'"location"[^,}]+',
        r'"price"[^,}]+',
        r'"addressLocality"[^,}]+',
    ]
    for pattern in json_patterns:
        matches = re.findall(pattern, content[:50000], re.IGNORECASE)
        if matches:
            _safe_print(f" {pattern}: {matches[:3]}")

    # Look for script tags with structured data
    script_matches = re.findall(r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', content, re.DOTALL)
    if script_matches:
        print("\n4. Structured data (JSON-LD) found:")
        for i, script in enumerate(script_matches[:2], 1):
            try:
                data = json.loads(script)
            except (ValueError, RecursionError):
                # Not valid JSON (json.JSONDecodeError is a ValueError) or
                # nested too deeply to parse: fall back to the raw text.
                _safe_print(f" Script {i}: {script[:300]}...")
            else:
                _safe_print(f" Script {i}: {json.dumps(data, indent=6)[:500]}...")
|
||||
Reference in New Issue
Block a user