Files
scaev/test/test_scraper.py
2025-12-04 14:49:58 +01:00

336 lines
12 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Test suite for Troostwijk Scraper
Tests both auction and lot parsing with cached data
Requires Python 3.10+
"""
import sys
# Refuse to run on interpreters older than Python 3.10: report the running
# version on stdout and exit with status 1.
if sys.version_info < (3, 10):
    print(
        "ERROR: This script requires Python 3.10 or higher",
        f"Current version: {sys.version}",
        sep="\n",
    )
    sys.exit(1)
import asyncio
import json
import sqlite3
from datetime import datetime
from pathlib import Path
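# Add this file's own directory (Path(__file__).parent) to the import path so
# that `main` can be imported below.
# NOTE(review): if main.py lives one level up, this needs `.parent.parent` - confirm.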
sys.path.insert(0, str(Path(__file__).parent))
from main import TroostwijkScraper, CacheManager, CACHE_DB
# Auction pages exercised by the suite. They are looked up in the scraper's
# cache, so running the tests does not put load on the server.
TEST_AUCTIONS = [
    "https://www.troostwijkauctions.com/a/online-auction-cnc-lathes-machining-centres-precision-measurement-romania-A7-39813",
    "https://www.troostwijkauctions.com/a/faillissement-bab-shortlease-i-ii-b-v-%E2%80%93-2024-big-ass-energieopslagsystemen-A1-39557",
    "https://www.troostwijkauctions.com/a/industriele-goederen-uit-diverse-bedrijfsbeeindigingen-A1-38675",
]
# Lot pages exercised by the suite (looked up in the cache by exact URL).
TEST_LOTS = [
    "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5",
    "https://www.troostwijkauctions.com/l/tos-sui-50-1000-universele-draaibank-A7-39568-9",
    "https://www.troostwijkauctions.com/l/rolcontainer-%25282x%2529-A1-40191-101",
]
class TestResult:
    """Outcome of a single parsing check.

    Attributes:
        url: The page URL the check was run for.
        success: True when the check passed, False otherwise.
        message: Human-readable summary (success note or list of issues).
        data: The parsed page data, when available; otherwise None.
    """

    # The class name starts with "Test" and the class defines __init__, so
    # pytest would try (and fail, with a warning) to collect it as a test
    # class. It is a plain result container, so opt out of collection.
    __test__ = False

    def __init__(self, url, success, message, data=None):
        self.url = url
        self.success = success
        self.message = message
        self.data = data

    def __repr__(self):
        # Parsed data can be large, so it is deliberately left out.
        return (
            f"{type(self).__name__}(url={self.url!r}, "
            f"success={self.success!r}, message={self.message!r})"
        )
class ScraperTester:
    """Run the scraper's page parser against cached pages and record results.

    Page content is obtained through ``self.scraper.cache`` and parsed with
    ``self.scraper._parse_page``. Each check produces a ``TestResult`` that
    is appended to ``self.results`` by ``run_all_tests``.
    """

    # Age limit handed to the cache so that even very old entries are returned.
    _ANY_AGE_HOURS = 999999

    def __init__(self):
        self.scraper = TroostwijkScraper()
        self.results = []

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _print_banner(title):
        """Print *title* framed by two 70-character rules, after a blank line."""
        print(f"\n{'='*70}")
        print(title)
        print('='*70)

    def _get_cached(self, url):
        """Return the cache entry for *url* regardless of its age, or None."""
        return self.scraper.cache.get(url, max_age_hours=self._ANY_AGE_HOURS)

    def _run_parse_test(self, url, label, expected_type, validate):
        """Shared driver for the auction and lot parsing checks.

        Args:
            url: Page URL to look up in the cache and parse.
            label: Word shown in the banner ("Auction" or "Lot").
            expected_type: Value the parsed data's ``'type'`` key must have.
            validate: Callable taking the parsed data and returning a list of
                issue strings (empty when every field is acceptable).

        Returns:
            A ``TestResult`` describing the outcome. Exceptions raised while
            parsing or validating are captured in the result, not propagated.
        """
        self._print_banner(f"Testing {label}: {url}")

        # A single cache lookup serves both the existence check and the read.
        cached = self._get_cached(url)
        if cached is None:
            return TestResult(
                url,
                False,
                "❌ NOT IN CACHE - Please run scraper first to cache this URL",
                None,
            )

        content = cached['content']
        # The entry's 'timestamp' is treated as epoch seconds.
        age_hours = (datetime.now().timestamp() - cached['timestamp']) / 3600
        print(f"✓ Cache hit (age: {age_hours:.1f} hours)")

        try:
            data = self.scraper._parse_page(content, url)
            if not data:
                return TestResult(url, False, "❌ Parsing returned None", None)

            actual_type = data.get('type')
            if actual_type != expected_type:
                return TestResult(
                    url,
                    False,
                    f"❌ Expected type='{expected_type}', got '{actual_type}'",
                    data,
                )

            issues = validate(data)
            if issues:
                return TestResult(url, False, "\n".join(issues), data)
            return TestResult(
                url,
                True,
                f"✅ All {expected_type} fields validated successfully",
                data,
            )
        except Exception as e:
            # Report the failure as a test result instead of aborting the run;
            # the traceback is included so the failing line can be located.
            import traceback
            return TestResult(
                url,
                False,
                f"❌ Exception during parsing: {e}\n{traceback.format_exc()}",
                None,
            )

    @staticmethod
    def _validate_auction_fields(data):
        """Check the required auction fields; return a list of issue strings."""
        required_fields = {
            'auction_id': str,
            'title': str,
            'location': str,
            'lots_count': int,
            'first_lot_closing_time': str,
        }
        issues = []
        for field, expected_type in required_fields.items():
            value = data.get(field)
            if value is None or value == '':
                issues.append(f"{field}: MISSING or EMPTY")
            elif not isinstance(value, expected_type):
                issues.append(
                    f"{field}: Wrong type (expected {expected_type.__name__}, "
                    f"got {type(value).__name__})"
                )
            else:
                # Show at most 60 characters of the value.
                print(f"{field}: {str(value)[:60]}")

        if not issues:
            print(f" ✓ lots_count: {data.get('lots_count')}")
        return issues

    @staticmethod
    def _validate_lot_fields(data):
        """Check the required lot fields; return a list of issue strings."""
        # field -> (expected type, predicate the value must satisfy)
        required_fields = {
            'lot_id': (str, bool),
            'title': (str, lambda x: len(x) > 3 and x not in ('...', 'N/A')),
            'location': (str, lambda x: len(x) > 2 and x not in ('Locatie', 'Location')),
            # Reject values that are just the untranslated page label.
            'current_bid': (str, lambda x: bool(x) and x not in ('€Huidig bod', 'Huidig bod')),
            'closing_time': (str, lambda x: True),  # Can be empty
            'images': (list, lambda x: True),  # Can be empty list
        }
        issues = []
        for field, (expected_type, validator) in required_fields.items():
            value = data.get(field)
            if value is None:
                issues.append(f"{field}: MISSING (None)")
            elif not isinstance(value, expected_type):
                issues.append(
                    f"{field}: Wrong type (expected {expected_type.__name__}, "
                    f"got {type(value).__name__})"
                )
            elif not validator(value):
                issues.append(f"{field}: Invalid value: '{value}'")
            elif field == 'images':
                print(f"{field}: {len(value)} images")
                # Only the first three images are listed.
                for i, img in enumerate(value[:3], 1):
                    print(f" {i}. {img[:60]}...")
            else:
                print(f"{field}: {str(value)[:60]}")

        # Optional fields: shown when present, never counted as issues.
        if data.get('bid_count') is not None:
            print(f" ✓ bid_count: {data.get('bid_count')}")
        if data.get('viewing_time'):
            print(f" ✓ viewing_time: {data.get('viewing_time')}")
        if data.get('pickup_date'):
            print(f" ✓ pickup_date: {data.get('pickup_date')}")
        return issues

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def check_cache_exists(self, url):
        """Return True when *url* has a cache entry of any age."""
        return self._get_cached(url) is not None

    def test_auction_parsing(self, url):
        """Parse the cached auction page at *url* and validate its fields.

        Returns:
            A ``TestResult``; ``success`` is False when the page is not
            cached, parsing fails, the type is not 'auction', or a required
            field is missing, empty or of the wrong type.
        """
        return self._run_parse_test(
            url, "Auction", 'auction', self._validate_auction_fields
        )

    def test_lot_parsing(self, url):
        """Parse the cached lot page at *url* and validate its fields.

        Returns:
            A ``TestResult``; ``success`` is False when the page is not
            cached, parsing fails, the type is not 'lot', or a required
            field is missing, of the wrong type or has a placeholder value.
        """
        return self._run_parse_test(
            url, "Lot", 'lot', self._validate_lot_fields
        )

    def run_all_tests(self):
        """Run every auction and lot check, then print the summary.

        Returns:
            True when no recorded result failed, False otherwise. This is
            the value of ``print_summary()``, so callers can use it to
            choose a process exit code.
        """
        self._print_banner("TROOSTWIJK SCRAPER TEST SUITE")
        print("\nThis test suite uses CACHED data only - no live requests to server")
        print("="*70)

        self._print_banner("TESTING AUCTIONS")
        for url in TEST_AUCTIONS:
            self.results.append(self.test_auction_parsing(url))

        self._print_banner("TESTING LOTS")
        for url in TEST_LOTS:
            self.results.append(self.test_lot_parsing(url))

        return self.print_summary()

    def print_summary(self):
        """Print totals and details of failed checks from ``self.results``.

        Returns:
            True when no recorded result failed (including when nothing was
            recorded), False otherwise.
        """
        self._print_banner("TEST SUMMARY")

        total = len(self.results)
        failures = [r for r in self.results if not r.success]
        failed = len(failures)
        passed = total - failed

        print(f"\nTotal tests: {total}")
        print(f"Passed: {passed}")
        print(f"Failed: {failed}")
        if total:
            print(f"Success rate: {passed/total*100:.1f}%")
        else:
            # Avoid dividing by zero when no checks were recorded.
            print("Success rate: n/a (no tests were run)")

        if failures:
            self._print_banner("FAILED TESTS:")
            for result in failures:
                print(f"\n{result.url}")
                print(result.message)
                if result.data:
                    print("\nParsed data:")
                    for key, value in result.data.items():
                        if key != 'lots':  # Don't print full lots array
                            print(f" {key}: {str(value)[:80]}")

        print("\n" + "="*70)
        return failed == 0
def check_cache_status():
    """Print a report on the cache database and on the test URLs in it.

    Reports the total number of rows in the ``cache`` table, how many have
    ``compressed = 1`` versus ``0``/``NULL``, and whether each URL in
    ``TEST_AUCTIONS`` and ``TEST_LOTS`` has a row. Any error (for example a
    missing table) is reported on stdout and not raised.
    """
    # Imported locally so this function does not depend on a file-level import.
    from contextlib import closing

    print("\n" + "="*70)
    print("CACHE STATUS CHECK")
    print("="*70)
    try:
        # sqlite3's own context manager only ends the transaction; closing()
        # makes sure the connection itself is released.
        with closing(sqlite3.connect(CACHE_DB)) as conn:
            total = conn.execute("SELECT COUNT(*) FROM cache").fetchone()[0]
            compressed = conn.execute(
                "SELECT COUNT(*) FROM cache WHERE compressed = 1"
            ).fetchone()[0]
            uncompressed = conn.execute(
                "SELECT COUNT(*) FROM cache WHERE compressed = 0 OR compressed IS NULL"
            ).fetchone()[0]

            def percent(count):
                # An empty cache would otherwise divide by zero and abort the report.
                return count / total * 100 if total else 0.0

            print(f"Total cache entries: {total}")
            print(f"Compressed: {compressed} ({percent(compressed):.1f}%)")
            print(f"Uncompressed: {uncompressed} ({percent(uncompressed):.1f}%)")
            if uncompressed > 0:
                print(f"\n⚠️ Warning: {uncompressed} entries are still uncompressed")
                print(" Run: python migrate_compress_cache.py")
            elif total > 0:
                print("\n✓ All cache entries are compressed!")
            else:
                print("\n⚠️ Cache is empty")

            # Check test URLs
            print(f"\n{'='*70}")
            print("TEST URL CACHE STATUS:")
            print('='*70)
            all_test_urls = TEST_AUCTIONS + TEST_LOTS
            cached_count = 0
            for url in all_test_urls:
                row = conn.execute(
                    "SELECT url FROM cache WHERE url = ?", (url,)
                ).fetchone()
                if row:
                    print(f"{url[:60]}...")
                    cached_count += 1
                else:
                    print(f"{url[:60]}... (NOT CACHED)")

            print(f"\n{cached_count}/{len(all_test_urls)} test URLs are cached")
            if cached_count < len(all_test_urls):
                print("\n⚠️ Some test URLs are not cached. Tests for those URLs will fail.")
                print(" Run the main scraper to cache these URLs first.")
    except Exception as e:
        # Best-effort diagnostics: a broken or missing cache must not stop the run.
        print(f"Error checking cache status: {e}")
if __name__ == "__main__":
    # Report on the cache first so missing entries are visible up front.
    check_cache_status()

    # Run the checks; the results are recorded on the tester.
    tester = ScraperTester()
    tester.run_all_tests()

    # Derive the exit status from the recorded results so that it reflects
    # the actual pass/fail outcome: 0 when every check passed, 1 otherwise.
    all_passed = all(result.success for result in tester.results)
    sys.exit(0 if all_passed else 1)