first
test/test_scraper.py (new file, 335 lines)
@@ -0,0 +1,335 @@
#!/usr/bin/env python3
"""
Test suite for the Troostwijk scraper.

Tests both auction and lot parsing with cached data.

Requires Python 3.10+
"""

import sys

# Require Python 3.10+
if sys.version_info < (3, 10):
    print("ERROR: This script requires Python 3.10 or higher")
    print(f"Current version: {sys.version}")
    sys.exit(1)

import sqlite3
from datetime import datetime
from pathlib import Path

# Add the repository root to sys.path so `main` can be imported
# (assumes main.py sits one level above test/)
sys.path.insert(0, str(Path(__file__).parent.parent))

from main import TroostwijkScraper, CacheManager, CACHE_DB

# Test URLs - these will use cached data to avoid overloading the server
TEST_AUCTIONS = [
    "https://www.troostwijkauctions.com/a/online-auction-cnc-lathes-machining-centres-precision-measurement-romania-A7-39813",
    "https://www.troostwijkauctions.com/a/faillissement-bab-shortlease-i-ii-b-v-%E2%80%93-2024-big-ass-energieopslagsystemen-A1-39557",
    "https://www.troostwijkauctions.com/a/industriele-goederen-uit-diverse-bedrijfsbeeindigingen-A1-38675",
]

TEST_LOTS = [
    "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5",
    "https://www.troostwijkauctions.com/l/tos-sui-50-1000-universele-draaibank-A7-39568-9",
    "https://www.troostwijkauctions.com/l/rolcontainer-%25282x%2529-A1-40191-101",
]
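
# Auction pages live under /a/ and lot pages under /l/; the trailing
# A<n>-<id> segment identifies the auction, and lot URLs append a lot index.
# _parse_page presumably keys off this distinction when setting 'type'.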

class TestResult:
    def __init__(self, url, success, message, data=None):
        self.url = url
        self.success = success
        self.message = message
        self.data = data

class ScraperTester:
    def __init__(self):
        self.scraper = TroostwijkScraper()
        self.results = []

    def check_cache_exists(self, url):
        """Check if URL is cached"""
        cached = self.scraper.cache.get(url, max_age_hours=999999)  # Accept even very old cache entries
        return cached is not None
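
    # Cache interface assumed throughout (provided by main.CacheManager):
    #   cache.get(url, max_age_hours) -> {'content': str, 'timestamp': float} or None
    # where 'timestamp' is a Unix epoch in seconds (see the age calculation below).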

    def test_auction_parsing(self, url):
        """Test auction page parsing"""
        print(f"\n{'='*70}")
        print(f"Testing Auction: {url}")
        print('='*70)

        # Check cache
        if not self.check_cache_exists(url):
            return TestResult(
                url,
                False,
                "❌ NOT IN CACHE - Please run scraper first to cache this URL",
                None,
            )

        # Get cached content
        cached = self.scraper.cache.get(url, max_age_hours=999999)
        content = cached['content']

        print(f"✓ Cache hit (age: {(datetime.now().timestamp() - cached['timestamp']) / 3600:.1f} hours)")

        # Parse
        try:
            data = self.scraper._parse_page(content, url)

            if not data:
                return TestResult(url, False, "❌ Parsing returned None", None)

            if data.get('type') != 'auction':
                return TestResult(
                    url,
                    False,
                    f"❌ Expected type='auction', got '{data.get('type')}'",
                    data,
                )

            # Validate required fields
            issues = []
            required_fields = {
                'auction_id': str,
                'title': str,
                'location': str,
                'lots_count': int,
                'first_lot_closing_time': str,
            }

            for field, expected_type in required_fields.items():
                value = data.get(field)
                if value is None or value == '':
                    issues.append(f"  ❌ {field}: MISSING or EMPTY")
                elif not isinstance(value, expected_type):
                    issues.append(f"  ❌ {field}: Wrong type (expected {expected_type.__name__}, got {type(value).__name__})")
                else:
                    # Pretty print value
                    display_value = str(value)[:60]
                    print(f"  ✓ {field}: {display_value}")

            if issues:
                return TestResult(url, False, "\n".join(issues), data)

            # lots_count was already printed in the validation loop above
            return TestResult(url, True, "✅ All auction fields validated successfully", data)

        except Exception as e:
            return TestResult(url, False, f"❌ Exception during parsing: {e}", None)

    def test_lot_parsing(self, url):
        """Test lot page parsing"""
        print(f"\n{'='*70}")
        print(f"Testing Lot: {url}")
        print('='*70)

        # Check cache
        if not self.check_cache_exists(url):
            return TestResult(
                url,
                False,
                "❌ NOT IN CACHE - Please run scraper first to cache this URL",
                None,
            )

        # Get cached content
        cached = self.scraper.cache.get(url, max_age_hours=999999)
        content = cached['content']

        print(f"✓ Cache hit (age: {(datetime.now().timestamp() - cached['timestamp']) / 3600:.1f} hours)")

        # Parse
        try:
            data = self.scraper._parse_page(content, url)

            if not data:
                return TestResult(url, False, "❌ Parsing returned None", None)

            if data.get('type') != 'lot':
                return TestResult(
                    url,
                    False,
                    f"❌ Expected type='lot', got '{data.get('type')}'",
                    data,
                )

            # Validate required fields
            issues = []
            required_fields = {
                'lot_id': (str, lambda x: x and len(x) > 0),
                'title': (str, lambda x: x and len(x) > 3 and x not in ['...', 'N/A']),
                'location': (str, lambda x: x and len(x) > 2 and x not in ['Locatie', 'Location']),
                'current_bid': (str, lambda x: x and x not in ['€Huidig bod', 'Huidig bod']),
                'closing_time': (str, lambda x: True),  # Can be empty
                'images': (list, lambda x: True),  # Can be empty list
            }
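
            # Each entry pairs an expected type with a quick sanity predicate;
            # the rejected strings are page-template placeholders ('Huidig bod'
            # is Dutch for 'current bid', 'Locatie' for 'location') that signal
            # the real value was never parsed out of the page.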

            for field, (expected_type, validator) in required_fields.items():
                value = data.get(field)

                if value is None:
                    issues.append(f"  ❌ {field}: MISSING (None)")
                elif not isinstance(value, expected_type):
                    issues.append(f"  ❌ {field}: Wrong type (expected {expected_type.__name__}, got {type(value).__name__})")
                elif not validator(value):
                    issues.append(f"  ❌ {field}: Invalid value: '{value}'")
                else:
                    # Pretty print value
                    if field == 'images':
                        print(f"  ✓ {field}: {len(value)} images")
                        for i, img in enumerate(value[:3], 1):
                            print(f"      {i}. {img[:60]}...")
                    else:
                        display_value = str(value)[:60]
                        print(f"  ✓ {field}: {display_value}")

            # Additional checks
            if data.get('bid_count') is not None:
                print(f"  ✓ bid_count: {data.get('bid_count')}")

            if data.get('viewing_time'):
                print(f"  ✓ viewing_time: {data.get('viewing_time')}")

            if data.get('pickup_date'):
                print(f"  ✓ pickup_date: {data.get('pickup_date')}")

            if issues:
                return TestResult(url, False, "\n".join(issues), data)

            return TestResult(url, True, "✅ All lot fields validated successfully", data)

        except Exception as e:
            import traceback
            return TestResult(url, False, f"❌ Exception during parsing: {e}\n{traceback.format_exc()}", None)

    def run_all_tests(self):
        """Run all tests"""
        print("\n" + "="*70)
        print("TROOSTWIJK SCRAPER TEST SUITE")
        print("="*70)
        print("\nThis test suite uses CACHED data only - no live requests to the server")
        print("="*70)

        # Test auctions
        print("\n" + "="*70)
        print("TESTING AUCTIONS")
        print("="*70)

        for url in TEST_AUCTIONS:
            result = self.test_auction_parsing(url)
            self.results.append(result)

        # Test lots
        print("\n" + "="*70)
        print("TESTING LOTS")
        print("="*70)

        for url in TEST_LOTS:
            result = self.test_lot_parsing(url)
            self.results.append(result)

        # Summary (return overall success so __main__ can set the exit code)
        return self.print_summary()

    def print_summary(self):
        """Print test summary and return True when every test passed"""
        print("\n" + "="*70)
        print("TEST SUMMARY")
        print("="*70)

        passed = sum(1 for r in self.results if r.success)
        failed = sum(1 for r in self.results if not r.success)
        total = len(self.results)

        print(f"\nTotal tests: {total}")
        print(f"Passed: {passed} ✓")
        print(f"Failed: {failed} ✗")
        if total:
            print(f"Success rate: {passed/total*100:.1f}%")

        if failed > 0:
            print("\n" + "="*70)
            print("FAILED TESTS:")
            print("="*70)
            for result in self.results:
                if not result.success:
                    print(f"\n{result.url}")
                    print(result.message)
                    if result.data:
                        print("\nParsed data:")
                        for key, value in result.data.items():
                            if key != 'lots':  # Don't print full lots array
                                print(f"  {key}: {str(value)[:80]}")

        print("\n" + "="*70)

        return failed == 0

def check_cache_status():
    """Check cache compression status"""
    print("\n" + "="*70)
    print("CACHE STATUS CHECK")
    print("="*70)

    try:
        with sqlite3.connect(CACHE_DB) as conn:
            # Total entries
            cursor = conn.execute("SELECT COUNT(*) FROM cache")
            total = cursor.fetchone()[0]

            # Compressed vs uncompressed
            cursor = conn.execute("SELECT COUNT(*) FROM cache WHERE compressed = 1")
            compressed = cursor.fetchone()[0]

            cursor = conn.execute("SELECT COUNT(*) FROM cache WHERE compressed = 0 OR compressed IS NULL")
            uncompressed = cursor.fetchone()[0]

            print(f"Total cache entries: {total}")
            if total:  # Guard against an empty cache to avoid dividing by zero
                print(f"Compressed: {compressed} ({compressed/total*100:.1f}%)")
                print(f"Uncompressed: {uncompressed} ({uncompressed/total*100:.1f}%)")

            if uncompressed > 0:
                print(f"\n⚠️ Warning: {uncompressed} entries are still uncompressed")
                print("   Run: python migrate_compress_cache.py")
            else:
                print("\n✓ All cache entries are compressed!")

            # Check test URLs
            print(f"\n{'='*70}")
            print("TEST URL CACHE STATUS:")
            print('='*70)

            all_test_urls = TEST_AUCTIONS + TEST_LOTS
            cached_count = 0

            for url in all_test_urls:
                cursor = conn.execute("SELECT url FROM cache WHERE url = ?", (url,))
                if cursor.fetchone():
                    print(f"✓ {url[:60]}...")
                    cached_count += 1
                else:
                    print(f"✗ {url[:60]}... (NOT CACHED)")

            print(f"\n{cached_count}/{len(all_test_urls)} test URLs are cached")

            if cached_count < len(all_test_urls):
                print("\n⚠️ Some test URLs are not cached. Tests for those URLs will fail.")
                print("   Run the main scraper to cache these URLs first.")

    except Exception as e:
        print(f"Error checking cache status: {e}")
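
# Note: only the cache columns referenced above (url, compressed) are assumed
# here; the table itself is created and owned by main.CacheManager.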

if __name__ == "__main__":
    # Check cache status first
    check_cache_status()

    # Run tests
    tester = ScraperTester()
    success = tester.run_all_tests()

    # Exit with appropriate code
    sys.exit(0 if success else 1)