#!/usr/bin/env python3 """ Test suite for Troostwijk Scraper Tests both auction and lot parsing with cached data Requires Python 3.10+ """ import sys # Require Python 3.10+ if sys.version_info < (3, 10): print("ERROR: This script requires Python 3.10 or higher") print(f"Current version: {sys.version}") sys.exit(1) import asyncio import json import sqlite3 from datetime import datetime from pathlib import Path # Add parent directory to path sys.path.insert(0, str(Path(__file__).parent)) from main import TroostwijkScraper, CacheManager, CACHE_DB # Test URLs - these will use cached data to avoid overloading the server TEST_AUCTIONS = [ "https://www.troostwijkauctions.com/a/online-auction-cnc-lathes-machining-centres-precision-measurement-romania-A7-39813", "https://www.troostwijkauctions.com/a/faillissement-bab-shortlease-i-ii-b-v-%E2%80%93-2024-big-ass-energieopslagsystemen-A1-39557", "https://www.troostwijkauctions.com/a/industriele-goederen-uit-diverse-bedrijfsbeeindigingen-A1-38675", ] TEST_LOTS = [ "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5", "https://www.troostwijkauctions.com/l/tos-sui-50-1000-universele-draaibank-A7-39568-9", "https://www.troostwijkauctions.com/l/rolcontainer-%25282x%2529-A1-40191-101", ] class TestResult: def __init__(self, url, success, message, data=None): self.url = url self.success = success self.message = message self.data = data class ScraperTester: def __init__(self): self.scraper = TroostwijkScraper() self.results = [] def check_cache_exists(self, url): """Check if URL is cached""" cached = self.scraper.cache.get(url, max_age_hours=999999) # Get even old cache return cached is not None def test_auction_parsing(self, url): """Test auction page parsing""" print(f"\n{'='*70}") print(f"Testing Auction: {url}") print('='*70) # Check cache if not self.check_cache_exists(url): return TestResult( url, False, "❌ NOT IN CACHE - Please run scraper first to cache this URL", None ) # Get cached content cached = self.scraper.cache.get(url, max_age_hours=999999) content = cached['content'] print(f"✓ Cache hit (age: {(datetime.now().timestamp() - cached['timestamp']) / 3600:.1f} hours)") # Parse try: data = self.scraper._parse_page(content, url) if not data: return TestResult(url, False, "❌ Parsing returned None", None) if data.get('type') != 'auction': return TestResult( url, False, f"❌ Expected type='auction', got '{data.get('type')}'", data ) # Validate required fields issues = [] required_fields = { 'auction_id': str, 'title': str, 'location': str, 'lots_count': int, 'first_lot_closing_time': str, } for field, expected_type in required_fields.items(): value = data.get(field) if value is None or value == '': issues.append(f" ❌ {field}: MISSING or EMPTY") elif not isinstance(value, expected_type): issues.append(f" ❌ {field}: Wrong type (expected {expected_type.__name__}, got {type(value).__name__})") else: # Pretty print value display_value = str(value)[:60] print(f" ✓ {field}: {display_value}") if issues: return TestResult(url, False, "\n".join(issues), data) print(f" ✓ lots_count: {data.get('lots_count')}") return TestResult(url, True, "✅ All auction fields validated successfully", data) except Exception as e: return TestResult(url, False, f"❌ Exception during parsing: {e}", None) def test_lot_parsing(self, url): """Test lot page parsing""" print(f"\n{'='*70}") print(f"Testing Lot: {url}") print('='*70) # Check cache if not self.check_cache_exists(url): return TestResult( url, False, "❌ NOT IN CACHE - Please run scraper first to cache this URL", None ) # Get cached content cached = self.scraper.cache.get(url, max_age_hours=999999) content = cached['content'] print(f"✓ Cache hit (age: {(datetime.now().timestamp() - cached['timestamp']) / 3600:.1f} hours)") # Parse try: data = self.scraper._parse_page(content, url) if not data: return TestResult(url, False, "❌ Parsing returned None", None) if data.get('type') != 'lot': return TestResult( url, False, f"❌ Expected type='lot', got '{data.get('type')}'", data ) # Validate required fields issues = [] required_fields = { 'lot_id': (str, lambda x: x and len(x) > 0), 'title': (str, lambda x: x and len(x) > 3 and x not in ['...', 'N/A']), 'location': (str, lambda x: x and len(x) > 2 and x not in ['Locatie', 'Location']), 'current_bid': (str, lambda x: x and x not in ['€Huidig ​​bod', 'Huidig bod']), 'closing_time': (str, lambda x: True), # Can be empty 'images': (list, lambda x: True), # Can be empty list } for field, (expected_type, validator) in required_fields.items(): value = data.get(field) if value is None: issues.append(f" ❌ {field}: MISSING (None)") elif not isinstance(value, expected_type): issues.append(f" ❌ {field}: Wrong type (expected {expected_type.__name__}, got {type(value).__name__})") elif not validator(value): issues.append(f" ❌ {field}: Invalid value: '{value}'") else: # Pretty print value if field == 'images': print(f" ✓ {field}: {len(value)} images") for i, img in enumerate(value[:3], 1): print(f" {i}. {img[:60]}...") else: display_value = str(value)[:60] print(f" ✓ {field}: {display_value}") # Additional checks if data.get('bid_count') is not None: print(f" ✓ bid_count: {data.get('bid_count')}") if data.get('viewing_time'): print(f" ✓ viewing_time: {data.get('viewing_time')}") if data.get('pickup_date'): print(f" ✓ pickup_date: {data.get('pickup_date')}") if issues: return TestResult(url, False, "\n".join(issues), data) return TestResult(url, True, "✅ All lot fields validated successfully", data) except Exception as e: import traceback return TestResult(url, False, f"❌ Exception during parsing: {e}\n{traceback.format_exc()}", None) def run_all_tests(self): """Run all tests""" print("\n" + "="*70) print("TROOSTWIJK SCRAPER TEST SUITE") print("="*70) print("\nThis test suite uses CACHED data only - no live requests to server") print("="*70) # Test auctions print("\n" + "="*70) print("TESTING AUCTIONS") print("="*70) for url in TEST_AUCTIONS: result = self.test_auction_parsing(url) self.results.append(result) # Test lots print("\n" + "="*70) print("TESTING LOTS") print("="*70) for url in TEST_LOTS: result = self.test_lot_parsing(url) self.results.append(result) # Summary self.print_summary() def print_summary(self): """Print test summary""" print("\n" + "="*70) print("TEST SUMMARY") print("="*70) passed = sum(1 for r in self.results if r.success) failed = sum(1 for r in self.results if not r.success) total = len(self.results) print(f"\nTotal tests: {total}") print(f"Passed: {passed} ✓") print(f"Failed: {failed} ✗") print(f"Success rate: {passed/total*100:.1f}%") if failed > 0: print("\n" + "="*70) print("FAILED TESTS:") print("="*70) for result in self.results: if not result.success: print(f"\n{result.url}") print(result.message) if result.data: print("\nParsed data:") for key, value in result.data.items(): if key != 'lots': # Don't print full lots array print(f" {key}: {str(value)[:80]}") print("\n" + "="*70) return failed == 0 def check_cache_status(): """Check cache compression status""" print("\n" + "="*70) print("CACHE STATUS CHECK") print("="*70) try: with sqlite3.connect(CACHE_DB) as conn: # Total entries cursor = conn.execute("SELECT COUNT(*) FROM cache") total = cursor.fetchone()[0] # Compressed vs uncompressed cursor = conn.execute("SELECT COUNT(*) FROM cache WHERE compressed = 1") compressed = cursor.fetchone()[0] cursor = conn.execute("SELECT COUNT(*) FROM cache WHERE compressed = 0 OR compressed IS NULL") uncompressed = cursor.fetchone()[0] print(f"Total cache entries: {total}") print(f"Compressed: {compressed} ({compressed/total*100:.1f}%)") print(f"Uncompressed: {uncompressed} ({uncompressed/total*100:.1f}%)") if uncompressed > 0: print(f"\n⚠️ Warning: {uncompressed} entries are still uncompressed") print(" Run: python migrate_compress_cache.py") else: print("\n✓ All cache entries are compressed!") # Check test URLs print(f"\n{'='*70}") print("TEST URL CACHE STATUS:") print('='*70) all_test_urls = TEST_AUCTIONS + TEST_LOTS cached_count = 0 for url in all_test_urls: cursor = conn.execute("SELECT url FROM cache WHERE url = ?", (url,)) if cursor.fetchone(): print(f"✓ {url[:60]}...") cached_count += 1 else: print(f"✗ {url[:60]}... (NOT CACHED)") print(f"\n{cached_count}/{len(all_test_urls)} test URLs are cached") if cached_count < len(all_test_urls): print("\n⚠️ Some test URLs are not cached. Tests for those URLs will fail.") print(" Run the main scraper to cache these URLs first.") except Exception as e: print(f"Error checking cache status: {e}") if __name__ == "__main__": # Check cache status first check_cache_status() # Run tests tester = ScraperTester() success = tester.run_all_tests() # Exit with appropriate code sys.exit(0 if success else 1)