first

2025-12-04 14:49:58 +01:00
commit 79e14be37a
22 changed files with 2765 additions and 0 deletions
--- a/test/test_scraper.py
+++ b/test/test_scraper.py
@@ -0,0 +1,335 @@
+#!/usr/bin/env python3
+"""
+Test suite for Troostwijk Scraper
+Tests both auction and lot parsing with cached data
+
+Requires Python 3.10+
+"""
+
+import sys
+
+# Refuse to run on interpreters older than Python 3.10.
+if not sys.version_info >= (3, 10):
+    print(
+        "ERROR: This script requires Python 3.10 or higher",
+        f"Current version: {sys.version}",
+        sep="\n",
+    )
+    sys.exit(1)
+
+import asyncio
+import json
+import sqlite3
+from datetime import datetime
+from pathlib import Path
+
+# Make the `main` module imported below findable. This file lives in
+# `test/`, so the directory above it is added too (presumably where main.py
+# sits - confirm against the repository layout). The script's own directory
+# is inserted last so it stays first on sys.path, as it was before.
+# resolve() matters: with a relative __file__, `.parent.parent` would
+# otherwise collapse to ".".
+_TEST_DIR = Path(__file__).resolve().parent
+sys.path.insert(0, str(_TEST_DIR.parent))
+sys.path.insert(0, str(_TEST_DIR))
+
+from main import TroostwijkScraper, CacheManager, CACHE_DB
+
+# Test URLs - only their cached copies are parsed, so the server is never overloaded
+TEST_AUCTIONS = [
+    "https://www.troostwijkauctions.com/a/online-auction-cnc-lathes-machining-centres-precision-measurement-romania-A7-39813",
+    "https://www.troostwijkauctions.com/a/faillissement-bab-shortlease-i-ii-b-v-%E2%80%93-2024-big-ass-energieopslagsystemen-A1-39557",
+    "https://www.troostwijkauctions.com/a/industriele-goederen-uit-diverse-bedrijfsbeeindigingen-A1-38675",
+]
+
+TEST_LOTS = [
+    "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5",
+    "https://www.troostwijkauctions.com/l/tos-sui-50-1000-universele-draaibank-A7-39568-9",
+    "https://www.troostwijkauctions.com/l/rolcontainer-%25282x%2529-A1-40191-101",
+]
+
+class TestResult:
+    """Outcome of checking one URL.
+
+    Stores the URL, a success flag, a human-readable message and the
+    parsed data that was being checked (``None`` unless supplied).
+    """
+
+    def __init__(self, url, success, message, data=None):
+        self.url, self.success = url, success
+        self.message, self.data = message, data
+
+class ScraperTester:
+    """Check the scraper's page parsing against pages held in its cache.
+
+    Each check looks the URL up in ``self.scraper.cache``, feeds the cached
+    content to ``self.scraper._parse_page`` and returns a ``TestResult``.
+    ``run_all_tests`` collects those results in ``self.results`` and reports
+    whether every one of them passed.
+    """
+
+    # Passed as ``max_age_hours`` so that even very old cache entries count.
+    _ANY_AGE_HOURS = 999999
+
+    def __init__(self):
+        self.scraper = TroostwijkScraper()
+        self.results = []
+
+    def _load_cached(self, url):
+        """Return what the scraper cache holds for *url*, ignoring its age.
+
+        A return value of ``None`` is treated as a cache miss by the callers.
+        """
+        return self.scraper.cache.get(url, max_age_hours=self._ANY_AGE_HOURS)
+
+    def check_cache_exists(self, url):
+        """Return True when the scraper cache has an entry for *url*."""
+        return self._load_cached(url) is not None
+
+    def _run_parse_test(self, url, label, page_type, validate):
+        """Parse the cached page for *url* and validate the parsed data.
+
+        Args:
+            url: Page URL to look up in the cache.
+            label: Human-readable page kind used in the printed banner.
+            page_type: Value the parsed ``type`` field must have.
+            validate: Callable taking the parsed dict and returning a list
+                of issue strings (empty when everything is fine).
+
+        Returns:
+            A ``TestResult`` describing success or the reason for failure.
+        """
+        rule = '=' * 70
+        print(f"\n{rule}")
+        print(f"Testing {label}: {url}")
+        print(rule)
+
+        # One lookup serves both the existence check and the content fetch.
+        cached = self._load_cached(url)
+        if cached is None:
+            return TestResult(
+                url,
+                False,
+                "❌ NOT IN CACHE - Please run scraper first to cache this URL",
+                None
+            )
+
+        content = cached['content']
+        age_hours = (datetime.now().timestamp() - cached['timestamp']) / 3600
+        print(f"✓ Cache hit (age: {age_hours:.1f} hours)")
+
+        try:
+            data = self.scraper._parse_page(content, url)
+
+            if not data:
+                return TestResult(url, False, "❌ Parsing returned None", None)
+
+            found_type = data.get('type')
+            if found_type != page_type:
+                return TestResult(
+                    url,
+                    False,
+                    f"❌ Expected type='{page_type}', got '{found_type}'",
+                    data
+                )
+
+            issues = validate(data)
+        except Exception as e:
+            # Any parser or validator failure becomes a failed result, with
+            # the traceback attached so the cause can be located.
+            import traceback
+            return TestResult(
+                url,
+                False,
+                f"❌ Exception during parsing: {e}\n{traceback.format_exc()}",
+                None
+            )
+
+        if issues:
+            return TestResult(url, False, "\n".join(issues), data)
+
+        return TestResult(url, True, f"✅ All {page_type} fields validated successfully", data)
+
+    @staticmethod
+    def _validate_auction(data):
+        """Print each valid auction field and return a list of issue strings."""
+        required_fields = {
+            'auction_id': str,
+            'title': str,
+            'location': str,
+            'lots_count': int,
+            'first_lot_closing_time': str,
+        }
+
+        issues = []
+        for field, expected_type in required_fields.items():
+            value = data.get(field)
+            if value is None or value == '':
+                issues.append(f"  ❌ {field}: MISSING or EMPTY")
+            elif not isinstance(value, expected_type):
+                issues.append(f"  ❌ {field}: Wrong type (expected {expected_type.__name__}, got {type(value).__name__})")
+            else:
+                print(f"  ✓ {field}: {str(value)[:60]}")
+        return issues
+
+    @staticmethod
+    def _validate_lot(data):
+        """Print each valid lot field and return a list of issue strings."""
+        # Each entry is (expected type, extra check run on a value of that type).
+        required_fields = {
+            'lot_id': (str, lambda x: len(x) > 0),
+            'title': (str, lambda x: len(x) > 3 and x not in ('...', 'N/A')),
+            'location': (str, lambda x: len(x) > 2 and x not in ('Locatie', 'Location')),
+            'current_bid': (str, lambda x: bool(x) and x not in ('€Huidig bod', 'Huidig bod')),
+            'closing_time': (str, lambda x: True),  # Can be empty
+            'images': (list, lambda x: True),  # Can be empty list
+        }
+
+        issues = []
+        for field, (expected_type, validator) in required_fields.items():
+            value = data.get(field)
+
+            if value is None:
+                issues.append(f"  ❌ {field}: MISSING (None)")
+            elif not isinstance(value, expected_type):
+                issues.append(f"  ❌ {field}: Wrong type (expected {expected_type.__name__}, got {type(value).__name__})")
+            elif not validator(value):
+                issues.append(f"  ❌ {field}: Invalid value: '{value}'")
+            elif field == 'images':
+                print(f"  ✓ {field}: {len(value)} images")
+                # Show only the first few image entries.
+                for i, img in enumerate(value[:3], 1):
+                    print(f"      {i}. {img[:60]}...")
+            else:
+                print(f"  ✓ {field}: {str(value)[:60]}")
+
+        # Optional fields are reported when present but never cause a failure.
+        if data.get('bid_count') is not None:
+            print(f"  ✓ bid_count: {data.get('bid_count')}")
+
+        for optional in ('viewing_time', 'pickup_date'):
+            if data.get(optional):
+                print(f"  ✓ {optional}: {data.get(optional)}")
+
+        return issues
+
+    def test_auction_parsing(self, url):
+        """Test auction page parsing"""
+        return self._run_parse_test(url, "Auction", "auction", self._validate_auction)
+
+    def test_lot_parsing(self, url):
+        """Test lot page parsing"""
+        return self._run_parse_test(url, "Lot", "lot", self._validate_lot)
+
+    def run_all_tests(self):
+        """Run every auction and lot test, then print the summary.
+
+        Returns:
+            True when no test failed, False otherwise (the value returned by
+            ``print_summary``), so callers can derive an exit status from it.
+        """
+        rule = "=" * 70
+        print("\n" + rule)
+        print("TROOSTWIJK SCRAPER TEST SUITE")
+        print(rule)
+        print("\nThis test suite uses CACHED data only - no live requests to server")
+        print(rule)
+
+        # Test auctions
+        print("\n" + rule)
+        print("TESTING AUCTIONS")
+        print(rule)
+
+        for url in TEST_AUCTIONS:
+            self.results.append(self.test_auction_parsing(url))
+
+        # Test lots
+        print("\n" + rule)
+        print("TESTING LOTS")
+        print(rule)
+
+        for url in TEST_LOTS:
+            self.results.append(self.test_lot_parsing(url))
+
+        # Summary
+        return self.print_summary()
+
+    def print_summary(self):
+        """Print totals and details of failed tests.
+
+        Returns:
+            True when ``self.results`` contains no failed result.
+        """
+        rule = "=" * 70
+        print("\n" + rule)
+        print("TEST SUMMARY")
+        print(rule)
+
+        total = len(self.results)
+        passed = sum(1 for r in self.results if r.success)
+        failed = total - passed
+
+        print(f"\nTotal tests: {total}")
+        print(f"Passed: {passed} ✓")
+        print(f"Failed: {failed} ✗")
+        if total:
+            print(f"Success rate: {passed/total*100:.1f}%")
+        else:
+            # No results recorded: a rate is undefined, so avoid dividing by zero.
+            print("Success rate: n/a (no tests were run)")
+
+        if failed > 0:
+            print("\n" + rule)
+            print("FAILED TESTS:")
+            print(rule)
+            for result in self.results:
+                if result.success:
+                    continue
+                print(f"\n{result.url}")
+                print(result.message)
+                if result.data:
+                    print("\nParsed data:")
+                    for key, value in result.data.items():
+                        if key != 'lots':  # Don't print full lots array
+                            print(f"  {key}: {str(value)[:80]}")
+
+        print("\n" + rule)
+
+        return failed == 0
+
+def check_cache_status():
+    """Print a report on the cache database used by the tests.
+
+    Reports the number of rows in the ``cache`` table, how many of them are
+    compressed vs. uncompressed, and which of the TEST_AUCTIONS / TEST_LOTS
+    URLs are present. Purely informational: nothing is returned, and any
+    error is printed instead of raised so the caller can carry on.
+    """
+    # Local import keeps the new dependency inside this function.
+    from contextlib import closing
+
+    rule = "=" * 70
+    print("\n" + rule)
+    print("CACHE STATUS CHECK")
+    print(rule)
+
+    try:
+        # A sqlite3 connection used as a context manager only commits or
+        # rolls back; closing() ensures the connection is actually closed.
+        with closing(sqlite3.connect(CACHE_DB)) as conn:
+            # Total entries
+            total = conn.execute("SELECT COUNT(*) FROM cache").fetchone()[0]
+
+            # Compressed vs uncompressed
+            compressed = conn.execute(
+                "SELECT COUNT(*) FROM cache WHERE compressed = 1"
+            ).fetchone()[0]
+            uncompressed = conn.execute(
+                "SELECT COUNT(*) FROM cache WHERE compressed = 0 OR compressed IS NULL"
+            ).fetchone()[0]
+
+            print(f"Total cache entries: {total}")
+            if total == 0:
+                # Percentages are undefined for an empty cache; say so rather
+                # than dividing by zero and skipping the URL report below.
+                print("\n⚠️  Cache is empty - run the main scraper to populate it.")
+            else:
+                print(f"Compressed: {compressed} ({compressed/total*100:.1f}%)")
+                print(f"Uncompressed: {uncompressed} ({uncompressed/total*100:.1f}%)")
+
+                if uncompressed > 0:
+                    print(f"\n⚠️  Warning: {uncompressed} entries are still uncompressed")
+                    print("   Run: python migrate_compress_cache.py")
+                else:
+                    print("\n✓ All cache entries are compressed!")
+
+            # Check test URLs
+            print(f"\n{rule}")
+            print("TEST URL CACHE STATUS:")
+            print(rule)
+
+            all_test_urls = TEST_AUCTIONS + TEST_LOTS
+            cached_count = 0
+
+            for url in all_test_urls:
+                row = conn.execute("SELECT url FROM cache WHERE url = ?", (url,)).fetchone()
+                if row:
+                    print(f"✓ {url[:60]}...")
+                    cached_count += 1
+                else:
+                    print(f"✗ {url[:60]}... (NOT CACHED)")
+
+            print(f"\n{cached_count}/{len(all_test_urls)} test URLs are cached")
+
+            if cached_count < len(all_test_urls):
+                print("\n⚠️  Some test URLs are not cached. Tests for those URLs will fail.")
+                print("   Run the main scraper to cache these URLs first.")
+
+    except Exception as e:
+        # Deliberately broad: this is a best-effort diagnostic and must not
+        # stop the test run that follows it.
+        print(f"Error checking cache status: {e}")
+
+if __name__ == "__main__":
+    # Report on the cache first, then run the suite.
+    check_cache_status()
+    all_passed = ScraperTester().run_all_tests()
+
+    # Exit status is 0 only when run_all_tests() returned a truthy value.
+    sys.exit(1 if not all_passed else 0)