scaev/test/test_scraper.py

#!/usr/bin/env python3
"""
Test suite for Troostwijk Scraper
Tests both auction and lot parsing with cached data

Requires Python 3.10+
"""

import sys

# Require Python 3.10+
if sys.version_info < (3, 10):
    print("ERROR: This script requires Python 3.10 or higher")
    print(f"Current version: {sys.version}")
    sys.exit(1)

import asyncio
import json
import sqlite3
from datetime import datetime
from pathlib import Path

# Put this file's own directory (Path(__file__).parent) at the front of
# sys.path so the `main` module imported below is looked up there first.
# NOTE(review): this is the directory that contains this file, not that
# directory's parent; if main.py lives one level up this would need
# `.parent.parent` -- confirm against the repository layout.
sys.path.insert(0, str(Path(__file__).parent))

from main import TroostwijkScraper, CacheManager, CACHE_DB

# Test URLs - these will use cached data to avoid overloading the server.
# The strings are used verbatim as cache lookup keys, so they must match the
# cached URL exactly, including the percent-encoding.

# Auction pages ("/a/" paths).
TEST_AUCTIONS = [
    "https://www.troostwijkauctions.com/a/online-auction-cnc-lathes-machining-centres-precision-measurement-romania-A7-39813",
    "https://www.troostwijkauctions.com/a/faillissement-bab-shortlease-i-ii-b-v-%E2%80%93-2024-big-ass-energieopslagsystemen-A1-39557",
    "https://www.troostwijkauctions.com/a/industriele-goederen-uit-diverse-bedrijfsbeeindigingen-A1-38675",
]

# Lot pages ("/l/" paths). Note the double percent-encoding in two of them:
# "%2528" / "%2529" decode once to "%28" / "%29", i.e. "(" and ")".
TEST_LOTS = [
    "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5",
    "https://www.troostwijkauctions.com/l/tos-sui-50-1000-universele-draaibank-A7-39568-9",
    "https://www.troostwijkauctions.com/l/rolcontainer-%25282x%2529-A1-40191-101",
]

class TestResult:
    """Outcome of a single parsing check.

    Attributes are plain values set once by the constructor:
    ``url`` is the page that was checked, ``success`` tells whether the check
    passed, ``message`` is the human-readable verdict, and ``data`` is the
    parsed payload (``None`` when nothing was attached).
    """

    def __init__(self, url, success, message, data=None):
        # Store all four fields in one step; ``data`` stays optional.
        self.url, self.success = url, success
        self.message, self.data = message, data

class ScraperTester:
    """Run the scraper's page parser against cached auction and lot pages.

    Page content is read from ``self.scraper.cache`` and handed to
    ``self.scraper._parse_page``; no page is fetched by this class. Every
    outcome is recorded in ``self.results`` as a ``TestResult``.
    """

    # Large enough that a cache entry is accepted regardless of its age.
    _ANY_AGE_HOURS = 999999
    _NOT_CACHED_MESSAGE = "❌ NOT IN CACHE - Please run scraper first to cache this URL"

    def __init__(self):
        """Create the scraper under test and an empty result list."""
        self.scraper = TroostwijkScraper()
        self.results = []

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _get_cached(self, url):
        """Return the cache entry for *url* (any age), or None if absent."""
        return self.scraper.cache.get(url, max_age_hours=self._ANY_AGE_HOURS)

    @staticmethod
    def _print_header(kind, url):
        """Print the banner that introduces one test ("Auction" or "Lot")."""
        print(f"\n{'='*70}")
        print(f"Testing {kind}: {url}")
        print('='*70)

    @staticmethod
    def _print_cache_age(cached):
        """Print how old a cache entry is, in hours.

        ``cached['timestamp']`` is compared with the current epoch time, so
        it is assumed to be expressed in epoch seconds.
        """
        age_hours = (datetime.now().timestamp() - cached['timestamp']) / 3600
        print(f"✓ Cache hit (age: {age_hours:.1f} hours)")

    @staticmethod
    def _exception_result(url, exc):
        """Build the failing result for an exception raised while parsing.

        Must be called from inside the ``except`` block so that
        ``traceback.format_exc()`` still sees the active exception.
        """
        import traceback
        return TestResult(
            url,
            False,
            f"❌ Exception during parsing: {exc}\n{traceback.format_exc()}",
            None,
        )

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def check_cache_exists(self, url):
        """Return True if *url* has a cache entry, whatever its age."""
        return self._get_cached(url) is not None

    def test_auction_parsing(self, url):
        """Parse the cached auction page for *url* and validate the result.

        Returns a ``TestResult``. The check fails when the URL is not
        cached, when parsing returns nothing or raises, when the parsed
        ``type`` is not ``'auction'``, or when a required field is missing,
        empty, or of the wrong type.
        """
        self._print_header("Auction", url)

        # One lookup serves both the existence check and the content fetch.
        cached = self._get_cached(url)
        if cached is None:
            return TestResult(url, False, self._NOT_CACHED_MESSAGE, None)

        content = cached['content']
        self._print_cache_age(cached)

        try:
            data = self.scraper._parse_page(content, url)

            if not data:
                return TestResult(url, False, "❌ Parsing returned None", None)

            if data.get('type') != 'auction':
                return TestResult(
                    url,
                    False,
                    f"❌ Expected type='auction', got '{data.get('type')}'",
                    data
                )

            # Required fields and the type each one must have.
            required_fields = {
                'auction_id': str,
                'title': str,
                'location': str,
                'lots_count': int,
                'first_lot_closing_time': str,
            }

            issues = []
            for field, expected_type in required_fields.items():
                value = data.get(field)
                if value is None or value == '':
                    issues.append(f"  ❌ {field}: MISSING or EMPTY")
                elif not isinstance(value, expected_type):
                    issues.append(
                        f"  ❌ {field}: Wrong type (expected {expected_type.__name__}, got {type(value).__name__})"
                    )
                else:
                    # Truncate long values so each field stays on one line.
                    print(f"  ✓ {field}: {str(value)[:60]}")

            if issues:
                return TestResult(url, False, "\n".join(issues), data)

            return TestResult(url, True, "✅ All auction fields validated successfully", data)

        except Exception as e:
            return self._exception_result(url, e)

    def test_lot_parsing(self, url):
        """Parse the cached lot page for *url* and validate the result.

        Returns a ``TestResult``. The check fails when the URL is not
        cached, when parsing returns nothing or raises, when the parsed
        ``type`` is not ``'lot'``, or when a required field is ``None``, of
        the wrong type, or rejected by its validator (for example a
        placeholder label instead of a real value).
        """
        self._print_header("Lot", url)

        # One lookup serves both the existence check and the content fetch.
        cached = self._get_cached(url)
        if cached is None:
            return TestResult(url, False, self._NOT_CACHED_MESSAGE, None)

        content = cached['content']
        self._print_cache_age(cached)

        try:
            data = self.scraper._parse_page(content, url)

            if not data:
                return TestResult(url, False, "❌ Parsing returned None", None)

            if data.get('type') != 'lot':
                return TestResult(
                    url,
                    False,
                    f"❌ Expected type='lot', got '{data.get('type')}'",
                    data
                )

            # field -> (required type, predicate the value must satisfy).
            # The exclusion lists reject placeholder labels.
            required_fields = {
                'lot_id': (str, lambda x: x and len(x) > 0),
                'title': (str, lambda x: x and len(x) > 3 and x not in ['...', 'N/A']),
                'location': (str, lambda x: x and len(x) > 2 and x not in ['Locatie', 'Location']),
                'current_bid': (str, lambda x: x and x not in ['€Huidig bod', 'Huidig bod']),
                'closing_time': (str, lambda x: True),  # Can be empty
                'images': (list, lambda x: True),  # Can be empty list
            }

            issues = []
            for field, (expected_type, validator) in required_fields.items():
                value = data.get(field)

                if value is None:
                    issues.append(f"  ❌ {field}: MISSING (None)")
                elif not isinstance(value, expected_type):
                    issues.append(
                        f"  ❌ {field}: Wrong type (expected {expected_type.__name__}, got {type(value).__name__})"
                    )
                elif not validator(value):
                    issues.append(f"  ❌ {field}: Invalid value: '{value}'")
                elif field == 'images':
                    # Show the count plus a preview of at most three entries.
                    print(f"  ✓ {field}: {len(value)} images")
                    for i, img in enumerate(value[:3], 1):
                        print(f"      {i}. {img[:60]}...")
                else:
                    print(f"  ✓ {field}: {str(value)[:60]}")

            # Optional fields: reported when present, never required.
            if data.get('bid_count') is not None:
                print(f"  ✓ bid_count: {data.get('bid_count')}")

            for optional in ('viewing_time', 'pickup_date'):
                if data.get(optional):
                    print(f"  ✓ {optional}: {data.get(optional)}")

            if issues:
                return TestResult(url, False, "\n".join(issues), data)

            return TestResult(url, True, "✅ All lot fields validated successfully", data)

        except Exception as e:
            return self._exception_result(url, e)

    def run_all_tests(self):
        """Run every auction and lot test, then print the summary.

        Returns True when no test failed, False otherwise, so callers can
        turn the outcome into a process exit status.
        """
        print("\n" + "="*70)
        print("TROOSTWIJK SCRAPER TEST SUITE")
        print("="*70)
        print("\nThis test suite uses CACHED data only - no live requests to server")
        print("="*70)

        print("\n" + "="*70)
        print("TESTING AUCTIONS")
        print("="*70)

        for url in TEST_AUCTIONS:
            self.results.append(self.test_auction_parsing(url))

        print("\n" + "="*70)
        print("TESTING LOTS")
        print("="*70)

        for url in TEST_LOTS:
            self.results.append(self.test_lot_parsing(url))

        # Propagate the verdict instead of dropping it.
        return self.print_summary()

    def print_summary(self):
        """Print totals and details of failed tests.

        Returns True when no recorded result failed (including the case of
        no results at all), False otherwise.
        """
        print("\n" + "="*70)
        print("TEST SUMMARY")
        print("="*70)

        total = len(self.results)
        passed = sum(1 for r in self.results if r.success)
        failed = total - passed

        print(f"\nTotal tests: {total}")
        print(f"Passed: {passed} ✓")
        print(f"Failed: {failed} ✗")
        if total:
            print(f"Success rate: {passed/total*100:.1f}%")
        else:
            # Avoid dividing by zero when nothing was recorded.
            print("Success rate: n/a (no tests were run)")

        if failed > 0:
            print("\n" + "="*70)
            print("FAILED TESTS:")
            print("="*70)
            for result in self.results:
                if result.success:
                    continue
                print(f"\n{result.url}")
                print(result.message)
                if result.data:
                    print("\nParsed data:")
                    for key, value in result.data.items():
                        if key != 'lots':  # Don't print full lots array
                            print(f"  {key}: {str(value)[:80]}")

        print("\n" + "="*70)

        return failed == 0

def check_cache_status():
    """Print a report on the cache database.

    The report has two parts: how many rows of the ``cache`` table are
    compressed versus uncompressed, and which of the test URLs
    (``TEST_AUCTIONS`` + ``TEST_LOTS``) are present in the table.

    Returns None. Any error (for example a missing table) is reported on
    stdout rather than raised, so a broken cache never prevents the test
    run that follows.
    """
    from contextlib import closing

    print("\n" + "="*70)
    print("CACHE STATUS CHECK")
    print("="*70)

    try:
        # sqlite3's own connection context manager only commits or rolls
        # back; closing() is what actually releases the connection.
        with closing(sqlite3.connect(CACHE_DB)) as conn:
            total = conn.execute("SELECT COUNT(*) FROM cache").fetchone()[0]
            compressed = conn.execute(
                "SELECT COUNT(*) FROM cache WHERE compressed = 1"
            ).fetchone()[0]
            uncompressed = conn.execute(
                "SELECT COUNT(*) FROM cache WHERE compressed = 0 OR compressed IS NULL"
            ).fetchone()[0]

            print(f"Total cache entries: {total}")
            if total:
                print(f"Compressed: {compressed} ({compressed/total*100:.1f}%)")
                print(f"Uncompressed: {uncompressed} ({uncompressed/total*100:.1f}%)")

                if uncompressed > 0:
                    print(f"\n⚠️  Warning: {uncompressed} entries are still uncompressed")
                    print("   Run: python migrate_compress_cache.py")
                else:
                    print("\n✓ All cache entries are compressed!")
            else:
                # An empty table would make the percentages divide by zero;
                # report it and still fall through to the test-URL check.
                print("\n⚠️  Warning: cache is empty")

            print(f"\n{'='*70}")
            print("TEST URL CACHE STATUS:")
            print('='*70)

            all_test_urls = TEST_AUCTIONS + TEST_LOTS
            cached_count = 0

            for url in all_test_urls:
                # Exact-match lookup: the URL string is the cache key.
                row = conn.execute(
                    "SELECT url FROM cache WHERE url = ?", (url,)
                ).fetchone()
                if row:
                    print(f"✓ {url[:60]}...")
                    cached_count += 1
                else:
                    print(f"✗ {url[:60]}... (NOT CACHED)")

            print(f"\n{cached_count}/{len(all_test_urls)} test URLs are cached")

            if cached_count < len(all_test_urls):
                print("\n⚠️  Some test URLs are not cached. Tests for those URLs will fail.")
                print("   Run the main scraper to cache these URLs first.")

    except Exception as e:
        # Best-effort diagnostic: report and carry on.
        print(f"Error checking cache status: {e}")

if __name__ == "__main__":
    # Report on the cache first so missing test URLs are visible up front.
    check_cache_status()

    # Run tests
    tester = ScraperTester()
    tester.run_all_tests()

    # Derive the exit status directly from the recorded results, so it
    # reflects the pass/fail state of every test that ran: 0 when none
    # failed, 1 otherwise.
    success = all(result.success for result in tester.results)
    sys.exit(0 if success else 1)