enrich data
test/test_cache_behavior.py (new file, +303 lines)
@@ -0,0 +1,303 @@
#!/usr/bin/env python3
"""
Test cache behavior - verify a page is only fetched once and that data persists offline
"""

import sys
import os
import asyncio
import sqlite3
import time
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from cache import CacheManager
from scraper import TroostwijkScraper
import config


class TestCacheBehavior:
    """Test suite for cache and offline functionality"""

    def __init__(self):
        self.test_db = "test_cache.db"
        self.original_db = config.CACHE_DB
        self.cache = None
        self.scraper = None

    def setup(self):
        """Set up the test environment"""
        print("\n" + "="*60)
        print("TEST SETUP")
        print("="*60)

        # Use test database
        config.CACHE_DB = self.test_db

        # Ensure offline mode is disabled for tests
        config.OFFLINE = False

        # Clean up old test database
        if os.path.exists(self.test_db):
            os.remove(self.test_db)
            print(" * Removed old test database")

        # Initialize cache and scraper
        self.cache = CacheManager()
        self.scraper = TroostwijkScraper()
        self.scraper.offline = False  # Explicitly disable offline mode

        print(f" * Created test database: {self.test_db}")
        print(" * Initialized cache and scraper")
        print(" * Offline mode: DISABLED")

    def teardown(self):
        """Clean up the test environment"""
        print("\n" + "="*60)
        print("TEST TEARDOWN")
        print("="*60)

        # Restore original database path
        config.CACHE_DB = self.original_db

        # Keep test database for inspection
        print(f" * Test database preserved: {self.test_db}")
        print(" * Restored original database path")

    async def test_page_fetched_once(self):
        """Test that a page is only fetched from the network once"""
        print("\n" + "="*60)
        print("TEST 1: Page Fetched Only Once")
        print("="*60)

        # Pick a real lot URL to test with
        test_url = "https://www.troostwijkauctions.com/l/bmw-x5-xdrive40d-high-executive-m-sport-a8-286pk-2019-A1-26955-7"

        print(f"\nTest URL: {test_url}")

        # First visit - should fetch from network
        print("\n--- FIRST VISIT (should fetch from network) ---")
        start_time = time.time()

        async with asyncio.timeout(60):  # 60-second timeout (asyncio.timeout requires Python 3.11+)
            page_data_1 = await self._scrape_single_page(test_url)

        first_visit_time = time.time() - start_time

        if not page_data_1:
            print(" [FAIL] First visit returned no data")
            return False

        print(f" [OK] First visit completed in {first_visit_time:.2f}s")
        print(f" [OK] Got lot data: {page_data_1.get('title', 'N/A')[:60]}...")

        # Check closing time was captured
        closing_time_1 = page_data_1.get('closing_time')
        print(f" [OK] Closing time: {closing_time_1}")

        # Second visit - should use cache
        print("\n--- SECOND VISIT (should use cache) ---")
        start_time = time.time()

        async with asyncio.timeout(30):  # Should be much faster
            page_data_2 = await self._scrape_single_page(test_url)

        second_visit_time = time.time() - start_time

        if not page_data_2:
            print(" [FAIL] Second visit returned no data")
            return False

        print(f" [OK] Second visit completed in {second_visit_time:.2f}s")

        # Verify data matches
        if page_data_1.get('lot_id') != page_data_2.get('lot_id'):
            print(" [FAIL] Lot IDs don't match")
            return False

        closing_time_2 = page_data_2.get('closing_time')
        print(f" [OK] Closing time: {closing_time_2}")

        if closing_time_1 != closing_time_2:
            print(" [FAIL] Closing times don't match!")
            print(f"        First: {closing_time_1}")
            print(f"        Second: {closing_time_2}")
            return False

        # Verify second visit was significantly faster (used cache)
        if second_visit_time >= first_visit_time * 0.5:
            print(" [WARN] Second visit not significantly faster")
            print(f"        First: {first_visit_time:.2f}s")
            print(f"        Second: {second_visit_time:.2f}s")
        else:
            print(f" [OK] Second visit was {(first_visit_time / second_visit_time):.1f}x faster (cache working!)")

        # Verify the resource cache has entries
        conn = sqlite3.connect(self.test_db)
        cursor = conn.execute("SELECT COUNT(*) FROM resource_cache")
        resource_count = cursor.fetchone()[0]
        conn.close()

        print(f" [OK] Cached {resource_count} resources")

        print("\n[PASS] TEST 1 PASSED: Page fetched only once, data persists")
        return True

    async def test_offline_mode(self):
        """Test that offline mode works with cached data"""
        print("\n" + "="*60)
        print("TEST 2: Offline Mode with Cached Data")
        print("="*60)

        # Use the same URL from test 1 (should be cached)
        test_url = "https://www.troostwijkauctions.com/l/bmw-x5-xdrive40d-high-executive-m-sport-a8-286pk-2019-A1-26955-7"

        # Enable offline mode
        original_offline = config.OFFLINE
        config.OFFLINE = True
        self.scraper.offline = True

        print(f"\nTest URL: {test_url}")
        print(" * Offline mode: ENABLED")

        try:
            # Try to scrape in offline mode
            print("\n--- OFFLINE SCRAPE (should use DB/cache only) ---")
            start_time = time.time()

            async with asyncio.timeout(30):
                page_data = await self._scrape_single_page(test_url)

            offline_time = time.time() - start_time

            if not page_data:
                print(" [FAIL] Offline mode returned no data")
                return False

            print(f" [OK] Offline scrape completed in {offline_time:.2f}s")
            print(f" [OK] Got lot data: {page_data.get('title', 'N/A')[:60]}...")

            # Check closing time is available
            closing_time = page_data.get('closing_time')
            if not closing_time:
                print(" [FAIL] No closing time in offline mode")
                return False

            print(f" [OK] Closing time preserved: {closing_time}")

            # Verify essential fields are present
            essential_fields = ['lot_id', 'title', 'url', 'location']
            missing_fields = [f for f in essential_fields if not page_data.get(f)]

            if missing_fields:
                print(f" [FAIL] Missing essential fields: {missing_fields}")
                return False

            print(" [OK] All essential fields present")

            # Check the database has the lot
            conn = sqlite3.connect(self.test_db)
            cursor = conn.execute("SELECT closing_time FROM lots WHERE url = ?", (test_url,))
            row = cursor.fetchone()
            conn.close()

            if not row:
                print(" [FAIL] Lot not found in database")
                return False

            db_closing_time = row[0]
            print(f" [OK] Database has closing time: {db_closing_time}")

            if db_closing_time != closing_time:
                print(" [FAIL] Closing time mismatch")
                print(f"        Scraped: {closing_time}")
                print(f"        Database: {db_closing_time}")
                return False

            print("\n[PASS] TEST 2 PASSED: Offline mode works, closing time preserved")
            return True

        finally:
            # Restore offline mode
            config.OFFLINE = original_offline
            self.scraper.offline = original_offline

    async def _scrape_single_page(self, url):
        """Helper to scrape a single page"""
        from playwright.async_api import async_playwright

        if config.OFFLINE or self.scraper.offline:
            # Offline mode - use crawl_page directly
            return await self.scraper.crawl_page(page=None, url=url)

        # Online mode - need browser
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            try:
                result = await self.scraper.crawl_page(page, url)
                return result
            finally:
                await browser.close()

    async def run_all_tests(self):
        """Run all tests"""
        print("\n" + "="*70)
        print("CACHE BEHAVIOR TEST SUITE")
        print("="*70)

        self.setup()

        results = []

        try:
            # Test 1: Page fetched once
            result1 = await self.test_page_fetched_once()
            results.append(("Page Fetched Once", result1))

            # Test 2: Offline mode
            result2 = await self.test_offline_mode()
            results.append(("Offline Mode", result2))

        except Exception as e:
            print(f"\n[ERROR] TEST SUITE ERROR: {e}")
            import traceback
            traceback.print_exc()

        finally:
            self.teardown()

        # Print summary
        print("\n" + "="*70)
        print("TEST SUMMARY")
        print("="*70)

        all_passed = True
        for test_name, passed in results:
            status = "[PASS]" if passed else "[FAIL]"
            print(f" {status}: {test_name}")
            if not passed:
                all_passed = False

        print("="*70)

        if all_passed:
            print("\n*** ALL TESTS PASSED! ***")
            return 0
        else:
            print("\n*** SOME TESTS FAILED ***")
            return 1


async def main():
    """Run tests"""
    tester = TestCacheBehavior()
    exit_code = await tester.run_all_tests()
    sys.exit(exit_code)


if __name__ == "__main__":
    asyncio.run(main())