#!/usr/bin/env python3
"""
Test cache behavior - verify a page is only fetched once and data persists offline
"""
import sys
import os
import asyncio
import sqlite3
import time
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from cache import CacheManager
from scraper import TroostwijkScraper
import config


class TestCacheBehavior:
    """Test suite for cache and offline functionality"""

    def __init__(self):
        self.test_db = "test_cache.db"
        self.original_db = config.CACHE_DB
        self.cache = None
        self.scraper = None

    def setup(self):
        """Set up the test environment"""
        print("\n" + "=" * 60)
        print("TEST SETUP")
        print("=" * 60)

        # Use test database
        config.CACHE_DB = self.test_db

        # Ensure offline mode is disabled for tests
        config.OFFLINE = False

        # Clean up old test database
        if os.path.exists(self.test_db):
            os.remove(self.test_db)
            print(" * Removed old test database")

        # Initialize cache and scraper
        self.cache = CacheManager()
        self.scraper = TroostwijkScraper()
        self.scraper.offline = False  # Explicitly disable offline mode

        print(f" * Created test database: {self.test_db}")
        print(" * Initialized cache and scraper")
        print(" * Offline mode: DISABLED")

    def teardown(self):
        """Clean up the test environment"""
        print("\n" + "=" * 60)
        print("TEST TEARDOWN")
        print("=" * 60)

        # Restore original database path
        config.CACHE_DB = self.original_db

        # Keep test database for inspection
        print(f" * Test database preserved: {self.test_db}")
        print(" * Restored original database path")

    async def test_page_fetched_once(self):
        """Test that a page is only fetched from the network once"""
        print("\n" + "=" * 60)
        print("TEST 1: Page Fetched Only Once")
        print("=" * 60)

        # Pick a real lot URL to test with
        test_url = "https://www.troostwijkauctions.com/l/bmw-x5-xdrive40d-high-executive-m-sport-a8-286pk-2019-A1-26955-7"

        print(f"\nTest URL: {test_url}")

        # First visit - should fetch from network
        print("\n--- FIRST VISIT (should fetch from network) ---")
        start_time = time.time()
        async with asyncio.timeout(60):  # 60 second timeout (asyncio.timeout requires Python 3.11+)
            page_data_1 = await self._scrape_single_page(test_url)
        first_visit_time = time.time() - start_time

        if not page_data_1:
            print(" [FAIL] First visit returned no data")
            return False

        print(f" [OK] First visit completed in {first_visit_time:.2f}s")
        print(f" [OK] Got lot data: {page_data_1.get('title', 'N/A')[:60]}...")

        # Check closing time was captured
        closing_time_1 = page_data_1.get('closing_time')
        print(f" [OK] Closing time: {closing_time_1}")

        # Second visit - should use cache
        print("\n--- SECOND VISIT (should use cache) ---")
        start_time = time.time()
        async with asyncio.timeout(30):  # Should be much faster
            page_data_2 = await self._scrape_single_page(test_url)
        second_visit_time = time.time() - start_time

        if not page_data_2:
            print(" [FAIL] Second visit returned no data")
            return False

        print(f" [OK] Second visit completed in {second_visit_time:.2f}s")

        # Verify data matches
        if page_data_1.get('lot_id') != page_data_2.get('lot_id'):
            print(" [FAIL] Lot IDs don't match")
            return False

        closing_time_2 = page_data_2.get('closing_time')
        print(f" [OK] Closing time: {closing_time_2}")

        if closing_time_1 != closing_time_2:
            print(" [FAIL] Closing times don't match!")
            print(f"   First:  {closing_time_1}")
            print(f"   Second: {closing_time_2}")
            return False

        # Verify second visit was significantly faster (used cache)
        if second_visit_time >= first_visit_time * 0.5:
            print(" [WARN] Second visit not significantly faster")
            print(f"   First:  {first_visit_time:.2f}s")
            print(f"   Second: {second_visit_time:.2f}s")
        else:
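
    # The timing comparison above is an indirect signal that the cache was
    # hit. A stricter variant would count resource_cache rows before and
    # after the second visit and assert no growth - a minimal sketch,
    # assuming each network fetch adds a row to resource_cache (CacheManager
    # may not guarantee this):
    def _resource_cache_count(self) -> int:
        """Return the current number of rows in resource_cache."""
        conn = sqlite3.connect(self.test_db)
        try:
            return conn.execute("SELECT COUNT(*) FROM resource_cache").fetchone()[0]
        finally:
            conn.close()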
print(f" [OK] Second visit was {(first_visit_time / second_visit_time):.1f}x faster (cache working!)") # Verify resource cache has entries conn = sqlite3.connect(self.test_db) cursor = conn.execute("SELECT COUNT(*) FROM resource_cache") resource_count = cursor.fetchone()[0] conn.close() print(f" [OK] Cached {resource_count} resources") print("\n[PASS] TEST 1 PASSED: Page fetched only once, data persists") return True async def test_offline_mode(self): """Test that offline mode works with cached data""" print("\n" + "="*60) print("TEST 2: Offline Mode with Cached Data") print("="*60) # Use the same URL from test 1 (should be cached) test_url = "https://www.troostwijkauctions.com/l/bmw-x5-xdrive40d-high-executive-m-sport-a8-286pk-2019-A1-26955-7" # Enable offline mode original_offline = config.OFFLINE config.OFFLINE = True self.scraper.offline = True print(f"\nTest URL: {test_url}") print(" * Offline mode: ENABLED") try: # Try to scrape in offline mode print("\n--- OFFLINE SCRAPE (should use DB/cache only) ---") start_time = time.time() async with asyncio.timeout(30): page_data = await self._scrape_single_page(test_url) offline_time = time.time() - start_time if not page_data: print(" [FAIL] Offline mode returned no data") return False print(f" [OK] Offline scrape completed in {offline_time:.2f}s") print(f" [OK] Got lot data: {page_data.get('title', 'N/A')[:60]}...") # Check closing time is available closing_time = page_data.get('closing_time') if not closing_time: print(f" [FAIL] No closing time in offline mode") return False print(f" [OK] Closing time preserved: {closing_time}") # Verify essential fields are present essential_fields = ['lot_id', 'title', 'url', 'location'] missing_fields = [f for f in essential_fields if not page_data.get(f)] if missing_fields: print(f" [FAIL] Missing essential fields: {missing_fields}") return False print(f" [OK] All essential fields present") # Check database has the lot conn = sqlite3.connect(self.test_db) cursor = conn.execute("SELECT closing_time FROM lots WHERE url = ?", (test_url,)) row = cursor.fetchone() conn.close() if not row: print(f" [FAIL] Lot not found in database") return False db_closing_time = row[0] print(f" [OK] Database has closing time: {db_closing_time}") if db_closing_time != closing_time: print(f" [FAIL] Closing time mismatch") print(f" Scraped: {closing_time}") print(f" Database: {db_closing_time}") return False print("\n[PASS] TEST 2 PASSED: Offline mode works, closing time preserved") return True finally: # Restore offline mode config.OFFLINE = original_offline self.scraper.offline = original_offline async def _scrape_single_page(self, url): """Helper to scrape a single page""" from playwright.async_api import async_playwright if config.OFFLINE or self.scraper.offline: # Offline mode - use crawl_page directly return await self.scraper.crawl_page(page=None, url=url) # Online mode - need browser async with async_playwright() as p: browser = await p.chromium.launch(headless=True) page = await browser.new_page() try: result = await self.scraper.crawl_page(page, url) return result finally: await browser.close() async def run_all_tests(self): """Run all tests""" print("\n" + "="*70) print("CACHE BEHAVIOR TEST SUITE") print("="*70) self.setup() results = [] try: # Test 1: Page fetched once result1 = await self.test_page_fetched_once() results.append(("Page Fetched Once", result1)) # Test 2: Offline mode result2 = await self.test_offline_mode() results.append(("Offline Mode", result2)) except Exception as e: print(f"\n[ERROR] TEST SUITE 
ERROR: {e}") import traceback traceback.print_exc() finally: self.teardown() # Print summary print("\n" + "="*70) print("TEST SUMMARY") print("="*70) all_passed = True for test_name, passed in results: status = "[PASS]" if passed else "[FAIL]" print(f" {status}: {test_name}") if not passed: all_passed = False print("="*70) if all_passed: print("\n*** ALL TESTS PASSED! ***") return 0 else: print("\n*** SOME TESTS FAILED ***") return 1 async def main(): """Run tests""" tester = TestCacheBehavior() exit_code = await tester.run_all_tests() sys.exit(exit_code) if __name__ == "__main__": asyncio.run(main())