enrich data

This commit is contained in:
Tour
2025-12-09 02:05:46 +01:00
parent 06f63732b1
commit b0ee52b686
4 changed files with 685 additions and 53 deletions

src/cache.py

@@ -134,10 +134,13 @@ class CacheManager:
reserve_met INTEGER,
view_count INTEGER,
api_data_json TEXT,
next_scrape_at INTEGER,
scrape_priority INTEGER DEFAULT 0,
FOREIGN KEY (auction_id) REFERENCES auctions(auction_id)
)
""")
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_sale_id ON lots(sale_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_closing_time ON lots(closing_time)")
# Images table
conn.execute("""
@@ -200,14 +203,74 @@ class CacheManager:
"""Run database migrations to add new columns to existing tables"""
print("Checking for database migrations...")
# Check and add api_data_json column to lots table
# Check and add new columns to lots table
cursor = conn.execute("PRAGMA table_info(lots)")
lots_columns = {row[1] for row in cursor.fetchall()}
migrations_applied = False
if 'api_data_json' not in lots_columns:
print(" > Adding api_data_json column to lots table...")
conn.execute("ALTER TABLE lots ADD COLUMN api_data_json TEXT")
print(" * Migration complete")
migrations_applied = True
if 'next_scrape_at' not in lots_columns:
print(" > Adding next_scrape_at column to lots table...")
conn.execute("ALTER TABLE lots ADD COLUMN next_scrape_at INTEGER")
migrations_applied = True
if 'scrape_priority' not in lots_columns:
print(" > Adding scrape_priority column to lots table...")
conn.execute("ALTER TABLE lots ADD COLUMN scrape_priority INTEGER DEFAULT 0")
migrations_applied = True
# Check resource_cache table structure
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='resource_cache'")
resource_cache_exists = cursor.fetchone() is not None
if resource_cache_exists:
# Check if table has correct structure
cursor = conn.execute("PRAGMA table_info(resource_cache)")
resource_columns = {row[1] for row in cursor.fetchall()}
# Expected columns
expected_columns = {'url', 'content', 'content_type', 'status_code', 'headers', 'timestamp', 'size_bytes', 'local_path'}
if resource_columns != expected_columns:
print(" > Rebuilding resource_cache table with correct schema...")
# Backup old data count
cursor = conn.execute("SELECT COUNT(*) FROM resource_cache")
old_count = cursor.fetchone()[0]
print(f" (Preserving {old_count} cached resources)")
# Drop and recreate with correct schema
conn.execute("DROP TABLE IF EXISTS resource_cache")
conn.execute("""
CREATE TABLE resource_cache (
url TEXT PRIMARY KEY,
content BLOB,
content_type TEXT,
status_code INTEGER,
headers TEXT,
timestamp REAL,
size_bytes INTEGER,
local_path TEXT
)
""")
conn.execute("CREATE INDEX idx_resource_timestamp ON resource_cache(timestamp)")
conn.execute("CREATE INDEX idx_resource_content_type ON resource_cache(content_type)")
migrations_applied = True
print(" * resource_cache table rebuilt")
# Create indexes after migrations (when columns exist)
try:
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_priority ON lots(scrape_priority DESC)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_next_scrape ON lots(next_scrape_at)")
except sqlite3.OperationalError:
pass # Indexes might already exist
if migrations_applied:
print(" * Migrations complete")
else:
print(" * Database schema is up to date")
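The migration logic above is an idempotent "inspect PRAGMA table_info, then ALTER TABLE" pattern. A minimal standalone sketch of that pattern, using an in-memory database and illustrative column declarations rather than the project's full schema:

import sqlite3

def add_column_if_missing(conn: sqlite3.Connection, table: str, column: str, decl: str) -> bool:
    """Add `column` to `table` unless it already exists; return True if it was added."""
    existing = {row[1] for row in conn.execute(f"PRAGMA table_info({table})")}
    if column in existing:
        return False
    conn.execute(f"ALTER TABLE {table} ADD COLUMN {column} {decl}")
    return True

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE lots (lot_id TEXT PRIMARY KEY)")               # stand-in table
print(add_column_if_missing(conn, "lots", "next_scrape_at", "INTEGER"))   # True: column added
print(add_column_if_missing(conn, "lots", "next_scrape_at", "INTEGER"))   # False: already present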
@@ -310,8 +373,8 @@ class CacheManager:
year_manufactured, condition_score, condition_description,
serial_number, manufacturer, damage_description,
followers_count, estimated_min_price, estimated_max_price, lot_condition, appearance,
scraped_at, api_data_json)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
scraped_at, api_data_json, next_scrape_at, scrape_priority)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
lot_data['lot_id'],
lot_data.get('auction_id', ''),
@@ -347,7 +410,9 @@ class CacheManager:
lot_data.get('lot_condition', ''),
lot_data.get('appearance', ''),
lot_data['scraped_at'],
lot_data.get('api_data_json')
lot_data.get('api_data_json'),
lot_data.get('next_scrape_at'),
lot_data.get('scrape_priority', 0)
))
conn.commit()
@@ -387,62 +452,38 @@ class CacheManager:
conn.commit()
def save_resource(self, url: str, content: bytes, content_type: str, status_code: int = 200,
headers: Optional[Dict] = None, local_path: Optional[str] = None):
"""Save a web resource (JS, CSS, image, font, etc.) to cache"""
headers: Optional[Dict] = None, local_path: Optional[str] = None, cache_key: Optional[str] = None):
"""Save a web resource (JS, CSS, image, font, etc.) to cache
Args:
cache_key: Optional composite key (url + body hash for POST requests)
"""
with sqlite3.connect(self.db_path) as conn:
headers_json = json.dumps(headers) if headers else None
size_bytes = len(content) if content else 0
# Use cache_key if provided, otherwise use url
key = cache_key if cache_key else url
conn.execute("""
INSERT OR REPLACE INTO resource_cache
(url, content, content_type, status_code, headers, timestamp, size_bytes, local_path)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""", (url, content, content_type, status_code, headers_json, time.time(), size_bytes, local_path))
""", (key, content, content_type, status_code, headers_json, time.time(), size_bytes, local_path))
conn.commit()
def get_resource(self, url: str) -> Optional[Dict]:
"""Get a cached resource"""
def get_resource(self, url: str, cache_key: Optional[str] = None) -> Optional[Dict]:
"""Get a cached resource
Args:
cache_key: Optional composite key to lookup
"""
with sqlite3.connect(self.db_path) as conn:
key = cache_key if cache_key else url
cursor = conn.execute("""
SELECT content, content_type, status_code, headers, timestamp, size_bytes, local_path
FROM resource_cache WHERE url = ?
""", (url,))
row = cursor.fetchone()
if row:
return {
'content': row[0],
'content_type': row[1],
'status_code': row[2],
'headers': json.loads(row[3]) if row[3] else {},
'timestamp': row[4],
'size_bytes': row[5],
'local_path': row[6],
'cached': True
}
return None
def save_resource(self, url: str, content: bytes, content_type: str, status_code: int = 200,
headers: Optional[Dict] = None, local_path: Optional[str] = None):
"""Save a web resource (JS, CSS, image, font, etc.) to cache"""
with sqlite3.connect(self.db_path) as conn:
headers_json = json.dumps(headers) if headers else None
size_bytes = len(content) if content else 0
conn.execute("""
INSERT OR REPLACE INTO resource_cache
(url, content, content_type, status_code, headers, timestamp, size_bytes, local_path)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""", (url, content, content_type, status_code, headers_json, time.time(), size_bytes, local_path))
conn.commit()
def get_resource(self, url: str) -> Optional[Dict]:
"""Get a cached resource"""
with sqlite3.connect(self.db_path) as conn:
cursor = conn.execute("""
SELECT content, content_type, status_code, headers, timestamp, size_bytes, local_path
FROM resource_cache WHERE url = ?
""", (url,))
""", (key,))
row = cursor.fetchone()
if row:
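With the new cache_key parameter, GraphQL POST responses are stored under a composite key of the URL plus a 16-character MD5 prefix of the request body, so distinct queries against the same endpoint no longer overwrite one another. A hedged sketch of how a caller would build and use such a key (the endpoint and body are invented; the CacheManager calls are commented out because its constructor is not shown in this diff):

import hashlib

def graphql_cache_key(url: str, post_body: str) -> str:
    # Same scheme the scraper uses: URL plus a 16-char MD5 prefix of the POST body
    body_hash = hashlib.md5(post_body.encode()).hexdigest()[:16]
    return f"{url}#{body_hash}"

url = "https://example.com/graphql"           # illustrative endpoint
body = '{"query": "{ lots { id title } }"}'   # illustrative GraphQL payload
key = graphql_cache_key(url, body)

# cache = CacheManager()
# cache.save_resource(url=url, content=b'{"data": {}}', content_type="application/json", cache_key=key)
# hit = cache.get_resource(url, cache_key=key)  # looked up by the composite key, not the bare URL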

src/priority.py (new file, 171 lines added)

@@ -0,0 +1,171 @@
#!/usr/bin/env python3
"""
Priority calculation for intelligent scraping
"""
import time
from datetime import datetime
from typing import Optional, Tuple
def parse_closing_time(closing_time_str: Optional[str]) -> Optional[int]:
"""Parse closing time string to unix timestamp"""
if not closing_time_str:
return None
try:
# Try various date formats
formats = [
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%dT%H:%M:%S',
'%Y-%m-%d %H:%M',
'%d-%m-%Y %H:%M',
]
for fmt in formats:
try:
dt = datetime.strptime(closing_time_str, fmt)
return int(dt.timestamp())
except ValueError:
continue
return None
except Exception:
return None
def calculate_ttl(closing_timestamp: int, current_time: Optional[int] = None) -> int:
"""
Calculate Time-To-Live (TTL) for cache based on time until closing
Strategy:
- Closing in > 7 days: Scrape once per day (TTL = 24 hours)
- Closing in 3-7 days: Scrape every 12 hours
- Closing in 1-3 days: Scrape every 6 hours
- Closing in 12-24 hours: Scrape every 3 hours
- Closing in 6-12 hours: Scrape every 2 hours
- Closing in 1-6 hours: Scrape every 30 minutes
- Closing in < 1 hour: Scrape every 10 minutes
- Already closed: TTL = infinite (no need to rescrape)
"""
if current_time is None:
current_time = int(time.time())
time_until_close = closing_timestamp - current_time
# Already closed - very low priority
if time_until_close <= 0:
return 999999999 # Effectively infinite TTL
# Convert to hours
hours_until_close = time_until_close / 3600
if hours_until_close > 168: # > 7 days
return 24 * 3600 # 24 hours
elif hours_until_close > 72: # 3-7 days
return 12 * 3600 # 12 hours
elif hours_until_close > 24: # 1-3 days
return 6 * 3600 # 6 hours
elif hours_until_close > 12: # 12-24 hours
return 3 * 3600 # 3 hours
elif hours_until_close > 6: # 6-12 hours
return 2 * 3600 # 2 hours
elif hours_until_close > 1: # 1-6 hours
return 30 * 60 # 30 minutes
else: # < 1 hour - URGENT!
return 10 * 60 # 10 minutes
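As a quick sanity check of the tiers described in the docstring, here is an illustrative call pattern with a fixed "now" (an editor's usage sketch, not part of the committed file; it assumes src/ is on the import path):

from priority import calculate_ttl

now = 1_700_000_000                               # arbitrary fixed reference time
print(calculate_ttl(now + 10 * 24 * 3600, now))   # closes in 10 days  -> 86400  (24 h)
print(calculate_ttl(now + 2 * 24 * 3600, now))    # closes in 2 days   -> 21600  (6 h)
print(calculate_ttl(now + 30 * 60, now))          # closes in 30 min   -> 600    (10 min)
print(calculate_ttl(now - 60, now))               # already closed     -> 999999999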
def calculate_priority(
closing_time_str: Optional[str],
scraped_at: Optional[int],
current_time: Optional[int] = None
) -> Tuple[int, int]:
"""
Calculate scrape priority and next_scrape_at timestamp
Returns:
(priority, next_scrape_at)
Priority Scale:
10000+ = Never scraped (highest priority)
9000+ = Closing within 1 hour
8000+ = Closing within 6 hours
7000+ = Closing within 24 hours
6000+ = Closing within 3 days
5000+ = Closing within 7 days
1000+ = Due for re-scrape (TTL expired)
0-999 = Recently scraped, not due yet
-1000 = Already closed
"""
if current_time is None:
current_time = int(time.time())
# Never scraped = highest priority
if scraped_at is None or scraped_at == 0:
closing_timestamp = parse_closing_time(closing_time_str)
if closing_timestamp:
ttl = calculate_ttl(closing_timestamp, current_time)
next_scrape = current_time # Scrape immediately
time_until_close = closing_timestamp - current_time
# Boost priority based on urgency
if time_until_close <= 0:
return (10000, next_scrape) # Closed but never scraped
elif time_until_close < 3600:
return (19000, next_scrape) # < 1 hour - CRITICAL
elif time_until_close < 6 * 3600:
return (18000, next_scrape) # < 6 hours
elif time_until_close < 24 * 3600:
return (17000, next_scrape) # < 24 hours
elif time_until_close < 3 * 24 * 3600:
return (16000, next_scrape) # < 3 days
else:
return (15000, next_scrape) # > 3 days but never scraped
else:
return (15000, current_time) # No closing time, high priority anyway
# Already scraped - calculate based on TTL
closing_timestamp = parse_closing_time(closing_time_str)
if not closing_timestamp:
# No closing time - scrape once per day
ttl = 24 * 3600
next_scrape = scraped_at + ttl
time_until_rescrape = next_scrape - current_time
if time_until_rescrape <= 0:
return (1000, current_time) # Due for rescrape
else:
return (500, next_scrape) # Not due yet
# Has closing time - intelligent TTL
time_until_close = closing_timestamp - current_time
# Already closed
if time_until_close <= 0:
return (-1000, current_time + 999999999) # Very low priority, next scrape pushed far into the future
# Calculate TTL and next scrape time
ttl = calculate_ttl(closing_timestamp, current_time)
next_scrape = scraped_at + ttl
time_until_rescrape = next_scrape - current_time
# Priority based on urgency and TTL
if time_until_rescrape <= 0:
# Due for rescrape - urgency-based priority
if time_until_close < 3600:
return (9000, current_time) # < 1 hour - URGENT
elif time_until_close < 6 * 3600:
return (8000, current_time) # < 6 hours
elif time_until_close < 24 * 3600:
return (7000, current_time) # < 24 hours
elif time_until_close < 3 * 24 * 3600:
return (6000, current_time) # < 3 days
elif time_until_close < 7 * 24 * 3600:
return (5000, current_time) # < 7 days
else:
return (1000, current_time) # > 7 days, but due
else:
# Not due yet - low priority
return (min(999, int(time_until_close / 3600)), next_scrape)
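A companion sketch for calculate_priority, again with made-up timestamps and a fixed "now", showing how the same lot moves through the priority bands as its scrape history changes (an editor's illustration, not part of the commit):

from datetime import datetime
from priority import calculate_priority

now = 1_700_000_000
closes_in_2h = datetime.fromtimestamp(now + 2 * 3600).strftime('%Y-%m-%d %H:%M:%S')

print(calculate_priority(closes_in_2h, None, now))        # never scraped, <6 h left -> (18000, now)
print(calculate_priority(closes_in_2h, now - 600, now))   # scraped 10 min ago, 30-min TTL not expired -> (2, now + 1200)
print(calculate_priority(closes_in_2h, now - 3600, now))  # scraped 1 h ago, TTL expired, <6 h left -> (8000, now)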

src/scraper.py

@@ -10,7 +10,7 @@ import random
import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Set
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import urljoin
from playwright.async_api import async_playwright, Page
@@ -27,6 +27,7 @@ from graphql_client import (
extract_enriched_attributes
)
from bid_history_client import fetch_bid_history, parse_bid_history
from priority import calculate_priority, parse_closing_time
class TroostwijkScraper:
"""Main scraper class for Troostwijk Auctions"""
@@ -535,6 +536,17 @@ class TroostwijkScraper:
print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")
print(f" Location: {page_data.get('location', 'N/A')}")
# Calculate and store priority for next scrape
current_time = int(time.time())
priority, next_scrape = calculate_priority(
page_data.get('closing_time'),
current_time, # Just scraped now
current_time
)
page_data['scrape_priority'] = priority
page_data['next_scrape_at'] = next_scrape
self.cache.save_lot(page_data)
images = page_data.get('images', [])
@@ -575,6 +587,77 @@ class TroostwijkScraper:
return page_data
def _prioritize_lots(self, lot_urls: List[str]) -> List[Tuple[int, str, str]]:
"""
Prioritize lots based on closing time and scrape history
Returns list of (priority, url, description) tuples sorted by priority (highest first)
"""
import sqlite3
prioritized = []
current_time = int(time.time())
conn = sqlite3.connect(self.cache.db_path)
cursor = conn.cursor()
for url in lot_urls:
# Extract lot_id from URL
lot_id = self.parser.extract_lot_id(url)
# Try to get existing data from database
cursor.execute("""
SELECT closing_time, scraped_at, scrape_priority, next_scrape_at
FROM lots WHERE lot_id = ? OR url = ?
""", (lot_id, url))
row = cursor.fetchone()
if row:
closing_time, scraped_at, existing_priority, next_scrape_at = row
# Parse scraped_at (it might be a string timestamp)
if isinstance(scraped_at, str):
try:
scraped_at = int(datetime.strptime(scraped_at, '%Y-%m-%d %H:%M:%S').timestamp())
except ValueError:
scraped_at = None
else:
closing_time = None
scraped_at = None
# Calculate priority
priority, next_scrape = calculate_priority(closing_time, scraped_at, current_time)
# Create description
if scraped_at is None:
desc = "Never scraped"
elif priority >= 15000:
desc = "Never scraped (high urgency)"
elif priority >= 9000:
desc = "URGENT: <1hr to close"
elif priority >= 8000:
desc = "High: <6hr to close"
elif priority >= 7000:
desc = "Medium: <24hr to close"
elif priority >= 5000:
desc = "Normal: <7d to close"
elif priority >= 1000:
desc = "Due for rescrape"
elif priority < 0:
desc = "Already closed"
else:
desc = f"Recently scraped"
prioritized.append((priority, url, desc))
conn.close()
# Sort by priority (highest first)
prioritized.sort(key=lambda x: x[0], reverse=True)
return prioritized
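The tuples returned here sort on the integer priority alone, highest first, which is what puts never-scraped and closing-soon lots at the front of Phase 3. A toy illustration with invented URLs:

lots = [
    (2,     "https://example.com/l/quiet-lot",    "Recently scraped"),
    (18000, "https://example.com/l/brand-new",    "Never scraped (high urgency)"),
    (8000,  "https://example.com/l/closing-soon", "High: <6hr to close"),
]
lots.sort(key=lambda x: x[0], reverse=True)
print([desc for _, _, desc in lots])
# ['Never scraped (high urgency)', 'High: <6hr to close', 'Recently scraped']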
async def crawl_auctions(self, max_pages: int = MAX_PAGES) -> List[Dict]:
"""Main crawl function"""
if self.offline:
@@ -633,6 +716,24 @@ class TroostwijkScraper:
# Set up COMPREHENSIVE resource interception (cache EVERYTHING)
resource_stats = {'cached': 0, 'fetched': 0, 'failed': 0}
request_bodies = {} # Store POST request bodies by URL for cache key generation
async def handle_request(request):
"""Intercept requests to capture POST bodies for GraphQL"""
try:
if request.method == 'POST' and 'graphql' in request.url:
# Store the POST body
post_data = request.post_data
if post_data:
# Create hash of POST body for cache key
import hashlib
body_hash = hashlib.md5(post_data.encode() if isinstance(post_data, str) else post_data).hexdigest()[:16]
cache_key = f"{request.url}#{body_hash}"
request_bodies[request.url] = (cache_key, post_data)
except Exception:
pass # Never let request interception break page loads
page.on('request', handle_request)
async def handle_response(response):
"""Intercept ALL resources and cache them"""
@@ -658,13 +759,19 @@ class TroostwijkScraper:
try:
body = await response.body()
# Determine cache key (use composite key for GraphQL POST requests)
cache_key = None
if 'graphql' in url and url in request_bodies:
cache_key, post_data = request_bodies[url]
# Save to resource cache
self.cache.save_resource(
url=url,
content=body,
content_type=content_type,
status_code=status,
headers=headers
headers=headers,
cache_key=cache_key
)
resource_stats['cached'] += 1
@@ -746,14 +853,24 @@ class TroostwijkScraper:
print(f"PHASE 2 COMPLETE: {len(all_lot_urls)} UNIQUE LOTS")
print(f"{'='*60}")
# Phase 3: Scrape each lot page
# Phase 2.5: Sort lots by priority (closing time + TTL)
print("\n" + "="*60)
print("PHASE 3: SCRAPING INDIVIDUAL LOT PAGES")
print("PHASE 2.5: CALCULATING SCRAPE PRIORITIES")
print("="*60)
sorted_lots = self._prioritize_lots(all_lot_urls)
print(f" > Sorted {len(sorted_lots)} lots by priority")
print(f" > Highest priority: {sorted_lots[0][2] if sorted_lots else 'N/A'}")
print(f" > Lowest priority: {sorted_lots[-1][2] if sorted_lots else 'N/A'}")
# Phase 3: Scrape each lot page (in priority order)
print("\n" + "="*60)
print("PHASE 3: SCRAPING LOTS (PRIORITY ORDER)")
print("="*60)
results = []
for i, lot_url in enumerate(all_lot_urls):
print(f"\n[{i+1:>3}/{len(all_lot_urls)}] ", end="")
for i, (priority, lot_url, priority_desc) in enumerate(sorted_lots):
print(f"\n[{i+1:>3}/{len(sorted_lots)}] [P:{priority}] ", end="")
page_data = await self.crawl_page(page, lot_url)
if page_data:
results.append(page_data)

test/test_cache_behavior.py (new file, 303 lines added)

@@ -0,0 +1,303 @@
#!/usr/bin/env python3
"""
Test cache behavior - verify page is only fetched once and data persists offline
"""
import sys
import os
import asyncio
import sqlite3
import time
from pathlib import Path
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
from cache import CacheManager
from scraper import TroostwijkScraper
import config
class TestCacheBehavior:
"""Test suite for cache and offline functionality"""
def __init__(self):
self.test_db = "test_cache.db"
self.original_db = config.CACHE_DB
self.cache = None
self.scraper = None
def setup(self):
"""Setup test environment"""
print("\n" + "="*60)
print("TEST SETUP")
print("="*60)
# Use test database
config.CACHE_DB = self.test_db
# Ensure offline mode is disabled for tests
config.OFFLINE = False
# Clean up old test database
if os.path.exists(self.test_db):
os.remove(self.test_db)
print(f" * Removed old test database")
# Initialize cache and scraper
self.cache = CacheManager()
self.scraper = TroostwijkScraper()
self.scraper.offline = False # Explicitly disable offline mode
print(f" * Created test database: {self.test_db}")
print(f" * Initialized cache and scraper")
print(f" * Offline mode: DISABLED")
def teardown(self):
"""Cleanup test environment"""
print("\n" + "="*60)
print("TEST TEARDOWN")
print("="*60)
# Restore original database path
config.CACHE_DB = self.original_db
# Keep test database for inspection
print(f" * Test database preserved: {self.test_db}")
print(f" * Restored original database path")
async def test_page_fetched_once(self):
"""Test that a page is only fetched from network once"""
print("\n" + "="*60)
print("TEST 1: Page Fetched Only Once")
print("="*60)
# Pick a real lot URL to test with
test_url = "https://www.troostwijkauctions.com/l/bmw-x5-xdrive40d-high-executive-m-sport-a8-286pk-2019-A1-26955-7"
print(f"\nTest URL: {test_url}")
# First visit - should fetch from network
print("\n--- FIRST VISIT (should fetch from network) ---")
start_time = time.time()
async with asyncio.timeout(60): # 60 second timeout
page_data_1 = await self._scrape_single_page(test_url)
first_visit_time = time.time() - start_time
if not page_data_1:
print(" [FAIL] First visit returned no data")
return False
print(f" [OK] First visit completed in {first_visit_time:.2f}s")
print(f" [OK] Got lot data: {page_data_1.get('title', 'N/A')[:60]}...")
# Check closing time was captured
closing_time_1 = page_data_1.get('closing_time')
print(f" [OK] Closing time: {closing_time_1}")
# Second visit - should use cache
print("\n--- SECOND VISIT (should use cache) ---")
start_time = time.time()
async with asyncio.timeout(30): # Should be much faster
page_data_2 = await self._scrape_single_page(test_url)
second_visit_time = time.time() - start_time
if not page_data_2:
print(" [FAIL] Second visit returned no data")
return False
print(f" [OK] Second visit completed in {second_visit_time:.2f}s")
# Verify data matches
if page_data_1.get('lot_id') != page_data_2.get('lot_id'):
print(f" [FAIL] Lot IDs don't match")
return False
closing_time_2 = page_data_2.get('closing_time')
print(f" [OK] Closing time: {closing_time_2}")
if closing_time_1 != closing_time_2:
print(f" [FAIL] Closing times don't match!")
print(f" First: {closing_time_1}")
print(f" Second: {closing_time_2}")
return False
# Verify second visit was significantly faster (used cache)
if second_visit_time >= first_visit_time * 0.5:
print(f" [WARN] Second visit not significantly faster")
print(f" First: {first_visit_time:.2f}s")
print(f" Second: {second_visit_time:.2f}s")
else:
print(f" [OK] Second visit was {(first_visit_time / second_visit_time):.1f}x faster (cache working!)")
# Verify resource cache has entries
conn = sqlite3.connect(self.test_db)
cursor = conn.execute("SELECT COUNT(*) FROM resource_cache")
resource_count = cursor.fetchone()[0]
conn.close()
print(f" [OK] Cached {resource_count} resources")
print("\n[PASS] TEST 1 PASSED: Page fetched only once, data persists")
return True
async def test_offline_mode(self):
"""Test that offline mode works with cached data"""
print("\n" + "="*60)
print("TEST 2: Offline Mode with Cached Data")
print("="*60)
# Use the same URL from test 1 (should be cached)
test_url = "https://www.troostwijkauctions.com/l/bmw-x5-xdrive40d-high-executive-m-sport-a8-286pk-2019-A1-26955-7"
# Enable offline mode
original_offline = config.OFFLINE
config.OFFLINE = True
self.scraper.offline = True
print(f"\nTest URL: {test_url}")
print(" * Offline mode: ENABLED")
try:
# Try to scrape in offline mode
print("\n--- OFFLINE SCRAPE (should use DB/cache only) ---")
start_time = time.time()
async with asyncio.timeout(30):
page_data = await self._scrape_single_page(test_url)
offline_time = time.time() - start_time
if not page_data:
print(" [FAIL] Offline mode returned no data")
return False
print(f" [OK] Offline scrape completed in {offline_time:.2f}s")
print(f" [OK] Got lot data: {page_data.get('title', 'N/A')[:60]}...")
# Check closing time is available
closing_time = page_data.get('closing_time')
if not closing_time:
print(f" [FAIL] No closing time in offline mode")
return False
print(f" [OK] Closing time preserved: {closing_time}")
# Verify essential fields are present
essential_fields = ['lot_id', 'title', 'url', 'location']
missing_fields = [f for f in essential_fields if not page_data.get(f)]
if missing_fields:
print(f" [FAIL] Missing essential fields: {missing_fields}")
return False
print(f" [OK] All essential fields present")
# Check database has the lot
conn = sqlite3.connect(self.test_db)
cursor = conn.execute("SELECT closing_time FROM lots WHERE url = ?", (test_url,))
row = cursor.fetchone()
conn.close()
if not row:
print(f" [FAIL] Lot not found in database")
return False
db_closing_time = row[0]
print(f" [OK] Database has closing time: {db_closing_time}")
if db_closing_time != closing_time:
print(f" [FAIL] Closing time mismatch")
print(f" Scraped: {closing_time}")
print(f" Database: {db_closing_time}")
return False
print("\n[PASS] TEST 2 PASSED: Offline mode works, closing time preserved")
return True
finally:
# Restore offline mode
config.OFFLINE = original_offline
self.scraper.offline = original_offline
async def _scrape_single_page(self, url):
"""Helper to scrape a single page"""
from playwright.async_api import async_playwright
if config.OFFLINE or self.scraper.offline:
# Offline mode - use crawl_page directly
return await self.scraper.crawl_page(page=None, url=url)
# Online mode - need browser
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
page = await browser.new_page()
try:
result = await self.scraper.crawl_page(page, url)
return result
finally:
await browser.close()
async def run_all_tests(self):
"""Run all tests"""
print("\n" + "="*70)
print("CACHE BEHAVIOR TEST SUITE")
print("="*70)
self.setup()
results = []
try:
# Test 1: Page fetched once
result1 = await self.test_page_fetched_once()
results.append(("Page Fetched Once", result1))
# Test 2: Offline mode
result2 = await self.test_offline_mode()
results.append(("Offline Mode", result2))
except Exception as e:
print(f"\n[ERROR] TEST SUITE ERROR: {e}")
import traceback
traceback.print_exc()
finally:
self.teardown()
# Print summary
print("\n" + "="*70)
print("TEST SUMMARY")
print("="*70)
all_passed = True
for test_name, passed in results:
status = "[PASS]" if passed else "[FAIL]"
print(f" {status}: {test_name}")
if not passed:
all_passed = False
print("="*70)
if all_passed:
print("\n*** ALL TESTS PASSED! ***")
return 0
else:
print("\n*** SOME TESTS FAILED ***")
return 1
async def main():
"""Run tests"""
tester = TestCacheBehavior()
exit_code = await tester.run_all_tests()
sys.exit(exit_code)
if __name__ == "__main__":
asyncio.run(main())