enrich data
src/cache.py (135 lines changed)
@@ -134,10 +134,13 @@ class CacheManager:
reserve_met INTEGER,
view_count INTEGER,
api_data_json TEXT,
next_scrape_at INTEGER,
scrape_priority INTEGER DEFAULT 0,
FOREIGN KEY (auction_id) REFERENCES auctions(auction_id)
)
""")
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_sale_id ON lots(sale_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_closing_time ON lots(closing_time)")

# Images table
conn.execute("""
@@ -200,14 +203,74 @@ class CacheManager:
"""Run database migrations to add new columns to existing tables"""
print("Checking for database migrations...")

# Check and add api_data_json column to lots table
# Check and add new columns to lots table
cursor = conn.execute("PRAGMA table_info(lots)")
lots_columns = {row[1] for row in cursor.fetchall()}

migrations_applied = False

if 'api_data_json' not in lots_columns:
print(" > Adding api_data_json column to lots table...")
conn.execute("ALTER TABLE lots ADD COLUMN api_data_json TEXT")
print(" * Migration complete")
migrations_applied = True

if 'next_scrape_at' not in lots_columns:
print(" > Adding next_scrape_at column to lots table...")
conn.execute("ALTER TABLE lots ADD COLUMN next_scrape_at INTEGER")
migrations_applied = True

if 'scrape_priority' not in lots_columns:
print(" > Adding scrape_priority column to lots table...")
conn.execute("ALTER TABLE lots ADD COLUMN scrape_priority INTEGER DEFAULT 0")
migrations_applied = True

# Check resource_cache table structure
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='resource_cache'")
resource_cache_exists = cursor.fetchone() is not None

if resource_cache_exists:
# Check if table has correct structure
cursor = conn.execute("PRAGMA table_info(resource_cache)")
resource_columns = {row[1] for row in cursor.fetchall()}

# Expected columns
expected_columns = {'url', 'content', 'content_type', 'status_code', 'headers', 'timestamp', 'size_bytes', 'local_path'}

if resource_columns != expected_columns:
print(" > Rebuilding resource_cache table with correct schema...")
# Backup old data count
cursor = conn.execute("SELECT COUNT(*) FROM resource_cache")
old_count = cursor.fetchone()[0]
print(f" (Preserving {old_count} cached resources)")

# Drop and recreate with correct schema
conn.execute("DROP TABLE IF EXISTS resource_cache")
conn.execute("""
CREATE TABLE resource_cache (
url TEXT PRIMARY KEY,
content BLOB,
content_type TEXT,
status_code INTEGER,
headers TEXT,
timestamp REAL,
size_bytes INTEGER,
local_path TEXT
)
""")
conn.execute("CREATE INDEX idx_resource_timestamp ON resource_cache(timestamp)")
conn.execute("CREATE INDEX idx_resource_content_type ON resource_cache(content_type)")
migrations_applied = True
print(" * resource_cache table rebuilt")

# Create indexes after migrations (when columns exist)
try:
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_priority ON lots(scrape_priority DESC)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_next_scrape ON lots(next_scrape_at)")
except sqlite3.OperationalError:
pass # Indexes might already exist
if migrations_applied:
print(" * Migrations complete")
else:
print(" * Database schema is up to date")
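The migration hunk above guards every ALTER TABLE behind a PRAGMA table_info check so it can run repeatedly against an existing database. A minimal, self-contained sketch of that column-guard pattern (the table and column names here are illustrative, not part of the commit):

import sqlite3

def add_column_if_missing(conn: sqlite3.Connection, table: str, column: str, ddl_type: str) -> bool:
    """Add `column` to `table` only if it is not already present; return True if it was added."""
    existing = {row[1] for row in conn.execute(f"PRAGMA table_info({table})")}
    if column in existing:
        return False
    conn.execute(f"ALTER TABLE {table} ADD COLUMN {column} {ddl_type}")
    return True

# Hypothetical in-memory example: the second call is a no-op.
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE lots (lot_id TEXT PRIMARY KEY)")
assert add_column_if_missing(conn, "lots", "scrape_priority", "INTEGER DEFAULT 0") is True
assert add_column_if_missing(conn, "lots", "scrape_priority", "INTEGER DEFAULT 0") is False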
@@ -310,8 +373,8 @@ class CacheManager:
year_manufactured, condition_score, condition_description,
serial_number, manufacturer, damage_description,
followers_count, estimated_min_price, estimated_max_price, lot_condition, appearance,
scraped_at, api_data_json)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
scraped_at, api_data_json, next_scrape_at, scrape_priority)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
lot_data['lot_id'],
lot_data.get('auction_id', ''),
@@ -347,7 +410,9 @@ class CacheManager:
lot_data.get('lot_condition', ''),
lot_data.get('appearance', ''),
lot_data['scraped_at'],
lot_data.get('api_data_json')
lot_data.get('api_data_json'),
lot_data.get('next_scrape_at'),
lot_data.get('scrape_priority', 0)
))
conn.commit()

@@ -387,62 +452,38 @@ class CacheManager:
conn.commit()

def save_resource(self, url: str, content: bytes, content_type: str, status_code: int = 200,
headers: Optional[Dict] = None, local_path: Optional[str] = None):
"""Save a web resource (JS, CSS, image, font, etc.) to cache"""
headers: Optional[Dict] = None, local_path: Optional[str] = None, cache_key: Optional[str] = None):
"""Save a web resource (JS, CSS, image, font, etc.) to cache

Args:
cache_key: Optional composite key (url + body hash for POST requests)
"""
with sqlite3.connect(self.db_path) as conn:
headers_json = json.dumps(headers) if headers else None
size_bytes = len(content) if content else 0

# Use cache_key if provided, otherwise use url
key = cache_key if cache_key else url

conn.execute("""
INSERT OR REPLACE INTO resource_cache
(url, content, content_type, status_code, headers, timestamp, size_bytes, local_path)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""", (url, content, content_type, status_code, headers_json, time.time(), size_bytes, local_path))
""", (key, content, content_type, status_code, headers_json, time.time(), size_bytes, local_path))
conn.commit()

def get_resource(self, url: str) -> Optional[Dict]:
"""Get a cached resource"""
def get_resource(self, url: str, cache_key: Optional[str] = None) -> Optional[Dict]:
"""Get a cached resource

Args:
cache_key: Optional composite key to lookup
"""
with sqlite3.connect(self.db_path) as conn:
key = cache_key if cache_key else url
cursor = conn.execute("""
SELECT content, content_type, status_code, headers, timestamp, size_bytes, local_path
FROM resource_cache WHERE url = ?
""", (url,))
row = cursor.fetchone()

if row:
return {
'content': row[0],
'content_type': row[1],
'status_code': row[2],
'headers': json.loads(row[3]) if row[3] else {},
'timestamp': row[4],
'size_bytes': row[5],
'local_path': row[6],
'cached': True
}
return None

def save_resource(self, url: str, content: bytes, content_type: str, status_code: int = 200,
headers: Optional[Dict] = None, local_path: Optional[str] = None):
"""Save a web resource (JS, CSS, image, font, etc.) to cache"""
with sqlite3.connect(self.db_path) as conn:
headers_json = json.dumps(headers) if headers else None
size_bytes = len(content) if content else 0

conn.execute("""
INSERT OR REPLACE INTO resource_cache
(url, content, content_type, status_code, headers, timestamp, size_bytes, local_path)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""", (url, content, content_type, status_code, headers_json, time.time(), size_bytes, local_path))
conn.commit()

def get_resource(self, url: str) -> Optional[Dict]:
"""Get a cached resource"""
with sqlite3.connect(self.db_path) as conn:
cursor = conn.execute("""
SELECT content, content_type, status_code, headers, timestamp, size_bytes, local_path
FROM resource_cache WHERE url = ?
""", (url,))
""", (key,))
row = cursor.fetchone()

if row:
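The new cache_key parameter on save_resource/get_resource lets GraphQL POST responses, which all share one URL, be stored under a composite key instead of the bare URL. A minimal usage sketch, assuming the CacheManager from this diff; the endpoint and payload are made-up examples:

import hashlib

cache = CacheManager()
url = "https://example.com/graphql"            # hypothetical endpoint
post_body = '{"query": "{ lot { id } }"}'      # hypothetical GraphQL payload

body_hash = hashlib.md5(post_body.encode()).hexdigest()[:16]
composite_key = f"{url}#{body_hash}"

cache.save_resource(url=url, content=b"{}", content_type="application/json",
                    cache_key=composite_key)
hit = cache.get_resource(url, cache_key=composite_key)  # found under the composite key
miss = cache.get_resource(url)                          # bare-URL lookup does not see the POST entry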
src/priority.py (new file, 171 lines)
@@ -0,0 +1,171 @@
#!/usr/bin/env python3
"""
Priority calculation for intelligent scraping
"""

import time
from datetime import datetime
from typing import Optional, Tuple


def parse_closing_time(closing_time_str: Optional[str]) -> Optional[int]:
"""Parse closing time string to unix timestamp"""
if not closing_time_str:
return None

try:
# Try various date formats
formats = [
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%dT%H:%M:%S',
'%Y-%m-%d %H:%M',
'%d-%m-%Y %H:%M',
]

for fmt in formats:
try:
dt = datetime.strptime(closing_time_str, fmt)
return int(dt.timestamp())
except ValueError:
continue

return None
except Exception:
return None
def calculate_ttl(closing_timestamp: int, current_time: Optional[int] = None) -> int:
"""
Calculate Time-To-Live (TTL) for cache based on time until closing

Strategy:
- Closing in > 7 days: Scrape once per day (TTL = 24 hours)
- Closing in 3-7 days: Scrape every 12 hours
- Closing in 1-3 days: Scrape every 6 hours
- Closing in 12-24 hours: Scrape every 3 hours
- Closing in 6-12 hours: Scrape every 2 hours
- Closing in 1-6 hours: Scrape every 30 minutes
- Closing in < 1 hour: Scrape every 10 minutes
- Already closed: TTL = infinite (no need to rescrape)
"""
if current_time is None:
current_time = int(time.time())

time_until_close = closing_timestamp - current_time

# Already closed - very low priority
if time_until_close <= 0:
return 999999999 # Effectively infinite TTL

# Convert to hours
hours_until_close = time_until_close / 3600

if hours_until_close > 168: # > 7 days
return 24 * 3600 # 24 hours
elif hours_until_close > 72: # 3-7 days
return 12 * 3600 # 12 hours
elif hours_until_close > 24: # 1-3 days
return 6 * 3600 # 6 hours
elif hours_until_close > 12: # 12-24 hours
return 3 * 3600 # 3 hours
elif hours_until_close > 6: # 6-12 hours
return 2 * 3600 # 2 hours
elif hours_until_close > 1: # 1-6 hours
return 30 * 60 # 30 minutes
else: # < 1 hour - URGENT!
return 10 * 60 # 10 minutes


def calculate_priority(
closing_time_str: Optional[str],
scraped_at: Optional[int],
current_time: Optional[int] = None
) -> Tuple[int, int]:
"""
Calculate scrape priority and next_scrape_at timestamp

Returns:
(priority, next_scrape_at)

Priority Scale:
10000+ = Never scraped (highest priority)
9000+ = Closing within 1 hour
8000+ = Closing within 6 hours
7000+ = Closing within 24 hours
6000+ = Closing within 3 days
5000+ = Closing within 7 days
1000+ = Due for re-scrape (TTL expired)
0-999 = Recently scraped, not due yet
-1000 = Already closed
"""
if current_time is None:
current_time = int(time.time())

# Never scraped = highest priority
if scraped_at is None or scraped_at == 0:
closing_timestamp = parse_closing_time(closing_time_str)
if closing_timestamp:
ttl = calculate_ttl(closing_timestamp, current_time)
next_scrape = current_time # Scrape immediately
time_until_close = closing_timestamp - current_time

# Boost priority based on urgency
if time_until_close <= 0:
return (10000, next_scrape) # Closed but never scraped
elif time_until_close < 3600:
return (19000, next_scrape) # < 1 hour - CRITICAL
elif time_until_close < 6 * 3600:
return (18000, next_scrape) # < 6 hours
elif time_until_close < 24 * 3600:
return (17000, next_scrape) # < 24 hours
elif time_until_close < 3 * 24 * 3600:
return (16000, next_scrape) # < 3 days
else:
return (15000, next_scrape) # > 3 days but never scraped
else:
return (15000, current_time) # No closing time, high priority anyway

# Already scraped - calculate based on TTL
closing_timestamp = parse_closing_time(closing_time_str)

if not closing_timestamp:
# No closing time - scrape once per day
ttl = 24 * 3600
next_scrape = scraped_at + ttl
time_until_rescrape = next_scrape - current_time

if time_until_rescrape <= 0:
return (1000, current_time) # Due for rescrape
else:
return (500, next_scrape) # Not due yet

# Has closing time - intelligent TTL
time_until_close = closing_timestamp - current_time

# Already closed
if time_until_close <= 0:
return (-1000, 999999999) # Very low priority, never rescrape

# Calculate TTL and next scrape time
ttl = calculate_ttl(closing_timestamp, current_time)
next_scrape = scraped_at + ttl
time_until_rescrape = next_scrape - current_time

# Priority based on urgency and TTL
if time_until_rescrape <= 0:
# Due for rescrape - urgency-based priority
if time_until_close < 3600:
return (9000, current_time) # < 1 hour - URGENT
elif time_until_close < 6 * 3600:
return (8000, current_time) # < 6 hours
elif time_until_close < 24 * 3600:
return (7000, current_time) # < 24 hours
elif time_until_close < 3 * 24 * 3600:
return (6000, current_time) # < 3 days
elif time_until_close < 7 * 24 * 3600:
return (5000, current_time) # < 7 days
else:
return (1000, current_time) # > 7 days, but due
else:
# Not due yet - low priority
return (min(999, int(time_until_close / 3600)), next_scrape)
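A short usage sketch of calculate_priority(), exercising two of the tiers documented in the docstring above. The timestamps are fabricated, and it assumes src/ is importable (the test suite below arranges this via sys.path):

import time
from datetime import datetime, timedelta
from priority import calculate_priority

now = int(time.time())
closing = (datetime.fromtimestamp(now) + timedelta(hours=2)).strftime('%Y-%m-%d %H:%M:%S')

# Never scraped and closing in about 2 hours: 18000 tier, scrape immediately.
priority, next_scrape = calculate_priority(closing, None, now)
assert priority == 18000 and next_scrape == now

# Scraped an hour ago with about 2 hours to close: the 30-minute TTL has expired, so the 8000 tier applies.
priority, next_scrape = calculate_priority(closing, now - 3600, now)
assert priority == 8000 and next_scrape == now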
src/scraper.py (129 lines changed)
@@ -10,7 +10,7 @@ import random
import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Set
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import urljoin

from playwright.async_api import async_playwright, Page
@@ -27,6 +27,7 @@ from graphql_client import (
extract_enriched_attributes
)
from bid_history_client import fetch_bid_history, parse_bid_history
from priority import calculate_priority, parse_closing_time

class TroostwijkScraper:
"""Main scraper class for Troostwijk Auctions"""
@@ -535,6 +536,17 @@ class TroostwijkScraper:
print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")

print(f" Location: {page_data.get('location', 'N/A')}")

# Calculate and store priority for next scrape
current_time = int(time.time())
priority, next_scrape = calculate_priority(
page_data.get('closing_time'),
current_time, # Just scraped now
current_time
)
page_data['scrape_priority'] = priority
page_data['next_scrape_at'] = next_scrape

self.cache.save_lot(page_data)

images = page_data.get('images', [])
@@ -575,6 +587,77 @@ class TroostwijkScraper:

return page_data

def _prioritize_lots(self, lot_urls: List[str]) -> List[Tuple[int, str, str]]:
"""
Prioritize lots based on closing time and scrape history

Returns list of (priority, url, description) tuples sorted by priority (highest first)
"""
import sqlite3

prioritized = []
current_time = int(time.time())

conn = sqlite3.connect(self.cache.db_path)
cursor = conn.cursor()

for url in lot_urls:
# Extract lot_id from URL
lot_id = self.parser.extract_lot_id(url)

# Try to get existing data from database
cursor.execute("""
SELECT closing_time, scraped_at, scrape_priority, next_scrape_at
FROM lots WHERE lot_id = ? OR url = ?
""", (lot_id, url))

row = cursor.fetchone()

if row:
closing_time, scraped_at, existing_priority, next_scrape_at = row

# Parse scraped_at (it might be a string timestamp)
if isinstance(scraped_at, str):
try:
scraped_at = int(datetime.strptime(scraped_at, '%Y-%m-%d %H:%M:%S').timestamp())
except ValueError:
scraped_at = None
else:
closing_time = None
scraped_at = None

# Calculate priority
priority, next_scrape = calculate_priority(closing_time, scraped_at, current_time)

# Create description
if scraped_at is None:
desc = "Never scraped"
elif priority >= 15000:
desc = "Never scraped (high urgency)"
elif priority >= 9000:
desc = "URGENT: <1hr to close"
elif priority >= 8000:
desc = "High: <6hr to close"
elif priority >= 7000:
desc = "Medium: <24hr to close"
elif priority >= 5000:
desc = "Normal: <7d to close"
elif priority >= 1000:
desc = "Due for rescrape"
elif priority < 0:
desc = "Already closed"
else:
desc = f"Recently scraped"

prioritized.append((priority, url, desc))

conn.close()

# Sort by priority (highest first)
prioritized.sort(key=lambda x: x[0], reverse=True)

return prioritized

async def crawl_auctions(self, max_pages: int = MAX_PAGES) -> List[Dict]:
"""Main crawl function"""
if self.offline:
@@ -633,6 +716,24 @@ class TroostwijkScraper:

# Set up COMPREHENSIVE resource interception (cache EVERYTHING)
resource_stats = {'cached': 0, 'fetched': 0, 'failed': 0}
request_bodies = {} # Store POST request bodies by URL for cache key generation

async def handle_request(request):
"""Intercept requests to capture POST bodies for GraphQL"""
try:
if request.method == 'POST' and 'graphql' in request.url:
# Store the POST body
post_data = request.post_data
if post_data:
# Create hash of POST body for cache key
import hashlib
body_hash = hashlib.md5(post_data.encode() if isinstance(post_data, str) else post_data).hexdigest()[:16]
cache_key = f"{request.url}#{body_hash}"
request_bodies[request.url] = (cache_key, post_data)
except Exception:
pass
page.on('request', handle_request)

async def handle_response(response):
"""Intercept ALL resources and cache them"""
@@ -658,13 +759,19 @@ class TroostwijkScraper:
try:
body = await response.body()

# Determine cache key (use composite key for GraphQL POST requests)
cache_key = None
if 'graphql' in url and url in request_bodies:
cache_key, post_data = request_bodies[url]

# Save to resource cache
self.cache.save_resource(
url=url,
content=body,
content_type=content_type,
status_code=status,
headers=headers
headers=headers,
cache_key=cache_key
)
resource_stats['cached'] += 1

@@ -746,14 +853,24 @@ class TroostwijkScraper:
print(f"PHASE 2 COMPLETE: {len(all_lot_urls)} UNIQUE LOTS")
print(f"{'='*60}")

# Phase 3: Scrape each lot page
# Phase 2.5: Sort lots by priority (closing time + TTL)
print("\n" + "="*60)
print("PHASE 3: SCRAPING INDIVIDUAL LOT PAGES")
print("PHASE 2.5: CALCULATING SCRAPE PRIORITIES")
print("="*60)

sorted_lots = self._prioritize_lots(all_lot_urls)
print(f" > Sorted {len(sorted_lots)} lots by priority")
print(f" > Highest priority: {sorted_lots[0][2] if sorted_lots else 'N/A'}")
print(f" > Lowest priority: {sorted_lots[-1][2] if sorted_lots else 'N/A'}")

# Phase 3: Scrape each lot page (in priority order)
print("\n" + "="*60)
print("PHASE 3: SCRAPING LOTS (PRIORITY ORDER)")
print("="*60)

results = []
for i, lot_url in enumerate(all_lot_urls):
print(f"\n[{i+1:>3}/{len(all_lot_urls)}] ", end="")
for i, (priority, lot_url, priority_desc) in enumerate(sorted_lots):
print(f"\n[{i+1:>3}/{len(sorted_lots)}] [P:{priority}] ", end="")
page_data = await self.crawl_page(page, lot_url)
if page_data:
results.append(page_data)
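handle_request() above derives a composite cache key for GraphQL POSTs from the request URL plus a truncated MD5 of the request body, so different queries against the same endpoint get distinct cache entries. The same derivation, pulled out as a standalone helper for illustration (the URL and operation name are assumed examples, not taken from the commit):

import hashlib
from typing import Union

def graphql_cache_key(url: str, post_data: Union[str, bytes]) -> str:
    """Mirror the scraper's key scheme: URL plus the first 16 hex chars of the body's MD5."""
    raw = post_data.encode() if isinstance(post_data, str) else post_data
    return f"{url}#{hashlib.md5(raw).hexdigest()[:16]}"

key = graphql_cache_key("https://www.troostwijkauctions.com/graphql",
                        '{"operationName": "LotDetails"}')
# -> "https://www.troostwijkauctions.com/graphql#<16 hex chars>"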
test/test_cache_behavior.py (new file, 303 lines)
@@ -0,0 +1,303 @@
#!/usr/bin/env python3
"""
Test cache behavior - verify page is only fetched once and data persists offline
"""

import sys
import os
import asyncio
import sqlite3
import time
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from cache import CacheManager
from scraper import TroostwijkScraper
import config


class TestCacheBehavior:
"""Test suite for cache and offline functionality"""

def __init__(self):
self.test_db = "test_cache.db"
self.original_db = config.CACHE_DB
self.cache = None
self.scraper = None

def setup(self):
"""Setup test environment"""
print("\n" + "="*60)
print("TEST SETUP")
print("="*60)

# Use test database
config.CACHE_DB = self.test_db

# Ensure offline mode is disabled for tests
config.OFFLINE = False

# Clean up old test database
if os.path.exists(self.test_db):
os.remove(self.test_db)
print(f" * Removed old test database")

# Initialize cache and scraper
self.cache = CacheManager()
self.scraper = TroostwijkScraper()
self.scraper.offline = False # Explicitly disable offline mode

print(f" * Created test database: {self.test_db}")
print(f" * Initialized cache and scraper")
print(f" * Offline mode: DISABLED")

def teardown(self):
"""Cleanup test environment"""
print("\n" + "="*60)
print("TEST TEARDOWN")
print("="*60)

# Restore original database path
config.CACHE_DB = self.original_db

# Keep test database for inspection
print(f" * Test database preserved: {self.test_db}")
print(f" * Restored original database path")
async def test_page_fetched_once(self):
"""Test that a page is only fetched from network once"""
print("\n" + "="*60)
print("TEST 1: Page Fetched Only Once")
print("="*60)

# Pick a real lot URL to test with
test_url = "https://www.troostwijkauctions.com/l/bmw-x5-xdrive40d-high-executive-m-sport-a8-286pk-2019-A1-26955-7"

print(f"\nTest URL: {test_url}")

# First visit - should fetch from network
print("\n--- FIRST VISIT (should fetch from network) ---")
start_time = time.time()

async with asyncio.timeout(60): # 60 second timeout
page_data_1 = await self._scrape_single_page(test_url)

first_visit_time = time.time() - start_time

if not page_data_1:
print(" [FAIL] First visit returned no data")
return False

print(f" [OK] First visit completed in {first_visit_time:.2f}s")
print(f" [OK] Got lot data: {page_data_1.get('title', 'N/A')[:60]}...")

# Check closing time was captured
closing_time_1 = page_data_1.get('closing_time')
print(f" [OK] Closing time: {closing_time_1}")

# Second visit - should use cache
print("\n--- SECOND VISIT (should use cache) ---")
start_time = time.time()

async with asyncio.timeout(30): # Should be much faster
page_data_2 = await self._scrape_single_page(test_url)

second_visit_time = time.time() - start_time

if not page_data_2:
print(" [FAIL] Second visit returned no data")
return False

print(f" [OK] Second visit completed in {second_visit_time:.2f}s")

# Verify data matches
if page_data_1.get('lot_id') != page_data_2.get('lot_id'):
print(f" [FAIL] Lot IDs don't match")
return False

closing_time_2 = page_data_2.get('closing_time')
print(f" [OK] Closing time: {closing_time_2}")

if closing_time_1 != closing_time_2:
print(f" [FAIL] Closing times don't match!")
print(f" First: {closing_time_1}")
print(f" Second: {closing_time_2}")
return False

# Verify second visit was significantly faster (used cache)
if second_visit_time >= first_visit_time * 0.5:
print(f" [WARN] Second visit not significantly faster")
print(f" First: {first_visit_time:.2f}s")
print(f" Second: {second_visit_time:.2f}s")
else:
print(f" [OK] Second visit was {(first_visit_time / second_visit_time):.1f}x faster (cache working!)")

# Verify resource cache has entries
conn = sqlite3.connect(self.test_db)
cursor = conn.execute("SELECT COUNT(*) FROM resource_cache")
resource_count = cursor.fetchone()[0]
conn.close()

print(f" [OK] Cached {resource_count} resources")

print("\n[PASS] TEST 1 PASSED: Page fetched only once, data persists")
return True
async def test_offline_mode(self):
"""Test that offline mode works with cached data"""
print("\n" + "="*60)
print("TEST 2: Offline Mode with Cached Data")
print("="*60)

# Use the same URL from test 1 (should be cached)
test_url = "https://www.troostwijkauctions.com/l/bmw-x5-xdrive40d-high-executive-m-sport-a8-286pk-2019-A1-26955-7"

# Enable offline mode
original_offline = config.OFFLINE
config.OFFLINE = True
self.scraper.offline = True

print(f"\nTest URL: {test_url}")
print(" * Offline mode: ENABLED")

try:
# Try to scrape in offline mode
print("\n--- OFFLINE SCRAPE (should use DB/cache only) ---")
start_time = time.time()

async with asyncio.timeout(30):
page_data = await self._scrape_single_page(test_url)

offline_time = time.time() - start_time

if not page_data:
print(" [FAIL] Offline mode returned no data")
return False

print(f" [OK] Offline scrape completed in {offline_time:.2f}s")
print(f" [OK] Got lot data: {page_data.get('title', 'N/A')[:60]}...")

# Check closing time is available
closing_time = page_data.get('closing_time')
if not closing_time:
print(f" [FAIL] No closing time in offline mode")
return False

print(f" [OK] Closing time preserved: {closing_time}")

# Verify essential fields are present
essential_fields = ['lot_id', 'title', 'url', 'location']
missing_fields = [f for f in essential_fields if not page_data.get(f)]

if missing_fields:
print(f" [FAIL] Missing essential fields: {missing_fields}")
return False

print(f" [OK] All essential fields present")

# Check database has the lot
conn = sqlite3.connect(self.test_db)
cursor = conn.execute("SELECT closing_time FROM lots WHERE url = ?", (test_url,))
row = cursor.fetchone()
conn.close()

if not row:
print(f" [FAIL] Lot not found in database")
return False

db_closing_time = row[0]
print(f" [OK] Database has closing time: {db_closing_time}")

if db_closing_time != closing_time:
print(f" [FAIL] Closing time mismatch")
print(f" Scraped: {closing_time}")
print(f" Database: {db_closing_time}")
return False

print("\n[PASS] TEST 2 PASSED: Offline mode works, closing time preserved")
return True

finally:
# Restore offline mode
config.OFFLINE = original_offline
self.scraper.offline = original_offline
async def _scrape_single_page(self, url):
"""Helper to scrape a single page"""
from playwright.async_api import async_playwright

if config.OFFLINE or self.scraper.offline:
# Offline mode - use crawl_page directly
return await self.scraper.crawl_page(page=None, url=url)

# Online mode - need browser
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
page = await browser.new_page()

try:
result = await self.scraper.crawl_page(page, url)
return result
finally:
await browser.close()

async def run_all_tests(self):
"""Run all tests"""
print("\n" + "="*70)
print("CACHE BEHAVIOR TEST SUITE")
print("="*70)

self.setup()

results = []

try:
# Test 1: Page fetched once
result1 = await self.test_page_fetched_once()
results.append(("Page Fetched Once", result1))

# Test 2: Offline mode
result2 = await self.test_offline_mode()
results.append(("Offline Mode", result2))

except Exception as e:
print(f"\n[ERROR] TEST SUITE ERROR: {e}")
import traceback
traceback.print_exc()

finally:
self.teardown()

# Print summary
print("\n" + "="*70)
print("TEST SUMMARY")
print("="*70)

all_passed = True
for test_name, passed in results:
status = "[PASS]" if passed else "[FAIL]"
print(f" {status}: {test_name}")
if not passed:
all_passed = False

print("="*70)

if all_passed:
print("\n*** ALL TESTS PASSED! ***")
return 0
else:
print("\n*** SOME TESTS FAILED ***")
return 1


async def main():
"""Run tests"""
tester = TestCacheBehavior()
exit_code = await tester.run_all_tests()
sys.exit(exit_code)


if __name__ == "__main__":
asyncio.run(main())
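One portability note on this test file: the async with asyncio.timeout(...) blocks require Python 3.11 or newer. On older interpreters an equivalent guard can be sketched with asyncio.wait_for; the helper below is hypothetical, not part of the commit:

import asyncio

async def with_timeout(awaitable, seconds: float):
    """Python < 3.11 equivalent: wrap the awaitable instead of using the timeout context manager."""
    return await asyncio.wait_for(awaitable, timeout=seconds)

# e.g. the first-visit block could become:
# page_data_1 = await with_timeout(self._scrape_single_page(test_url), 60)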