diff --git a/src/cache.py b/src/cache.py
index 0d00130..f31e93c 100644
--- a/src/cache.py
+++ b/src/cache.py
@@ -6,6 +6,7 @@ Cache Manager module for SQLite-based caching and data storage
 import sqlite3
 import time
 import zlib
+import json
 from datetime import datetime
 from typing import Dict, List, Optional
 
@@ -21,7 +22,7 @@ class CacheManager:
     def _init_db(self):
         """Initialize cache and data storage database with consolidated schema"""
         with sqlite3.connect(self.db_path) as conn:
-            # Cache table
+            # HTML page cache table (existing)
             conn.execute("""
                 CREATE TABLE IF NOT EXISTS cache (
                     url TEXT PRIMARY KEY,
@@ -34,6 +35,26 @@ class CacheManager:
                 CREATE INDEX IF NOT EXISTS idx_timestamp ON cache(timestamp)
             """)
 
+            # Resource cache table (NEW: for ALL web resources - JS, CSS, images, fonts, etc.)
+            conn.execute("""
+                CREATE TABLE IF NOT EXISTS resource_cache (
+                    url TEXT PRIMARY KEY,
+                    content BLOB,
+                    content_type TEXT,
+                    status_code INTEGER,
+                    headers TEXT,
+                    timestamp REAL,
+                    size_bytes INTEGER,
+                    local_path TEXT
+                )
+            """)
+            conn.execute("""
+                CREATE INDEX IF NOT EXISTS idx_resource_timestamp ON resource_cache(timestamp)
+            """)
+            conn.execute("""
+                CREATE INDEX IF NOT EXISTS idx_resource_content_type ON resource_cache(content_type)
+            """)
+
             # Auctions table - consolidated schema
             conn.execute("""
                 CREATE TABLE IF NOT EXISTS auctions (
@@ -170,8 +191,26 @@ class CacheManager:
                 ON bid_history(bidder_id)
             """)
 
+            # MIGRATIONS: Add new columns to existing tables
+            self._run_migrations(conn)
+
             conn.commit()
 
+    def _run_migrations(self, conn):
+        """Run database migrations to add new columns to existing tables"""
+        print("Checking for database migrations...")
+
+        # Check and add api_data_json column to lots table
+        cursor = conn.execute("PRAGMA table_info(lots)")
+        lots_columns = {row[1] for row in cursor.fetchall()}
+
+        if 'api_data_json' not in lots_columns:
+            print("  > Adding api_data_json column to lots table...")
+            conn.execute("ALTER TABLE lots ADD COLUMN api_data_json TEXT")
+            print("  * Migration complete")
+        else:
+            print("  * Database schema is up to date")
+
     def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
         """Get cached page if it exists and is not too old"""
         with sqlite3.connect(self.db_path) as conn:
@@ -345,4 +384,40 @@ class CacheManager:
                     INSERT OR IGNORE INTO images (lot_id, url, downloaded)
                     VALUES (?, ?, 0)
                 """, (lot_id, url))
-            conn.commit()
\ No newline at end of file
+            conn.commit()
+
+    def save_resource(self, url: str, content: bytes, content_type: str, status_code: int = 200,
+                      headers: Optional[Dict] = None, local_path: Optional[str] = None):
+        """Save a web resource (JS, CSS, image, font, etc.) to cache"""
+        with sqlite3.connect(self.db_path) as conn:
+            headers_json = json.dumps(headers) if headers else None
+            size_bytes = len(content) if content else 0
+
+            conn.execute("""
+                INSERT OR REPLACE INTO resource_cache
+                (url, content, content_type, status_code, headers, timestamp, size_bytes, local_path)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+            """, (url, content, content_type, status_code, headers_json, time.time(), size_bytes, local_path))
+            conn.commit()
+
+    def get_resource(self, url: str) -> Optional[Dict]:
+        """Get a cached resource"""
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.execute("""
+                SELECT content, content_type, status_code, headers, timestamp, size_bytes, local_path
+                FROM resource_cache WHERE url = ?
+            """, (url,))
+            row = cursor.fetchone()
+
+            if row:
+                return {
+                    'content': row[0],
+                    'content_type': row[1],
+                    'status_code': row[2],
+                    'headers': json.loads(row[3]) if row[3] else {},
+                    'timestamp': row[4],
+                    'size_bytes': row[5],
+                    'local_path': row[6],
+                    'cached': True
+                }
+            return None
\ No newline at end of file
diff --git a/src/scraper.py b/src/scraper.py
index 718fbda..3eae799 100644
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -631,29 +631,67 @@ class TroostwijkScraper:
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
         })
 
-        # Set up GraphQL API interception
+        # Set up COMPREHENSIVE resource interception (cache EVERYTHING)
+        resource_stats = {'cached': 0, 'fetched': 0, 'failed': 0}
+
         async def handle_response(response):
-            """Intercept GraphQL API responses"""
-            if 'graphql' in response.url and response.status == 200:
-                try:
-                    body = await response.body()
-                    body_text = body.decode('utf-8')
+            """Intercept ALL resources and cache them"""
+            try:
+                url = response.url
+                status = response.status
 
-                    # Try to extract lot_id from the request to key our cache
-                    # The URL pattern is typically: .../storefront/graphql
-                    # We'll store by lot_id which we extract from the response data
-                    data = json.loads(body_text)
+                # Get content type
+                headers = await response.all_headers()
+                content_type = headers.get('content-type', '').split(';')[0].strip()
 
-                    # Check if this is a lot details query
-                    if 'data' in data and 'lot' in data.get('data', {}):
-                        lot_data = data['data']['lot']
-                        lot_slug = lot_data.get('urlSlug', '')
-                        if lot_slug:
-                            self.intercepted_api_data[lot_slug] = body_text
-                            print(f"  >> Intercepted API data for: {lot_slug}")
-                except Exception as e:
-                    # Silent fail - interception is opportunistic
-                    pass
+                # Determine if we should cache this resource
+                cacheable_types = [
+                    'text/html', 'text/css', 'text/javascript', 'application/javascript',
+                    'application/json', 'application/x-javascript', 'image/', 'font/',
+                    'application/font', 'video/', 'audio/', 'application/xml', 'text/xml',
+                    'image/svg+xml'
+                ]
+
+                should_cache = any(content_type.startswith(ct) for ct in cacheable_types)
+
+                if should_cache and status == 200:
+                    try:
+                        body = await response.body()
+
+                        # Save to resource cache
+                        self.cache.save_resource(
+                            url=url,
+                            content=body,
+                            content_type=content_type,
+                            status_code=status,
+                            headers=headers
+                        )
+                        resource_stats['cached'] += 1
+
+                        # Special handling for GraphQL responses
+                        if 'graphql' in url and 'application/json' in content_type:
+                            try:
+                                body_text = body.decode('utf-8')
+                                data = json.loads(body_text)
+
+                                # Check if this is a lot details query
+                                if 'data' in data and 'lot' in data.get('data', {}):
+                                    lot_data = data['data']['lot']
+                                    lot_slug = lot_data.get('urlSlug', '')
+                                    if lot_slug:
+                                        self.intercepted_api_data[lot_slug] = body_text
+                                        print(f"  >> Intercepted GraphQL for: {lot_slug}")
+                            except Exception:
+                                pass
+
+                    except Exception as e:
+                        resource_stats['failed'] += 1
+                else:
+                    resource_stats['fetched'] += 1
+
+            except Exception as e:
+                # Silent fail - interception is opportunistic
+                pass
 
         page.on('response', handle_response)
 
@@ -721,6 +759,16 @@ class TroostwijkScraper:
             results.append(page_data)
 
         await browser.close()
+
+        # Print resource caching statistics
+        print(f"\n{'='*60}")
+        print(f"RESOURCE CACHE STATISTICS")
+        print(f"{'='*60}")
+        print(f"  Cached: {resource_stats['cached']} resources")
+        print(f"  Fetched (not cached): {resource_stats['fetched']}")
+        print(f"  Failed: {resource_stats['failed']}")
+        print(f"{'='*60}")
+
         return results
 
     def export_to_files(self) -> Dict[str, str]:
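
Reviewer note: a minimal sketch (not part of the patch) of how the new resource-cache API introduced above could be exercised. The database path, URL, and payload are illustrative assumptions, and the sketch assumes CacheManager's constructor accepts the SQLite path.

    from src.cache import CacheManager

    # Hypothetical round trip through the new resource_cache table
    cache = CacheManager("cache.db")  # assumed constructor signature; adjust to the real one

    cache.save_resource(
        url="https://example.com/static/app.js",              # illustrative URL
        content=b"console.log('hello');",                     # raw response body bytes
        content_type="application/javascript",
        status_code=200,
        headers={"content-type": "application/javascript"},   # stored as JSON in the headers column
    )

    hit = cache.get_resource("https://example.com/static/app.js")
    if hit:
        # 'content' holds the cached bytes; 'cached' is always True on a hit
        print(hit["content_type"], hit["size_bytes"], hit["cached"])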