enrich data

This commit is contained in:
Tour
2025-12-09 01:39:36 +01:00
parent 83d0fc1329
commit 06f63732b1
2 changed files with 181 additions and 22 deletions


@@ -6,6 +6,7 @@ Cache Manager module for SQLite-based caching and data storage
import sqlite3
import time
import zlib
import json
from datetime import datetime
from typing import Dict, List, Optional
@@ -21,7 +22,7 @@ class CacheManager:
    def _init_db(self):
        """Initialize cache and data storage database with consolidated schema"""
        with sqlite3.connect(self.db_path) as conn:
            # Cache table
            # HTML page cache table (existing)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS cache (
                    url TEXT PRIMARY KEY,
@@ -34,6 +35,26 @@ class CacheManager:
                CREATE INDEX IF NOT EXISTS idx_timestamp ON cache(timestamp)
            """)

            # Resource cache table (NEW: for ALL web resources - JS, CSS, images, fonts, etc.)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS resource_cache (
                    url TEXT PRIMARY KEY,
                    content BLOB,
                    content_type TEXT,
                    status_code INTEGER,
                    headers TEXT,
                    timestamp REAL,
                    size_bytes INTEGER,
                    local_path TEXT
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_resource_timestamp ON resource_cache(timestamp)
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_resource_content_type ON resource_cache(content_type)
            """)
            # Auctions table - consolidated schema
            conn.execute("""
                CREATE TABLE IF NOT EXISTS auctions (
@@ -170,8 +191,26 @@ class CacheManager:
                ON bid_history(bidder_id)
            """)

            # MIGRATIONS: Add new columns to existing tables
            self._run_migrations(conn)

            conn.commit()
    def _run_migrations(self, conn):
        """Run database migrations to add new columns to existing tables"""
        print("Checking for database migrations...")

        # Check and add api_data_json column to lots table
        cursor = conn.execute("PRAGMA table_info(lots)")
        lots_columns = {row[1] for row in cursor.fetchall()}

        if 'api_data_json' not in lots_columns:
            print(" > Adding api_data_json column to lots table...")
            conn.execute("ALTER TABLE lots ADD COLUMN api_data_json TEXT")
            print(" * Migration complete")
        else:
            print(" * Database schema is up to date")
    def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
        """Get cached page if it exists and is not too old"""
        with sqlite3.connect(self.db_path) as conn:
@@ -345,4 +384,76 @@ class CacheManager:
                INSERT OR IGNORE INTO images (lot_id, url, downloaded)
                VALUES (?, ?, 0)
            """, (lot_id, url))
            conn.commit()
            conn.commit()
    def save_resource(self, url: str, content: bytes, content_type: str, status_code: int = 200,
                      headers: Optional[Dict] = None, local_path: Optional[str] = None):
        """Save a web resource (JS, CSS, image, font, etc.) to cache"""
        with sqlite3.connect(self.db_path) as conn:
            headers_json = json.dumps(headers) if headers else None
            size_bytes = len(content) if content else 0

            conn.execute("""
                INSERT OR REPLACE INTO resource_cache
                (url, content, content_type, status_code, headers, timestamp, size_bytes, local_path)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """, (url, content, content_type, status_code, headers_json, time.time(), size_bytes, local_path))
            conn.commit()

    def get_resource(self, url: str) -> Optional[Dict]:
        """Get a cached resource"""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute("""
                SELECT content, content_type, status_code, headers, timestamp, size_bytes, local_path
                FROM resource_cache WHERE url = ?
            """, (url,))
            row = cursor.fetchone()

            if row:
                return {
                    'content': row[0],
                    'content_type': row[1],
                    'status_code': row[2],
                    'headers': json.loads(row[3]) if row[3] else {},
                    'timestamp': row[4],
                    'size_bytes': row[5],
                    'local_path': row[6],
                    'cached': True
                }
            return None
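
A minimal round trip through the new resource cache, assuming CacheManager is imported from this module and constructed with the SQLite path it uses as self.db_path (the constructor is not shown in this diff); the URL, content, and filename are illustrative:

    cache = CacheManager("troostwijk_cache.db")  # constructor argument assumed

    cache.save_resource(
        url="https://example.com/static/app.js",   # illustrative URL
        content=b"console.log('hello');",
        content_type="application/javascript",
        headers={"etag": "abc123"},
    )

    hit = cache.get_resource("https://example.com/static/app.js")
    if hit:
        print(hit["content_type"], hit["size_bytes"], hit["headers"].get("etag"))
    else:
        print("cache miss")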


@@ -631,29 +631,67 @@ class TroostwijkScraper:
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
})
-        # Set up GraphQL API interception
+        # Set up COMPREHENSIVE resource interception (cache EVERYTHING)
+        resource_stats = {'cached': 0, 'fetched': 0, 'failed': 0}

         async def handle_response(response):
-            """Intercept GraphQL API responses"""
-            if 'graphql' in response.url and response.status == 200:
-                try:
-                    body = await response.body()
-                    body_text = body.decode('utf-8')
-                    # Try to extract lot_id from the request to key our cache
-                    # The URL pattern is typically: .../storefront/graphql
-                    # We'll store by lot_id which we extract from the response data
-                    data = json.loads(body_text)
-                    # Check if this is a lot details query
-                    if 'data' in data and 'lot' in data.get('data', {}):
-                        lot_data = data['data']['lot']
-                        lot_slug = lot_data.get('urlSlug', '')
-                        if lot_slug:
-                            self.intercepted_api_data[lot_slug] = body_text
-                            print(f" >> Intercepted API data for: {lot_slug}")
-                except Exception as e:
-                    # Silent fail - interception is opportunistic
-                    pass
+            """Intercept ALL resources and cache them"""
+            try:
+                url = response.url
+                status = response.status
+
+                # Get content type
+                headers = await response.all_headers()
+                content_type = headers.get('content-type', '').split(';')[0].strip()
+
+                # Determine if we should cache this resource
+                cacheable_types = [
+                    'text/html', 'text/css', 'text/javascript', 'application/javascript',
+                    'application/json', 'application/x-javascript', 'image/', 'font/',
+                    'application/font', 'video/', 'audio/', 'application/xml', 'text/xml',
+                    'image/svg+xml'
+                ]
+                should_cache = any(content_type.startswith(ct) for ct in cacheable_types)
+
+                if should_cache and status == 200:
+                    try:
+                        body = await response.body()
+
+                        # Save to resource cache
+                        self.cache.save_resource(
+                            url=url,
+                            content=body,
+                            content_type=content_type,
+                            status_code=status,
+                            headers=headers
+                        )
+                        resource_stats['cached'] += 1
+
+                        # Special handling for GraphQL responses
+                        if 'graphql' in url and 'application/json' in content_type:
+                            try:
+                                body_text = body.decode('utf-8')
+                                data = json.loads(body_text)
+                                # Check if this is a lot details query
+                                if 'data' in data and 'lot' in data.get('data', {}):
+                                    lot_data = data['data']['lot']
+                                    lot_slug = lot_data.get('urlSlug', '')
+                                    if lot_slug:
+                                        self.intercepted_api_data[lot_slug] = body_text
+                                        print(f" >> Intercepted GraphQL for: {lot_slug}")
+                            except:
+                                pass
+                    except Exception as e:
+                        resource_stats['failed'] += 1
+                else:
+                    resource_stats['fetched'] += 1
+            except Exception as e:
+                # Silent fail - interception is opportunistic
+                pass
page.on('response', handle_response)
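
The handler above only records responses; the diff does not show cached resources being served back. A sketch of how the same resource_cache could fulfill requests offline with Playwright's routing API, assuming the same self.cache and page objects as in the surrounding code (serve_from_cache is hypothetical, not part of this commit):

    async def serve_from_cache(route):
        # Hypothetical replay path: answer from resource_cache, fall back to the network.
        cached = self.cache.get_resource(route.request.url)
        if cached and cached['content']:
            await route.fulfill(
                status=cached['status_code'],
                content_type=cached['content_type'],
                body=cached['content'],
            )
        else:
            await route.continue_()

    await page.route('**/*', serve_from_cache)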
@@ -721,6 +759,16 @@ class TroostwijkScraper:
            results.append(page_data)

        await browser.close()

        # Print resource caching statistics
        print(f"\n{'='*60}")
        print(f"RESOURCE CACHE STATISTICS")
        print(f"{'='*60}")
        print(f" Cached: {resource_stats['cached']} resources")
        print(f" Fetched (not cached): {resource_stats['fetched']}")
        print(f" Failed: {resource_stats['failed']}")
        print(f"{'='*60}")

        return results
def export_to_files(self) -> Dict[str, str]:
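
For checking what the interception actually stored, the resource_cache table can be summarized directly in SQLite. A small standalone sketch (the database filename is assumed, matching the earlier example):

    import sqlite3

    with sqlite3.connect("troostwijk_cache.db") as conn:  # path assumed
        rows = conn.execute("""
            SELECT content_type, COUNT(*) AS n, SUM(size_bytes) AS total_bytes
            FROM resource_cache
            GROUP BY content_type
            ORDER BY total_bytes DESC
        """).fetchall()
        for content_type, n, total_bytes in rows:
            print(f"{content_type or '(unknown)':40} {n:6d} {total_bytes or 0:12d} bytes")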