enrich data

This commit is contained in:
Tour
2025-12-09 01:39:36 +01:00
parent 83d0fc1329
commit 06f63732b1
2 changed files with 181 additions and 22 deletions


@@ -6,6 +6,7 @@ Cache Manager module for SQLite-based caching and data storage
import sqlite3
import time
import zlib
import json
from datetime import datetime
from typing import Dict, List, Optional
@@ -21,7 +22,7 @@ class CacheManager:
    def _init_db(self):
        """Initialize cache and data storage database with consolidated schema"""
        with sqlite3.connect(self.db_path) as conn:
            # Cache table
            # HTML page cache table (existing)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS cache (
                    url TEXT PRIMARY KEY,
@@ -34,6 +35,26 @@ class CacheManager:
                CREATE INDEX IF NOT EXISTS idx_timestamp ON cache(timestamp)
            """)

            # Resource cache table (NEW: for ALL web resources - JS, CSS, images, fonts, etc.)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS resource_cache (
                    url TEXT PRIMARY KEY,
                    content BLOB,
                    content_type TEXT,
                    status_code INTEGER,
                    headers TEXT,
                    timestamp REAL,
                    size_bytes INTEGER,
                    local_path TEXT
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_resource_timestamp ON resource_cache(timestamp)
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_resource_content_type ON resource_cache(content_type)
            """)
            # Auctions table - consolidated schema
            conn.execute("""
                CREATE TABLE IF NOT EXISTS auctions (
@@ -170,8 +191,26 @@ class CacheManager:
                ON bid_history(bidder_id)
            """)

            # MIGRATIONS: Add new columns to existing tables
            self._run_migrations(conn)

            conn.commit()
    def _run_migrations(self, conn):
        """Run database migrations to add new columns to existing tables"""
        print("Checking for database migrations...")

        # Check and add api_data_json column to lots table
        cursor = conn.execute("PRAGMA table_info(lots)")
        lots_columns = {row[1] for row in cursor.fetchall()}

        if 'api_data_json' not in lots_columns:
            print(" > Adding api_data_json column to lots table...")
            conn.execute("ALTER TABLE lots ADD COLUMN api_data_json TEXT")
            print(" * Migration complete")
        else:
            print(" * Database schema is up to date")
    def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
        """Get cached page if it exists and is not too old"""
        with sqlite3.connect(self.db_path) as conn:
@@ -345,4 +384,76 @@ class CacheManager:
                INSERT OR IGNORE INTO images (lot_id, url, downloaded)
                VALUES (?, ?, 0)
            """, (lot_id, url))
            conn.commit()
            conn.commit()
    def save_resource(self, url: str, content: bytes, content_type: str, status_code: int = 200,
                      headers: Optional[Dict] = None, local_path: Optional[str] = None):
        """Save a web resource (JS, CSS, image, font, etc.) to cache"""
        with sqlite3.connect(self.db_path) as conn:
            headers_json = json.dumps(headers) if headers else None
            size_bytes = len(content) if content else 0

            conn.execute("""
                INSERT OR REPLACE INTO resource_cache
                (url, content, content_type, status_code, headers, timestamp, size_bytes, local_path)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """, (url, content, content_type, status_code, headers_json, time.time(), size_bytes, local_path))
            conn.commit()

    def get_resource(self, url: str) -> Optional[Dict]:
        """Get a cached resource"""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute("""
                SELECT content, content_type, status_code, headers, timestamp, size_bytes, local_path
                FROM resource_cache WHERE url = ?
            """, (url,))
            row = cursor.fetchone()

            if row:
                return {
                    'content': row[0],
                    'content_type': row[1],
                    'status_code': row[2],
                    'headers': json.loads(row[3]) if row[3] else {},
                    'timestamp': row[4],
                    'size_bytes': row[5],
                    'local_path': row[6],
                    'cached': True
                }
            return None
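
A minimal round trip through the new resource cache, assuming CacheManager is imported from this module and constructed with the SQLite path it uses as self.db_path (the constructor is not shown in this diff); the URL, content, and filename are illustrative:

    cache = CacheManager("troostwijk_cache.db")  # constructor argument assumed

    cache.save_resource(
        url="https://example.com/static/app.js",   # illustrative URL
        content=b"console.log('hello');",
        content_type="application/javascript",
        headers={"etag": "abc123"},
    )

    hit = cache.get_resource("https://example.com/static/app.js")
    if hit:
        print(hit["content_type"], hit["size_bytes"], hit["headers"].get("etag"))
    else:
        print("cache miss")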


@@ -631,29 +631,67 @@ class TroostwijkScraper:
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
})
-        # Set up GraphQL API interception
+        # Set up COMPREHENSIVE resource interception (cache EVERYTHING)
+        resource_stats = {'cached': 0, 'fetched': 0, 'failed': 0}

         async def handle_response(response):
-            """Intercept GraphQL API responses"""
-            if 'graphql' in response.url and response.status == 200:
-                try:
-                    body = await response.body()
-                    body_text = body.decode('utf-8')
-                    # Try to extract lot_id from the request to key our cache
-                    # The URL pattern is typically: .../storefront/graphql
-                    # We'll store by lot_id which we extract from the response data
-                    data = json.loads(body_text)
-                    # Check if this is a lot details query
-                    if 'data' in data and 'lot' in data.get('data', {}):
-                        lot_data = data['data']['lot']
-                        lot_slug = lot_data.get('urlSlug', '')
-                        if lot_slug:
-                            self.intercepted_api_data[lot_slug] = body_text
-                            print(f" >> Intercepted API data for: {lot_slug}")
-                except Exception as e:
-                    # Silent fail - interception is opportunistic
-                    pass
+            """Intercept ALL resources and cache them"""
+            try:
+                url = response.url
+                status = response.status
+
+                # Get content type
+                headers = await response.all_headers()
+                content_type = headers.get('content-type', '').split(';')[0].strip()
+
+                # Determine if we should cache this resource
+                cacheable_types = [
+                    'text/html', 'text/css', 'text/javascript', 'application/javascript',
+                    'application/json', 'application/x-javascript', 'image/', 'font/',
+                    'application/font', 'video/', 'audio/', 'application/xml', 'text/xml',
+                    'image/svg+xml'
+                ]
+                should_cache = any(content_type.startswith(ct) for ct in cacheable_types)
+
+                if should_cache and status == 200:
+                    try:
+                        body = await response.body()
+
+                        # Save to resource cache
+                        self.cache.save_resource(
+                            url=url,
+                            content=body,
+                            content_type=content_type,
+                            status_code=status,
+                            headers=headers
+                        )
+                        resource_stats['cached'] += 1
+
+                        # Special handling for GraphQL responses
+                        if 'graphql' in url and 'application/json' in content_type:
+                            try:
+                                body_text = body.decode('utf-8')
+                                data = json.loads(body_text)
+                                # Check if this is a lot details query
+                                if 'data' in data and 'lot' in data.get('data', {}):
+                                    lot_data = data['data']['lot']
+                                    lot_slug = lot_data.get('urlSlug', '')
+                                    if lot_slug:
+                                        self.intercepted_api_data[lot_slug] = body_text
+                                        print(f" >> Intercepted GraphQL for: {lot_slug}")
+                            except:
+                                pass
+                    except Exception as e:
+                        resource_stats['failed'] += 1
+                else:
+                    resource_stats['fetched'] += 1
+            except Exception as e:
+                # Silent fail - interception is opportunistic
+                pass
page.on('response', handle_response)
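
The handler above only records responses; the diff does not show cached resources being served back. A sketch of how the same resource_cache could fulfill requests offline with Playwright's routing API, assuming the same self.cache and page objects as in the surrounding code (serve_from_cache is hypothetical, not part of this commit):

    async def serve_from_cache(route):
        # Hypothetical replay path: answer from resource_cache, fall back to the network.
        cached = self.cache.get_resource(route.request.url)
        if cached and cached['content']:
            await route.fulfill(
                status=cached['status_code'],
                content_type=cached['content_type'],
                body=cached['content'],
            )
        else:
            await route.continue_()

    await page.route('**/*', serve_from_cache)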
@@ -721,6 +759,16 @@ class TroostwijkScraper:
            results.append(page_data)

        await browser.close()

        # Print resource caching statistics
        print(f"\n{'='*60}")
        print(f"RESOURCE CACHE STATISTICS")
        print(f"{'='*60}")
        print(f" Cached: {resource_stats['cached']} resources")
        print(f" Fetched (not cached): {resource_stats['fetched']}")
        print(f" Failed: {resource_stats['failed']}")
        print(f"{'='*60}")

        return results
def export_to_files(self) -> Dict[str, str]:
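
For checking what the interception actually stored, the resource_cache table can be summarized directly in SQLite. A small standalone sketch (the database filename is assumed, matching the earlier example):

    import sqlite3

    with sqlite3.connect("troostwijk_cache.db") as conn:  # path assumed
        rows = conn.execute("""
            SELECT content_type, COUNT(*) AS n, SUM(size_bytes) AS total_bytes
            FROM resource_cache
            GROUP BY content_type
            ORDER BY total_bytes DESC
        """).fetchall()
        for content_type, n, total_bytes in rows:
            print(f"{content_type or '(unknown)':40} {n:6d} {total_bytes or 0:12d} bytes")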