enrich data

This commit is contained in:
Tour
2025-12-09 02:05:46 +01:00
parent 06f63732b1
commit b0ee52b686
4 changed files with 685 additions and 53 deletions

View File

@@ -134,10 +134,13 @@ class CacheManager:
reserve_met INTEGER,
view_count INTEGER,
api_data_json TEXT,
next_scrape_at INTEGER,
scrape_priority INTEGER DEFAULT 0,
FOREIGN KEY (auction_id) REFERENCES auctions(auction_id)
)
""")
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_sale_id ON lots(sale_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_closing_time ON lots(closing_time)")
# Images table
conn.execute("""
@@ -200,14 +203,74 @@ class CacheManager:
"""Run database migrations to add new columns to existing tables"""
print("Checking for database migrations...")
# Check and add api_data_json column to lots table
# Check and add new columns to lots table
cursor = conn.execute("PRAGMA table_info(lots)")
lots_columns = {row[1] for row in cursor.fetchall()}
migrations_applied = False
if 'api_data_json' not in lots_columns:
print(" > Adding api_data_json column to lots table...")
conn.execute("ALTER TABLE lots ADD COLUMN api_data_json TEXT")
print(" * Migration complete")
migrations_applied = True
if 'next_scrape_at' not in lots_columns:
print(" > Adding next_scrape_at column to lots table...")
conn.execute("ALTER TABLE lots ADD COLUMN next_scrape_at INTEGER")
migrations_applied = True
if 'scrape_priority' not in lots_columns:
print(" > Adding scrape_priority column to lots table...")
conn.execute("ALTER TABLE lots ADD COLUMN scrape_priority INTEGER DEFAULT 0")
migrations_applied = True
# Check resource_cache table structure
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='resource_cache'")
resource_cache_exists = cursor.fetchone() is not None
if resource_cache_exists:
# Check if table has correct structure
cursor = conn.execute("PRAGMA table_info(resource_cache)")
resource_columns = {row[1] for row in cursor.fetchall()}
# Expected columns
expected_columns = {'url', 'content', 'content_type', 'status_code', 'headers', 'timestamp', 'size_bytes', 'local_path'}
if resource_columns != expected_columns:
print(" > Rebuilding resource_cache table with correct schema...")
# Count the rows that the rebuild will discard
cursor = conn.execute("SELECT COUNT(*) FROM resource_cache")
old_count = cursor.fetchone()[0]
print(f" (Dropping {old_count} cached resources; the cache will repopulate on the next crawl)")
# Drop and recreate with correct schema
conn.execute("DROP TABLE IF EXISTS resource_cache")
conn.execute("""
CREATE TABLE resource_cache (
url TEXT PRIMARY KEY,
content BLOB,
content_type TEXT,
status_code INTEGER,
headers TEXT,
timestamp REAL,
size_bytes INTEGER,
local_path TEXT
)
""")
conn.execute("CREATE INDEX idx_resource_timestamp ON resource_cache(timestamp)")
conn.execute("CREATE INDEX idx_resource_content_type ON resource_cache(content_type)")
migrations_applied = True
print(" * resource_cache table rebuilt")
# Create indexes after migrations (when columns exist)
try:
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_priority ON lots(scrape_priority DESC)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_next_scrape ON lots(next_scrape_at)")
except sqlite3.OperationalError:
pass # Index already exists or the column is missing
if migrations_applied:
print(" * Migrations complete")
else:
print(" * Database schema is up to date")
@@ -310,8 +373,8 @@ class CacheManager:
year_manufactured, condition_score, condition_description,
serial_number, manufacturer, damage_description,
followers_count, estimated_min_price, estimated_max_price, lot_condition, appearance,
scraped_at, api_data_json)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
scraped_at, api_data_json, next_scrape_at, scrape_priority)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
lot_data['lot_id'],
lot_data.get('auction_id', ''),
@@ -347,7 +410,9 @@ class CacheManager:
lot_data.get('lot_condition', ''),
lot_data.get('appearance', ''),
lot_data['scraped_at'],
lot_data.get('api_data_json')
lot_data.get('api_data_json'),
lot_data.get('next_scrape_at'),
lot_data.get('scrape_priority', 0)
))
conn.commit()
@@ -387,62 +452,38 @@ class CacheManager:
conn.commit()
def save_resource(self, url: str, content: bytes, content_type: str, status_code: int = 200,
headers: Optional[Dict] = None, local_path: Optional[str] = None):
"""Save a web resource (JS, CSS, image, font, etc.) to cache"""
headers: Optional[Dict] = None, local_path: Optional[str] = None, cache_key: Optional[str] = None):
"""Save a web resource (JS, CSS, image, font, etc.) to cache
Args:
cache_key: Optional composite key (url + body hash for POST requests)
"""
with sqlite3.connect(self.db_path) as conn:
headers_json = json.dumps(headers) if headers else None
size_bytes = len(content) if content else 0
# Use cache_key if provided, otherwise use url
key = cache_key if cache_key else url
conn.execute("""
INSERT OR REPLACE INTO resource_cache
(url, content, content_type, status_code, headers, timestamp, size_bytes, local_path)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""", (url, content, content_type, status_code, headers_json, time.time(), size_bytes, local_path))
""", (key, content, content_type, status_code, headers_json, time.time(), size_bytes, local_path))
conn.commit()
def get_resource(self, url: str) -> Optional[Dict]:
"""Get a cached resource"""
def get_resource(self, url: str, cache_key: Optional[str] = None) -> Optional[Dict]:
"""Get a cached resource
Args:
cache_key: Optional composite key to lookup
"""
with sqlite3.connect(self.db_path) as conn:
key = cache_key if cache_key else url
cursor = conn.execute("""
SELECT content, content_type, status_code, headers, timestamp, size_bytes, local_path
FROM resource_cache WHERE url = ?
""", (url,))
row = cursor.fetchone()
if row:
return {
'content': row[0],
'content_type': row[1],
'status_code': row[2],
'headers': json.loads(row[3]) if row[3] else {},
'timestamp': row[4],
'size_bytes': row[5],
'local_path': row[6],
'cached': True
}
return None
def save_resource(self, url: str, content: bytes, content_type: str, status_code: int = 200,
headers: Optional[Dict] = None, local_path: Optional[str] = None):
"""Save a web resource (JS, CSS, image, font, etc.) to cache"""
with sqlite3.connect(self.db_path) as conn:
headers_json = json.dumps(headers) if headers else None
size_bytes = len(content) if content else 0
conn.execute("""
INSERT OR REPLACE INTO resource_cache
(url, content, content_type, status_code, headers, timestamp, size_bytes, local_path)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""", (url, content, content_type, status_code, headers_json, time.time(), size_bytes, local_path))
conn.commit()
def get_resource(self, url: str) -> Optional[Dict]:
"""Get a cached resource"""
with sqlite3.connect(self.db_path) as conn:
cursor = conn.execute("""
SELECT content, content_type, status_code, headers, timestamp, size_bytes, local_path
FROM resource_cache WHERE url = ?
""", (url,))
""", (key,))
row = cursor.fetchone()
if row:

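The new cache_key argument above exists so that POST responses sharing one URL (notably GraphQL queries) do not overwrite each other in resource_cache. A minimal sketch of how such a composite key can be derived; make_cache_key is an illustrative helper, not something this commit defines.

import hashlib
from typing import Optional, Union

def make_cache_key(url: str, post_body: Optional[Union[str, bytes]] = None) -> str:
    """Plain URL for GET-style lookups; url#<md5 prefix> when a POST body is present."""
    if not post_body:
        return url
    data = post_body.encode() if isinstance(post_body, str) else post_body
    return f"{url}#{hashlib.md5(data).hexdigest()[:16]}"

# Two GraphQL queries to the same endpoint map to distinct cache entries:
print(make_cache_key("https://example.com/graphql", '{"query": "lotDetails"}'))
print(make_cache_key("https://example.com/graphql", '{"query": "bidHistory"}'))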
src/priority.py (new file, 171 additions)
View File

@@ -0,0 +1,171 @@
#!/usr/bin/env python3
"""
Priority calculation for intelligent scraping
"""
import time
from datetime import datetime
from typing import Optional, Tuple
def parse_closing_time(closing_time_str: Optional[str]) -> Optional[int]:
"""Parse closing time string to unix timestamp"""
if not closing_time_str:
return None
try:
# Try various date formats
formats = [
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%dT%H:%M:%S',
'%Y-%m-%d %H:%M',
'%d-%m-%Y %H:%M',
]
for fmt in formats:
try:
dt = datetime.strptime(closing_time_str, fmt)
return int(dt.timestamp())
except ValueError:
continue
return None
except (TypeError, ValueError):
return None
def calculate_ttl(closing_timestamp: int, current_time: Optional[int] = None) -> int:
"""
Calculate Time-To-Live (TTL) for cache based on time until closing
Strategy:
- Closing in > 7 days: Scrape once per day (TTL = 24 hours)
- Closing in 3-7 days: Scrape every 12 hours
- Closing in 1-3 days: Scrape every 6 hours
- Closing in 12-24 hours: Scrape every 3 hours
- Closing in 6-12 hours: Scrape every 2 hours
- Closing in 1-6 hours: Scrape every 30 minutes
- Closing in < 1 hour: Scrape every 10 minutes
- Already closed: TTL = infinite (no need to rescrape)
"""
if current_time is None:
current_time = int(time.time())
time_until_close = closing_timestamp - current_time
# Already closed - very low priority
if time_until_close <= 0:
return 999999999 # Effectively infinite TTL
# Convert to hours
hours_until_close = time_until_close / 3600
if hours_until_close > 168: # > 7 days
return 24 * 3600 # 24 hours
elif hours_until_close > 72: # 3-7 days
return 12 * 3600 # 12 hours
elif hours_until_close > 24: # 1-3 days
return 6 * 3600 # 6 hours
elif hours_until_close > 12: # 12-24 hours
return 3 * 3600 # 3 hours
elif hours_until_close > 6: # 6-12 hours
return 2 * 3600 # 2 hours
elif hours_until_close > 1: # 1-6 hours
return 30 * 60 # 30 minutes
else: # < 1 hour - URGENT!
return 10 * 60 # 10 minutes
def calculate_priority(
closing_time_str: Optional[str],
scraped_at: Optional[int],
current_time: Optional[int] = None
) -> Tuple[int, int]:
"""
Calculate scrape priority and next_scrape_at timestamp
Returns:
(priority, next_scrape_at)
Priority Scale:
10000+ = Never scraped (highest priority)
9000+ = Closing within 1 hour
8000+ = Closing within 6 hours
7000+ = Closing within 24 hours
6000+ = Closing within 3 days
5000+ = Closing within 7 days
1000+ = Due for re-scrape (TTL expired)
0-999 = Recently scraped, not due yet
-1000 = Already closed
"""
if current_time is None:
current_time = int(time.time())
# Never scraped = highest priority
if scraped_at is None or scraped_at == 0:
closing_timestamp = parse_closing_time(closing_time_str)
if closing_timestamp:
ttl = calculate_ttl(closing_timestamp, current_time)
next_scrape = current_time # Scrape immediately
time_until_close = closing_timestamp - current_time
# Boost priority based on urgency
if time_until_close <= 0:
return (10000, next_scrape) # Closed but never scraped
elif time_until_close < 3600:
return (19000, next_scrape) # < 1 hour - CRITICAL
elif time_until_close < 6 * 3600:
return (18000, next_scrape) # < 6 hours
elif time_until_close < 24 * 3600:
return (17000, next_scrape) # < 24 hours
elif time_until_close < 3 * 24 * 3600:
return (16000, next_scrape) # < 3 days
else:
return (15000, next_scrape) # > 3 days but never scraped
else:
return (15000, current_time) # No closing time, high priority anyway
# Already scraped - calculate based on TTL
closing_timestamp = parse_closing_time(closing_time_str)
if not closing_timestamp:
# No closing time - scrape once per day
ttl = 24 * 3600
next_scrape = scraped_at + ttl
time_until_rescrape = next_scrape - current_time
if time_until_rescrape <= 0:
return (1000, current_time) # Due for rescrape
else:
return (500, next_scrape) # Not due yet
# Has closing time - intelligent TTL
time_until_close = closing_timestamp - current_time
# Already closed
if time_until_close <= 0:
return (-1000, current_time + 999999999) # Very low priority; next scrape pushed far into the future
# Calculate TTL and next scrape time
ttl = calculate_ttl(closing_timestamp, current_time)
next_scrape = scraped_at + ttl
time_until_rescrape = next_scrape - current_time
# Priority based on urgency and TTL
if time_until_rescrape <= 0:
# Due for rescrape - urgency-based priority
if time_until_close < 3600:
return (9000, current_time) # < 1 hour - URGENT
elif time_until_close < 6 * 3600:
return (8000, current_time) # < 6 hours
elif time_until_close < 24 * 3600:
return (7000, current_time) # < 24 hours
elif time_until_close < 3 * 24 * 3600:
return (6000, current_time) # < 3 days
elif time_until_close < 7 * 24 * 3600:
return (5000, current_time) # < 7 days
else:
return (1000, current_time) # > 7 days, but due
else:
# Not due yet - low priority
return (max(0, 999 - int(time_until_close / 3600)), next_scrape) # Sooner-closing lots rank higher within the 0-999 band
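A quick way to sanity-check the two functions above, assuming priority.py is on the import path; the expected values follow directly from the tier boundaries documented in the docstrings.

import time
from priority import calculate_ttl, calculate_priority

now = int(time.time())

# Closing in 2 hours falls in the 1-6 hour tier, so the TTL is 30 minutes.
assert calculate_ttl(now + 2 * 3600, now) == 30 * 60

# Never scraped and closing in 30 minutes: critical priority, scrape immediately.
closing = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(now + 1800))
priority, next_scrape = calculate_priority(closing, scraped_at=None, current_time=now)
assert priority == 19000 and next_scrape == now

# Already closed and previously scraped: parked at the bottom of the queue.
closed = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(now - 3600))
priority, _ = calculate_priority(closed, scraped_at=now - 7200, current_time=now)
assert priority == -1000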

View File

@@ -10,7 +10,7 @@ import random
import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Set
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import urljoin
from playwright.async_api import async_playwright, Page
@@ -27,6 +27,7 @@ from graphql_client import (
extract_enriched_attributes
)
from bid_history_client import fetch_bid_history, parse_bid_history
from priority import calculate_priority, parse_closing_time
class TroostwijkScraper:
"""Main scraper class for Troostwijk Auctions"""
@@ -535,6 +536,17 @@ class TroostwijkScraper:
print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")
print(f" Location: {page_data.get('location', 'N/A')}")
# Calculate and store priority for next scrape
current_time = int(time.time())
priority, next_scrape = calculate_priority(
page_data.get('closing_time'),
current_time, # Just scraped now
current_time
)
page_data['scrape_priority'] = priority
page_data['next_scrape_at'] = next_scrape
self.cache.save_lot(page_data)
images = page_data.get('images', [])
@@ -575,6 +587,77 @@ class TroostwijkScraper:
return page_data
def _prioritize_lots(self, lot_urls: List[str]) -> List[Tuple[int, str, str]]:
"""
Prioritize lots based on closing time and scrape history
Returns list of (priority, url, description) tuples sorted by priority (highest first)
"""
import sqlite3
from datetime import datetime # needed for the scraped_at parsing below
prioritized = []
current_time = int(time.time())
conn = sqlite3.connect(self.cache.db_path)
cursor = conn.cursor()
for url in lot_urls:
# Extract lot_id from URL
lot_id = self.parser.extract_lot_id(url)
# Try to get existing data from database
cursor.execute("""
SELECT closing_time, scraped_at, scrape_priority, next_scrape_at
FROM lots WHERE lot_id = ? OR url = ?
""", (lot_id, url))
row = cursor.fetchone()
if row:
closing_time, scraped_at, existing_priority, next_scrape_at = row
# Parse scraped_at (it might be a string timestamp)
if isinstance(scraped_at, str):
try:
scraped_at = int(datetime.strptime(scraped_at, '%Y-%m-%d %H:%M:%S').timestamp())
except (ValueError, TypeError):
scraped_at = None
else:
closing_time = None
scraped_at = None
# Calculate priority
priority, next_scrape = calculate_priority(closing_time, scraped_at, current_time)
# Create description
if scraped_at is None:
desc = "Never scraped"
elif priority >= 15000:
desc = "Never scraped (high urgency)"
elif priority >= 9000:
desc = "URGENT: <1hr to close"
elif priority >= 8000:
desc = "High: <6hr to close"
elif priority >= 7000:
desc = "Medium: <24hr to close"
elif priority >= 5000:
desc = "Normal: <7d to close"
elif priority >= 1000:
desc = "Due for rescrape"
elif priority < 0:
desc = "Already closed"
else:
desc = f"Recently scraped"
prioritized.append((priority, url, desc))
conn.close()
# Sort by priority (highest first)
prioritized.sort(key=lambda x: x[0], reverse=True)
return prioritized
async def crawl_auctions(self, max_pages: int = MAX_PAGES) -> List[Dict]:
"""Main crawl function"""
if self.offline:
@@ -633,6 +716,24 @@ class TroostwijkScraper:
# Set up COMPREHENSIVE resource interception (cache EVERYTHING)
resource_stats = {'cached': 0, 'fetched': 0, 'failed': 0}
request_bodies = {} # Store POST request bodies by URL for cache key generation
async def handle_request(request):
"""Intercept requests to capture POST bodies for GraphQL"""
try:
if request.method == 'POST' and 'graphql' in request.url:
# Store the POST body
post_data = request.post_data
if post_data:
# Create hash of POST body for cache key
import hashlib
body_hash = hashlib.md5(post_data.encode() if isinstance(post_data, str) else post_data).hexdigest()[:16]
cache_key = f"{request.url}#{body_hash}"
request_bodies[request.url] = (cache_key, post_data)
except Exception:
pass # Never let POST-body capture break page loading
page.on('request', handle_request)
async def handle_response(response):
"""Intercept ALL resources and cache them"""
@@ -658,13 +759,19 @@ class TroostwijkScraper:
try:
body = await response.body()
# Determine cache key (use composite key for GraphQL POST requests)
cache_key = None
if 'graphql' in url and url in request_bodies:
cache_key, post_data = request_bodies[url]
# Save to resource cache
self.cache.save_resource(
url=url,
content=body,
content_type=content_type,
status_code=status,
headers=headers
headers=headers,
cache_key=cache_key
)
resource_stats['cached'] += 1
@@ -746,14 +853,24 @@ class TroostwijkScraper:
print(f"PHASE 2 COMPLETE: {len(all_lot_urls)} UNIQUE LOTS")
print(f"{'='*60}")
# Phase 3: Scrape each lot page
# Phase 2.5: Sort lots by priority (closing time + TTL)
print("\n" + "="*60)
print("PHASE 3: SCRAPING INDIVIDUAL LOT PAGES")
print("PHASE 2.5: CALCULATING SCRAPE PRIORITIES")
print("="*60)
sorted_lots = self._prioritize_lots(all_lot_urls)
print(f" > Sorted {len(sorted_lots)} lots by priority")
print(f" > Highest priority: {sorted_lots[0][2] if sorted_lots else 'N/A'}")
print(f" > Lowest priority: {sorted_lots[-1][2] if sorted_lots else 'N/A'}")
# Phase 3: Scrape each lot page (in priority order)
print("\n" + "="*60)
print("PHASE 3: SCRAPING LOTS (PRIORITY ORDER)")
print("="*60)
results = []
for i, lot_url in enumerate(all_lot_urls):
print(f"\n[{i+1:>3}/{len(all_lot_urls)}] ", end="")
for i, (priority, lot_url, priority_desc) in enumerate(sorted_lots):
print(f"\n[{i+1:>3}/{len(sorted_lots)}] [P:{priority}] ", end="")
page_data = await self.crawl_page(page, lot_url)
if page_data:
results.append(page_data)
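For context on the Phase 2.5 output, the ordering is a plain descending sort on the numeric priority in each (priority, url, description) tuple. The lots below are made-up examples, not real data.

lots = [
    (500,   "https://example.com/l/closes-next-week",  "Recently scraped"),
    (19000, "https://example.com/l/never-seen",        "Never scraped (high urgency)"),
    (9000,  "https://example.com/l/closing-within-1h", "URGENT: <1hr to close"),
    (-1000, "https://example.com/l/already-closed",    "Already closed"),
]
lots.sort(key=lambda x: x[0], reverse=True)  # same sort key as _prioritize_lots
for priority, url, desc in lots:
    print(f"[P:{priority:>6}] {desc:<30} {url}")
# Never-scraped lots are fetched first, then lots closing soon, closed lots last.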