enrich data

This commit is contained in:
Tour
2025-12-09 02:05:46 +01:00
parent 06f63732b1
commit b0ee52b686
4 changed files with 685 additions and 53 deletions

View File

@@ -134,10 +134,13 @@ class CacheManager:
reserve_met INTEGER,
view_count INTEGER,
api_data_json TEXT,
next_scrape_at INTEGER,
scrape_priority INTEGER DEFAULT 0,
FOREIGN KEY (auction_id) REFERENCES auctions(auction_id)
)
""")
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_sale_id ON lots(sale_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_closing_time ON lots(closing_time)")
# Images table
conn.execute("""
@@ -200,14 +203,74 @@ class CacheManager:
"""Run database migrations to add new columns to existing tables"""
print("Checking for database migrations...")
# Check and add api_data_json column to lots table
# Check and add new columns to lots table
cursor = conn.execute("PRAGMA table_info(lots)")
lots_columns = {row[1] for row in cursor.fetchall()}
migrations_applied = False
if 'api_data_json' not in lots_columns:
print(" > Adding api_data_json column to lots table...")
conn.execute("ALTER TABLE lots ADD COLUMN api_data_json TEXT")
print(" * Migration complete")
migrations_applied = True
if 'next_scrape_at' not in lots_columns:
print(" > Adding next_scrape_at column to lots table...")
conn.execute("ALTER TABLE lots ADD COLUMN next_scrape_at INTEGER")
migrations_applied = True
if 'scrape_priority' not in lots_columns:
print(" > Adding scrape_priority column to lots table...")
conn.execute("ALTER TABLE lots ADD COLUMN scrape_priority INTEGER DEFAULT 0")
migrations_applied = True
# Check resource_cache table structure
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='resource_cache'")
resource_cache_exists = cursor.fetchone() is not None
if resource_cache_exists:
# Check if table has correct structure
cursor = conn.execute("PRAGMA table_info(resource_cache)")
resource_columns = {row[1] for row in cursor.fetchall()}
# Expected columns
expected_columns = {'url', 'content', 'content_type', 'status_code', 'headers', 'timestamp', 'size_bytes', 'local_path'}
if resource_columns != expected_columns:
print(" > Rebuilding resource_cache table with correct schema...")
# Count the rows that the rebuild will discard
cursor = conn.execute("SELECT COUNT(*) FROM resource_cache")
old_count = cursor.fetchone()[0]
print(f" (Dropping {old_count} cached resources; the cache will repopulate on the next crawl)")
# Drop and recreate with correct schema
conn.execute("DROP TABLE IF EXISTS resource_cache")
conn.execute("""
CREATE TABLE resource_cache (
url TEXT PRIMARY KEY,
content BLOB,
content_type TEXT,
status_code INTEGER,
headers TEXT,
timestamp REAL,
size_bytes INTEGER,
local_path TEXT
)
""")
conn.execute("CREATE INDEX idx_resource_timestamp ON resource_cache(timestamp)")
conn.execute("CREATE INDEX idx_resource_content_type ON resource_cache(content_type)")
migrations_applied = True
print(" * resource_cache table rebuilt")
# Create indexes after migrations (when columns exist)
try:
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_priority ON lots(scrape_priority DESC)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_next_scrape ON lots(next_scrape_at)")
except sqlite3.OperationalError:
pass # Index already exists or the column is missing
if migrations_applied:
print(" * Migrations complete")
else:
print(" * Database schema is up to date")
@@ -310,8 +373,8 @@ class CacheManager:
year_manufactured, condition_score, condition_description,
serial_number, manufacturer, damage_description,
followers_count, estimated_min_price, estimated_max_price, lot_condition, appearance,
scraped_at, api_data_json)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
scraped_at, api_data_json, next_scrape_at, scrape_priority)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
lot_data['lot_id'],
lot_data.get('auction_id', ''),
@@ -347,7 +410,9 @@ class CacheManager:
lot_data.get('lot_condition', ''),
lot_data.get('appearance', ''),
lot_data['scraped_at'],
lot_data.get('api_data_json')
lot_data.get('api_data_json'),
lot_data.get('next_scrape_at'),
lot_data.get('scrape_priority', 0)
))
conn.commit()
@@ -387,62 +452,38 @@ class CacheManager:
conn.commit()
def save_resource(self, url: str, content: bytes, content_type: str, status_code: int = 200,
headers: Optional[Dict] = None, local_path: Optional[str] = None):
"""Save a web resource (JS, CSS, image, font, etc.) to cache"""
headers: Optional[Dict] = None, local_path: Optional[str] = None, cache_key: Optional[str] = None):
"""Save a web resource (JS, CSS, image, font, etc.) to cache
Args:
cache_key: Optional composite key (url + body hash for POST requests)
"""
with sqlite3.connect(self.db_path) as conn:
headers_json = json.dumps(headers) if headers else None
size_bytes = len(content) if content else 0
# Use cache_key if provided, otherwise use url
key = cache_key if cache_key else url
conn.execute("""
INSERT OR REPLACE INTO resource_cache
(url, content, content_type, status_code, headers, timestamp, size_bytes, local_path)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""", (url, content, content_type, status_code, headers_json, time.time(), size_bytes, local_path))
""", (key, content, content_type, status_code, headers_json, time.time(), size_bytes, local_path))
conn.commit()
def get_resource(self, url: str) -> Optional[Dict]:
"""Get a cached resource"""
def get_resource(self, url: str, cache_key: Optional[str] = None) -> Optional[Dict]:
"""Get a cached resource
Args:
cache_key: Optional composite key to lookup
"""
with sqlite3.connect(self.db_path) as conn:
key = cache_key if cache_key else url
cursor = conn.execute("""
SELECT content, content_type, status_code, headers, timestamp, size_bytes, local_path
FROM resource_cache WHERE url = ?
""", (url,))
row = cursor.fetchone()
if row:
return {
'content': row[0],
'content_type': row[1],
'status_code': row[2],
'headers': json.loads(row[3]) if row[3] else {},
'timestamp': row[4],
'size_bytes': row[5],
'local_path': row[6],
'cached': True
}
return None
def save_resource(self, url: str, content: bytes, content_type: str, status_code: int = 200,
headers: Optional[Dict] = None, local_path: Optional[str] = None):
"""Save a web resource (JS, CSS, image, font, etc.) to cache"""
with sqlite3.connect(self.db_path) as conn:
headers_json = json.dumps(headers) if headers else None
size_bytes = len(content) if content else 0
conn.execute("""
INSERT OR REPLACE INTO resource_cache
(url, content, content_type, status_code, headers, timestamp, size_bytes, local_path)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""", (url, content, content_type, status_code, headers_json, time.time(), size_bytes, local_path))
conn.commit()
def get_resource(self, url: str) -> Optional[Dict]:
"""Get a cached resource"""
with sqlite3.connect(self.db_path) as conn:
cursor = conn.execute("""
SELECT content, content_type, status_code, headers, timestamp, size_bytes, local_path
FROM resource_cache WHERE url = ?
""", (url,))
""", (key,))
row = cursor.fetchone()
if row:

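The new cache_key argument above exists so that POST responses sharing one URL (notably GraphQL queries) do not overwrite each other in resource_cache. A minimal sketch of how such a composite key can be derived; make_cache_key is an illustrative helper, not something this commit defines.

import hashlib
from typing import Optional, Union

def make_cache_key(url: str, post_body: Optional[Union[str, bytes]] = None) -> str:
    """Plain URL for GET-style lookups; url#<md5 prefix> when a POST body is present."""
    if not post_body:
        return url
    data = post_body.encode() if isinstance(post_body, str) else post_body
    return f"{url}#{hashlib.md5(data).hexdigest()[:16]}"

# Two GraphQL queries to the same endpoint map to distinct cache entries:
print(make_cache_key("https://example.com/graphql", '{"query": "lotDetails"}'))
print(make_cache_key("https://example.com/graphql", '{"query": "bidHistory"}'))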
src/priority.py (new file, 171 additions)
View File

@@ -0,0 +1,171 @@
#!/usr/bin/env python3
"""
Priority calculation for intelligent scraping
"""
import time
from datetime import datetime
from typing import Optional, Tuple
def parse_closing_time(closing_time_str: Optional[str]) -> Optional[int]:
"""Parse closing time string to unix timestamp"""
if not closing_time_str:
return None
try:
# Try various date formats
formats = [
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%dT%H:%M:%S',
'%Y-%m-%d %H:%M',
'%d-%m-%Y %H:%M',
]
for fmt in formats:
try:
dt = datetime.strptime(closing_time_str, fmt)
return int(dt.timestamp())
except ValueError:
continue
return None
except (TypeError, ValueError):
return None
def calculate_ttl(closing_timestamp: int, current_time: Optional[int] = None) -> int:
"""
Calculate Time-To-Live (TTL) for cache based on time until closing
Strategy:
- Closing in > 7 days: Scrape once per day (TTL = 24 hours)
- Closing in 3-7 days: Scrape every 12 hours
- Closing in 1-3 days: Scrape every 6 hours
- Closing in 12-24 hours: Scrape every 3 hours
- Closing in 6-12 hours: Scrape every 2 hours
- Closing in 1-6 hours: Scrape every 30 minutes
- Closing in < 1 hour: Scrape every 10 minutes
- Already closed: TTL = infinite (no need to rescrape)
"""
if current_time is None:
current_time = int(time.time())
time_until_close = closing_timestamp - current_time
# Already closed - very low priority
if time_until_close <= 0:
return 999999999 # Effectively infinite TTL
# Convert to hours
hours_until_close = time_until_close / 3600
if hours_until_close > 168: # > 7 days
return 24 * 3600 # 24 hours
elif hours_until_close > 72: # 3-7 days
return 12 * 3600 # 12 hours
elif hours_until_close > 24: # 1-3 days
return 6 * 3600 # 6 hours
elif hours_until_close > 12: # 12-24 hours
return 3 * 3600 # 3 hours
elif hours_until_close > 6: # 6-12 hours
return 2 * 3600 # 2 hours
elif hours_until_close > 1: # 1-6 hours
return 30 * 60 # 30 minutes
else: # < 1 hour - URGENT!
return 10 * 60 # 10 minutes
def calculate_priority(
closing_time_str: Optional[str],
scraped_at: Optional[int],
current_time: Optional[int] = None
) -> Tuple[int, int]:
"""
Calculate scrape priority and next_scrape_at timestamp
Returns:
(priority, next_scrape_at)
Priority Scale:
10000+ = Never scraped (highest priority)
9000+ = Closing within 1 hour
8000+ = Closing within 6 hours
7000+ = Closing within 24 hours
6000+ = Closing within 3 days
5000+ = Closing within 7 days
1000+ = Due for re-scrape (TTL expired)
0-999 = Recently scraped, not due yet
-1000 = Already closed
"""
if current_time is None:
current_time = int(time.time())
# Never scraped = highest priority
if scraped_at is None or scraped_at == 0:
closing_timestamp = parse_closing_time(closing_time_str)
if closing_timestamp:
ttl = calculate_ttl(closing_timestamp, current_time)
next_scrape = current_time # Scrape immediately
time_until_close = closing_timestamp - current_time
# Boost priority based on urgency
if time_until_close <= 0:
return (10000, next_scrape) # Closed but never scraped
elif time_until_close < 3600:
return (19000, next_scrape) # < 1 hour - CRITICAL
elif time_until_close < 6 * 3600:
return (18000, next_scrape) # < 6 hours
elif time_until_close < 24 * 3600:
return (17000, next_scrape) # < 24 hours
elif time_until_close < 3 * 24 * 3600:
return (16000, next_scrape) # < 3 days
else:
return (15000, next_scrape) # > 3 days but never scraped
else:
return (15000, current_time) # No closing time, high priority anyway
# Already scraped - calculate based on TTL
closing_timestamp = parse_closing_time(closing_time_str)
if not closing_timestamp:
# No closing time - scrape once per day
ttl = 24 * 3600
next_scrape = scraped_at + ttl
time_until_rescrape = next_scrape - current_time
if time_until_rescrape <= 0:
return (1000, current_time) # Due for rescrape
else:
return (500, next_scrape) # Not due yet
# Has closing time - intelligent TTL
time_until_close = closing_timestamp - current_time
# Already closed
if time_until_close <= 0:
return (-1000, current_time + 999999999) # Very low priority; next scrape pushed far into the future
# Calculate TTL and next scrape time
ttl = calculate_ttl(closing_timestamp, current_time)
next_scrape = scraped_at + ttl
time_until_rescrape = next_scrape - current_time
# Priority based on urgency and TTL
if time_until_rescrape <= 0:
# Due for rescrape - urgency-based priority
if time_until_close < 3600:
return (9000, current_time) # < 1 hour - URGENT
elif time_until_close < 6 * 3600:
return (8000, current_time) # < 6 hours
elif time_until_close < 24 * 3600:
return (7000, current_time) # < 24 hours
elif time_until_close < 3 * 24 * 3600:
return (6000, current_time) # < 3 days
elif time_until_close < 7 * 24 * 3600:
return (5000, current_time) # < 7 days
else:
return (1000, current_time) # > 7 days, but due
else:
# Not due yet - low priority
return (max(0, 999 - int(time_until_close / 3600)), next_scrape) # Sooner-closing lots rank higher within the 0-999 band
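A quick way to sanity-check the two functions above, assuming priority.py is on the import path; the expected values follow directly from the tier boundaries documented in the docstrings.

import time
from priority import calculate_ttl, calculate_priority

now = int(time.time())

# Closing in 2 hours falls in the 1-6 hour tier, so the TTL is 30 minutes.
assert calculate_ttl(now + 2 * 3600, now) == 30 * 60

# Never scraped and closing in 30 minutes: critical priority, scrape immediately.
closing = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(now + 1800))
priority, next_scrape = calculate_priority(closing, scraped_at=None, current_time=now)
assert priority == 19000 and next_scrape == now

# Already closed and previously scraped: parked at the bottom of the queue.
closed = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(now - 3600))
priority, _ = calculate_priority(closed, scraped_at=now - 7200, current_time=now)
assert priority == -1000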

View File

@@ -10,7 +10,7 @@ import random
import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Set
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import urljoin
from playwright.async_api import async_playwright, Page
@@ -27,6 +27,7 @@ from graphql_client import (
extract_enriched_attributes
)
from bid_history_client import fetch_bid_history, parse_bid_history
from priority import calculate_priority, parse_closing_time
class TroostwijkScraper:
"""Main scraper class for Troostwijk Auctions"""
@@ -535,6 +536,17 @@ class TroostwijkScraper:
print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")
print(f" Location: {page_data.get('location', 'N/A')}")
# Calculate and store priority for next scrape
current_time = int(time.time())
priority, next_scrape = calculate_priority(
page_data.get('closing_time'),
current_time, # Just scraped now
current_time
)
page_data['scrape_priority'] = priority
page_data['next_scrape_at'] = next_scrape
self.cache.save_lot(page_data)
images = page_data.get('images', [])
@@ -575,6 +587,77 @@ class TroostwijkScraper:
return page_data
def _prioritize_lots(self, lot_urls: List[str]) -> List[Tuple[int, str, str]]:
"""
Prioritize lots based on closing time and scrape history
Returns list of (priority, url, description) tuples sorted by priority (highest first)
"""
import sqlite3
from datetime import datetime # needed for the scraped_at parsing below
prioritized = []
current_time = int(time.time())
conn = sqlite3.connect(self.cache.db_path)
cursor = conn.cursor()
for url in lot_urls:
# Extract lot_id from URL
lot_id = self.parser.extract_lot_id(url)
# Try to get existing data from database
cursor.execute("""
SELECT closing_time, scraped_at, scrape_priority, next_scrape_at
FROM lots WHERE lot_id = ? OR url = ?
""", (lot_id, url))
row = cursor.fetchone()
if row:
closing_time, scraped_at, existing_priority, next_scrape_at = row
# Parse scraped_at (it might be a string timestamp)
if isinstance(scraped_at, str):
try:
scraped_at = int(datetime.strptime(scraped_at, '%Y-%m-%d %H:%M:%S').timestamp())
except (ValueError, TypeError):
scraped_at = None
else:
closing_time = None
scraped_at = None
# Calculate priority
priority, next_scrape = calculate_priority(closing_time, scraped_at, current_time)
# Create description
if scraped_at is None:
desc = "Never scraped"
elif priority >= 15000:
desc = "Never scraped (high urgency)"
elif priority >= 9000:
desc = "URGENT: <1hr to close"
elif priority >= 8000:
desc = "High: <6hr to close"
elif priority >= 7000:
desc = "Medium: <24hr to close"
elif priority >= 5000:
desc = "Normal: <7d to close"
elif priority >= 1000:
desc = "Due for rescrape"
elif priority < 0:
desc = "Already closed"
else:
desc = f"Recently scraped"
prioritized.append((priority, url, desc))
conn.close()
# Sort by priority (highest first)
prioritized.sort(key=lambda x: x[0], reverse=True)
return prioritized
async def crawl_auctions(self, max_pages: int = MAX_PAGES) -> List[Dict]:
"""Main crawl function"""
if self.offline:
@@ -633,6 +716,24 @@ class TroostwijkScraper:
# Set up COMPREHENSIVE resource interception (cache EVERYTHING)
resource_stats = {'cached': 0, 'fetched': 0, 'failed': 0}
request_bodies = {} # Store POST request bodies by URL for cache key generation
async def handle_request(request):
"""Intercept requests to capture POST bodies for GraphQL"""
try:
if request.method == 'POST' and 'graphql' in request.url:
# Store the POST body
post_data = request.post_data
if post_data:
# Create hash of POST body for cache key
import hashlib
body_hash = hashlib.md5(post_data.encode() if isinstance(post_data, str) else post_data).hexdigest()[:16]
cache_key = f"{request.url}#{body_hash}"
request_bodies[request.url] = (cache_key, post_data)
except Exception:
pass # Never let POST-body capture break page loading
page.on('request', handle_request)
async def handle_response(response):
"""Intercept ALL resources and cache them"""
@@ -658,13 +759,19 @@ class TroostwijkScraper:
try:
body = await response.body()
# Determine cache key (use composite key for GraphQL POST requests)
cache_key = None
if 'graphql' in url and url in request_bodies:
cache_key, post_data = request_bodies[url]
# Save to resource cache
self.cache.save_resource(
url=url,
content=body,
content_type=content_type,
status_code=status,
headers=headers
headers=headers,
cache_key=cache_key
)
resource_stats['cached'] += 1
@@ -746,14 +853,24 @@ class TroostwijkScraper:
print(f"PHASE 2 COMPLETE: {len(all_lot_urls)} UNIQUE LOTS")
print(f"{'='*60}")
# Phase 3: Scrape each lot page
# Phase 2.5: Sort lots by priority (closing time + TTL)
print("\n" + "="*60)
print("PHASE 3: SCRAPING INDIVIDUAL LOT PAGES")
print("PHASE 2.5: CALCULATING SCRAPE PRIORITIES")
print("="*60)
sorted_lots = self._prioritize_lots(all_lot_urls)
print(f" > Sorted {len(sorted_lots)} lots by priority")
print(f" > Highest priority: {sorted_lots[0][2] if sorted_lots else 'N/A'}")
print(f" > Lowest priority: {sorted_lots[-1][2] if sorted_lots else 'N/A'}")
# Phase 3: Scrape each lot page (in priority order)
print("\n" + "="*60)
print("PHASE 3: SCRAPING LOTS (PRIORITY ORDER)")
print("="*60)
results = []
for i, lot_url in enumerate(all_lot_urls):
print(f"\n[{i+1:>3}/{len(all_lot_urls)}] ", end="")
for i, (priority, lot_url, priority_desc) in enumerate(sorted_lots):
print(f"\n[{i+1:>3}/{len(sorted_lots)}] [P:{priority}] ", end="")
page_data = await self.crawl_page(page, lot_url)
if page_data:
results.append(page_data)
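For context on the Phase 2.5 output, the ordering is a plain descending sort on the numeric priority in each (priority, url, description) tuple. The lots below are made-up examples, not real data.

lots = [
    (500,   "https://example.com/l/closes-next-week",  "Recently scraped"),
    (19000, "https://example.com/l/never-seen",        "Never scraped (high urgency)"),
    (9000,  "https://example.com/l/closing-within-1h", "URGENT: <1hr to close"),
    (-1000, "https://example.com/l/already-closed",    "Already closed"),
]
lots.sort(key=lambda x: x[0], reverse=True)  # same sort key as _prioritize_lots
for priority, url, desc in lots:
    print(f"[P:{priority:>6}] {desc:<30} {url}")
# Never-scraped lots are fetched first, then lots closing soon, closed lots last.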