remove-field-scraped_at_timestamp
175 src/scraper.py
@@ -84,12 +84,15 @@ class TroostwijkScraper:
         self.last_request_time = time.time()
 
-    async def _get_page(self, page: Page, url: str, use_cache: bool = True, fast_mode: bool = False) -> Optional[str]:
+    async def _get_page(self, page: Page, url: str, use_cache: bool = True, fast_mode: bool = False) -> Optional[Dict]:
         """Get page content with caching and strict rate limiting
 
         Args:
             fast_mode: If True, use 'domcontentloaded' instead of 'networkidle' for faster loading
                        (useful for auction listing pages where we just need HTML structure)
+
+        Returns:
+            Dict with 'content' and 'from_cache' keys
         """
         if use_cache:
             cache_start = time.time()
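Note: with this change, callers get a dict rather than a bare HTML string, so every call site below unpacks it. A minimal usage sketch of the new contract (the `scraper` variable name is illustrative, not from the diff):

    result = await scraper._get_page(page, url, fast_mode=True)
    if result:
        html = result['content']        # page HTML, cached or freshly fetched
        cached = result['from_cache']   # True when served from the cache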
@@ -97,7 +100,7 @@ class TroostwijkScraper:
             if cached:
                 cache_time = (time.time() - cache_start) * 1000
                 print(f" CACHE HIT: {url} ({cache_time:.0f}ms)")
-                return cached['content']
+                return {'content': cached['content'], 'from_cache': True}
 
         await self._rate_limit()
 
@@ -118,7 +121,7 @@ class TroostwijkScraper:
             total_time = time.time() - fetch_start
             self.cache.set(url, content, 200)
             print(f" [Timing: goto={goto_time:.2f}s, total={total_time:.2f}s, mode={wait_strategy}]")
-            return content
+            return {'content': content, 'from_cache': False}
 
         except Exception as e:
             print(f" ERROR: {e}")
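Note: the choice of wait strategy sits outside these hunks; judging from the docstring and the `mode={wait_strategy}` timing log, the selection presumably looks like the sketch below (Playwright's `page.goto` accepts both values for `wait_until`):

    # assumed from the fast_mode docstring; the actual selection code is not shown in this diff
    wait_strategy = 'domcontentloaded' if fast_mode else 'networkidle'
    response = await page.goto(url, wait_until=wait_strategy)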
@@ -158,21 +161,22 @@ class TroostwijkScraper:
         print(f"{'='*60}")
 
         # Use fast mode - we only need HTML structure for link extraction
-        content = await self._get_page(page, url, fast_mode=True)
-        if not content:
+        result = await self._get_page(page, url, fast_mode=True)
+        if not result:
             return []
 
-        auction_urls = self._extract_auction_urls_from_listing(content)
+        auction_urls = self._extract_auction_urls_from_listing(result['content'])
         print(f"→ Found {len(auction_urls)} auction URLs")
         return auction_urls
 
     async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]:
         """Crawl an auction page and extract lot URLs"""
         # Use fast mode for auction pages - we only need the HTML structure, not all assets
-        content = await self._get_page(page, auction_url, fast_mode=True)
-        if not content:
+        result = await self._get_page(page, auction_url, fast_mode=True)
+        if not result:
             return []
 
+        content = result['content']
         parse_start = time.time()
         page_data = self.parser.parse_page(content, auction_url)
         parse_time = (time.time() - parse_start) * 1000
@@ -199,10 +203,12 @@ class TroostwijkScraper:
         page_id = self.parser.extract_lot_id(url)
         print(f"\n[PAGE {page_id}]")
 
-        content = await self._get_page(page, url)
-        if not content:
+        result = await self._get_page(page, url)
+        if not result:
             return None
 
+        content = result['content']
+        from_cache = result['from_cache']
         page_data = self.parser.parse_page(content, url)
         if not page_data:
             return None
@@ -245,34 +251,83 @@ class TroostwijkScraper:
                 except:
                     pass
 
-        # Fetch bidding data from GraphQL API (skip if already complete in DB)
+        # Fetch all API data concurrently (or use cache if HTML was cached)
         lot_id = page_data.get('lot_id')
         auction_id = page_data.get('auction_id')
 
-        # Check if lot already has complete API data in database
-        import sqlite3
-        conn = sqlite3.connect(self.cache.db_path)
-        cursor = conn.cursor()
-        cursor.execute("""
-            SELECT followers_count, estimated_min_price, current_bid, bid_count
-            FROM lots WHERE lot_id = ?
-        """, (lot_id,))
-        existing = cursor.fetchone()
-        conn.close()
+        if from_cache:
+            # Check if we have cached API data in database
+            import sqlite3
+            conn = sqlite3.connect(self.cache.db_path)
+            cursor = conn.cursor()
+            cursor.execute("""
+                SELECT followers_count, estimated_min_price, current_bid, bid_count
+                FROM lots WHERE lot_id = ?
+            """, (lot_id,))
+            existing = cursor.fetchone()
+            conn.close()
 
-        # Skip API if we have complete data (followers_count indicates API was called)
-        skip_api = existing and existing[0] is not None
-
-        if skip_api:
-            print(f" Using cached API data")
-            bidding_data = None
-            # Load cached data for display
-            page_data['followers_count'] = existing[0]
-            page_data['estimated_min_price'] = existing[1]
-            page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
-            page_data['bid_count'] = existing[3] or 0
+            # Use cached API data if available and not null
+            if existing and existing[0] is not None:
+                print(f" Using cached API data")
+                page_data['followers_count'] = existing[0]
+                page_data['estimated_min_price'] = existing[1]
+                page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
+                page_data['bid_count'] = existing[3] or 0
+                bidding_data = None
+                bid_history_data = None
+            else:
+                print(f" Fetching lot data from API (concurrent)...")
+                # Make concurrent API calls
+                api_tasks = [fetch_lot_bidding_data(lot_id)]
+                if auction_id:
+                    api_tasks.append(fetch_auction_data(auction_id))
+                results = await asyncio.gather(*api_tasks, return_exceptions=True)
+                bidding_data = results[0] if results and not isinstance(results[0], Exception) else None
+                bid_history_data = None  # Will fetch after we have lot_uuid
         else:
-            print(f" Fetching bidding data from API...")
-            bidding_data = await fetch_lot_bidding_data(lot_id)
+            # Fresh page fetch - make concurrent API calls for all data
+            print(f" Fetching lot data from API (concurrent)...")
+            api_tasks = [fetch_lot_bidding_data(lot_id)]
+            task_map = {'bidding': 0}  # Track which index corresponds to which task
+
+            # Add auction data fetch if we need viewing/pickup times
+            if auction_id:
+                conn = sqlite3.connect(self.cache.db_path)
+                cursor = conn.cursor()
+                cursor.execute("""
+                    SELECT viewing_time, pickup_date FROM lots WHERE lot_id = ?
+                """, (lot_id,))
+                times = cursor.fetchone()
+                conn.close()
+                has_times = times and (times[0] or times[1])
+
+                if not has_times:
+                    task_map['auction'] = len(api_tasks)
+                    api_tasks.append(fetch_auction_data(auction_id))
+
+            # Add bid history fetch if we have lot_uuid and expect bids
+            if lot_uuid:
+                task_map['bid_history'] = len(api_tasks)
+                api_tasks.append(fetch_bid_history(lot_uuid))
+
+            # Execute all API calls concurrently
+            results = await asyncio.gather(*api_tasks, return_exceptions=True)
+            bidding_data = results[task_map['bidding']] if results and not isinstance(results[task_map['bidding']], Exception) else None
+
+            # Process auction data if it was fetched
+            if 'auction' in task_map and len(results) > task_map['auction']:
+                auction_data = results[task_map['auction']]
+                if not isinstance(auction_data, Exception) and auction_data:
+                    auction_times = format_auction_data(auction_data)
+                    page_data.update(auction_times)
+
+            # Process bid history if it was fetched
+            bid_history_data = None
+            if 'bid_history' in task_map and len(results) > task_map['bid_history']:
+                bid_history_data = results[task_map['bid_history']]
+                if isinstance(bid_history_data, Exception):
+                    bid_history_data = None
 
         if bidding_data:
             formatted_data = format_bid_data(bidding_data)
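Note: the `task_map` above is a small pattern for keeping `asyncio.gather` results addressable when the task list is assembled conditionally. A self-contained sketch of the same technique, with placeholder coroutines rather than the scraper's real API helpers:

    import asyncio

    async def fetch_primary():      # stand-in for fetch_lot_bidding_data
        return 'primary'

    async def fetch_optional():     # stand-in for fetch_auction_data / fetch_bid_history
        return 'optional'

    async def main(include_optional: bool):
        tasks = [fetch_primary()]
        task_map = {'primary': 0}            # name -> index into results
        if include_optional:
            task_map['optional'] = len(tasks)
            tasks.append(fetch_optional())
        # return_exceptions=True keeps one failed call from cancelling the rest
        results = await asyncio.gather(*tasks, return_exceptions=True)
        primary = results[task_map['primary']]
        if isinstance(primary, Exception):
            primary = None
        optional = None
        if 'optional' in task_map and not isinstance(results[task_map['optional']], Exception):
            optional = results[task_map['optional']]
        return primary, optional

    print(asyncio.run(main(True)))   # -> ('primary', 'optional')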
@@ -342,9 +397,24 @@ class TroostwijkScraper:
         if not lot_uuid:
             lot_uuid = lot_details_lot.get('id')
 
-        # Fetch bid history for intelligence (skip if already in DB)
-        if lot_uuid and page_data.get('bid_count', 0) > 0:
-            # Check if bid history already exists
+        # Process bid history if we fetched it concurrently
+        if not from_cache and 'bid_history_data' in locals() and bid_history_data and page_data.get('bid_count', 0) > 0:
+            bid_data = parse_bid_history(bid_history_data, lot_id)
+            page_data.update(bid_data)
+            print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
+            self.cache.save_bid_history(lot_id, bid_data['bid_records'])
+        elif not from_cache and lot_uuid and page_data.get('bid_count', 0) > 0:
+            # Fallback: fetch bid history if we didn't get it in the concurrent batch
+            # (This happens when lot_uuid wasn't available before the first API call)
+            print(f" Fetching bid history...")
+            bid_history = await fetch_bid_history(lot_uuid)
+            if bid_history:
+                bid_data = parse_bid_history(bid_history, lot_id)
+                page_data.update(bid_data)
+                print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
+                self.cache.save_bid_history(lot_id, bid_data['bid_records'])
+        elif from_cache and page_data.get('bid_count', 0) > 0:
+            # Check if cached bid history exists
             conn = sqlite3.connect(self.cache.db_path)
             cursor = conn.cursor()
             cursor.execute("""
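Note: `parse_bid_history` is not part of this diff; the calling code only assumes it returns a dict with at least a `bid_velocity` float (bids/hour) and a `bid_records` list. A hypothetical sketch consistent with that usage, under an assumed input shape:

    from datetime import datetime

    def parse_bid_history(bid_history, lot_id):
        # hypothetical: assumes a list of bids with ISO-8601 'placed_at' timestamps,
        # newest first; the real parser lives elsewhere in the repo
        records = bid_history.get('bids', [])
        velocity = 0.0
        if len(records) >= 2:
            newest = datetime.fromisoformat(records[0]['placed_at'])
            oldest = datetime.fromisoformat(records[-1]['placed_at'])
            hours = max((newest - oldest).total_seconds() / 3600.0, 1e-9)
            velocity = len(records) / hours
        return {'bid_velocity': velocity, 'bid_records': records}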
@@ -352,37 +422,8 @@ class TroostwijkScraper:
             """, (lot_id,))
             has_history = cursor.fetchone()[0] > 0
             conn.close()
 
-            if not has_history:
-                print(f" Fetching bid history...")
-                bid_history = await fetch_bid_history(lot_uuid)
-                if bid_history:
-                    bid_data = parse_bid_history(bid_history, lot_id)
-                    page_data.update(bid_data)
-                    print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
-                    self.cache.save_bid_history(lot_id, bid_data['bid_records'])
-            else:
+            if has_history:
                 print(f" Bid history cached")
 
-        # Fetch auction data for viewing/pickup times (skip if already in DB)
-        auction_id = page_data.get('auction_id')
-        if auction_id:
-            # Check if lot already has viewing/pickup times
-            conn = sqlite3.connect(self.cache.db_path)
-            cursor = conn.cursor()
-            cursor.execute("""
-                SELECT viewing_time, pickup_date FROM lots WHERE lot_id = ?
-            """, (lot_id,))
-            times = cursor.fetchone()
-            conn.close()
-
-            has_times = times and (times[0] or times[1])
-
-            if not has_times:
-                auction_data = await fetch_auction_data(auction_id)
-                if auction_data:
-                    auction_times = format_auction_data(auction_data)
-                    page_data.update(auction_times)
         else:
             print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")