remove-field-scraped_at_timestamp
src/scraper.py
@@ -84,12 +84,15 @@ class TroostwijkScraper:
             self.last_request_time = time.time()
 
-    async def _get_page(self, page: Page, url: str, use_cache: bool = True, fast_mode: bool = False) -> Optional[str]:
+    async def _get_page(self, page: Page, url: str, use_cache: bool = True, fast_mode: bool = False) -> Optional[Dict]:
         """Get page content with caching and strict rate limiting
 
         Args:
             fast_mode: If True, use 'domcontentloaded' instead of 'networkidle' for faster loading
                        (useful for auction listing pages where we just need HTML structure)
 
+        Returns:
+            Dict with 'content' and 'from_cache' keys
+
         """
         if use_cache:
             cache_start = time.time()
@@ -97,7 +100,7 @@ class TroostwijkScraper:
             if cached:
                 cache_time = (time.time() - cache_start) * 1000
                 print(f" CACHE HIT: {url} ({cache_time:.0f}ms)")
-                return cached['content']
+                return {'content': cached['content'], 'from_cache': True}
 
         await self._rate_limit()
 
@@ -118,7 +121,7 @@ class TroostwijkScraper:
             total_time = time.time() - fetch_start
             self.cache.set(url, content, 200)
             print(f" [Timing: goto={goto_time:.2f}s, total={total_time:.2f}s, mode={wait_strategy}]")
-            return content
+            return {'content': content, 'from_cache': False}
 
         except Exception as e:
             print(f" ERROR: {e}")
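Taken together, the hunks above change the `_get_page` contract from returning a bare HTML string to returning a small result dict. A minimal sketch of how a caller consumes the new shape (the `scraper`, `page`, and `url` objects here are placeholders, not code from this commit):

# Sketch only - assumes the new return shape {'content': str, 'from_cache': bool}
# and a hypothetical `scraper` carrying the patched _get_page().
async def load_lot_html(scraper, page, url):
    result = await scraper._get_page(page, url, fast_mode=True)
    if not result:
        return None
    if result['from_cache']:
        print("served from HTML cache")   # new flag lets later steps reuse cached API data
    return result['content']              # same HTML string callers parsed before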
@@ -158,21 +161,22 @@ class TroostwijkScraper:
         print(f"{'='*60}")
 
         # Use fast mode - we only need HTML structure for link extraction
-        content = await self._get_page(page, url, fast_mode=True)
-        if not content:
+        result = await self._get_page(page, url, fast_mode=True)
+        if not result:
             return []
 
-        auction_urls = self._extract_auction_urls_from_listing(content)
+        auction_urls = self._extract_auction_urls_from_listing(result['content'])
         print(f"→ Found {len(auction_urls)} auction URLs")
         return auction_urls
 
     async def crawl_auction_for_lots(self, page: Page, auction_url: str) -> List[str]:
         """Crawl an auction page and extract lot URLs"""
         # Use fast mode for auction pages - we only need the HTML structure, not all assets
-        content = await self._get_page(page, auction_url, fast_mode=True)
-        if not content:
+        result = await self._get_page(page, auction_url, fast_mode=True)
+        if not result:
             return []
 
+        content = result['content']
         parse_start = time.time()
         page_data = self.parser.parse_page(content, auction_url)
         parse_time = (time.time() - parse_start) * 1000
@@ -199,10 +203,12 @@ class TroostwijkScraper:
         page_id = self.parser.extract_lot_id(url)
         print(f"\n[PAGE {page_id}]")
 
-        content = await self._get_page(page, url)
-        if not content:
+        result = await self._get_page(page, url)
+        if not result:
             return None
 
+        content = result['content']
+        from_cache = result['from_cache']
         page_data = self.parser.parse_page(content, url)
         if not page_data:
             return None
@@ -245,34 +251,83 @@ class TroostwijkScraper:
         except:
             pass
 
-        # Fetch bidding data from GraphQL API (skip if already complete in DB)
+        # Fetch all API data concurrently (or use cache if HTML was cached)
         lot_id = page_data.get('lot_id')
+        auction_id = page_data.get('auction_id')
 
-        # Check if lot already has complete API data in database
-        import sqlite3
-        conn = sqlite3.connect(self.cache.db_path)
-        cursor = conn.cursor()
-        cursor.execute("""
-            SELECT followers_count, estimated_min_price, current_bid, bid_count
-            FROM lots WHERE lot_id = ?
-        """, (lot_id,))
-        existing = cursor.fetchone()
-        conn.close()
+        if from_cache:
+            # Check if we have cached API data in database
+            import sqlite3
+            conn = sqlite3.connect(self.cache.db_path)
+            cursor = conn.cursor()
+            cursor.execute("""
+                SELECT followers_count, estimated_min_price, current_bid, bid_count
+                FROM lots WHERE lot_id = ?
+            """, (lot_id,))
+            existing = cursor.fetchone()
+            conn.close()
 
-        # Skip API if we have complete data (followers_count indicates API was called)
-        skip_api = existing and existing[0] is not None
-
-        if skip_api:
-            print(f" Using cached API data")
-            bidding_data = None
-            # Load cached data for display
-            page_data['followers_count'] = existing[0]
-            page_data['estimated_min_price'] = existing[1]
-            page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
-            page_data['bid_count'] = existing[3] or 0
+            # Use cached API data if available and not null
+            if existing and existing[0] is not None:
+                print(f" Using cached API data")
+                page_data['followers_count'] = existing[0]
+                page_data['estimated_min_price'] = existing[1]
+                page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
+                page_data['bid_count'] = existing[3] or 0
+                bidding_data = None
+                bid_history_data = None
+            else:
+                print(f" Fetching lot data from API (concurrent)...")
+                # Make concurrent API calls
+                api_tasks = [fetch_lot_bidding_data(lot_id)]
+                if auction_id:
+                    api_tasks.append(fetch_auction_data(auction_id))
+                results = await asyncio.gather(*api_tasks, return_exceptions=True)
+                bidding_data = results[0] if results and not isinstance(results[0], Exception) else None
+                bid_history_data = None  # Will fetch after we have lot_uuid
         else:
-            print(f" Fetching bidding data from API...")
-            bidding_data = await fetch_lot_bidding_data(lot_id)
+            # Fresh page fetch - make concurrent API calls for all data
+            print(f" Fetching lot data from API (concurrent)...")
+            api_tasks = [fetch_lot_bidding_data(lot_id)]
+            task_map = {'bidding': 0}  # Track which index corresponds to which task
+
+            # Add auction data fetch if we need viewing/pickup times
+            if auction_id:
+                conn = sqlite3.connect(self.cache.db_path)
+                cursor = conn.cursor()
+                cursor.execute("""
+                    SELECT viewing_time, pickup_date FROM lots WHERE lot_id = ?
+                """, (lot_id,))
+                times = cursor.fetchone()
+                conn.close()
+                has_times = times and (times[0] or times[1])
+
+                if not has_times:
+                    task_map['auction'] = len(api_tasks)
+                    api_tasks.append(fetch_auction_data(auction_id))
+
+            # Add bid history fetch if we have lot_uuid and expect bids
+            if lot_uuid:
+                task_map['bid_history'] = len(api_tasks)
+                api_tasks.append(fetch_bid_history(lot_uuid))
+
+            # Execute all API calls concurrently
+            results = await asyncio.gather(*api_tasks, return_exceptions=True)
+            bidding_data = results[task_map['bidding']] if results and not isinstance(results[task_map['bidding']], Exception) else None
+
+            # Process auction data if it was fetched
+            if 'auction' in task_map and len(results) > task_map['auction']:
+                auction_data = results[task_map['auction']]
+                if not isinstance(auction_data, Exception) and auction_data:
+                    auction_times = format_auction_data(auction_data)
+                    page_data.update(auction_times)
+
+            # Process bid history if it was fetched
+            bid_history_data = None
+            if 'bid_history' in task_map and len(results) > task_map['bid_history']:
+                bid_history_data = results[task_map['bid_history']]
+                if isinstance(bid_history_data, Exception):
+                    bid_history_data = None
 
         if bidding_data:
             formatted_data = format_bid_data(bidding_data)
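The fresh-fetch branch above builds a variable-length list of coroutines, records each one's index in `task_map`, and awaits them with a single `asyncio.gather(..., return_exceptions=True)` call. A stripped-down, runnable sketch of that pattern with placeholder coroutines (not the project's real fetchers):

import asyncio

async def fetch_bidding():      # placeholder for fetch_lot_bidding_data(lot_id)
    return {"current_bid": 120}

async def fetch_auction():      # placeholder for fetch_auction_data(auction_id)
    return {"viewing_time": "Mon 10:00"}

async def main():
    tasks = [fetch_bidding()]
    task_map = {'bidding': 0}            # remember which index holds which result
    task_map['auction'] = len(tasks)     # only appended when the data is actually needed
    tasks.append(fetch_auction())

    results = await asyncio.gather(*tasks, return_exceptions=True)

    bidding = results[task_map['bidding']]
    if isinstance(bidding, Exception):   # return_exceptions=True turns failures into values
        bidding = None
    auction = results[task_map['auction']] if 'auction' in task_map else None
    print(bidding, auction)

asyncio.run(main())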
@@ -342,9 +397,24 @@ class TroostwijkScraper:
             if not lot_uuid:
                 lot_uuid = lot_details_lot.get('id')
 
-            # Fetch bid history for intelligence (skip if already in DB)
-            if lot_uuid and page_data.get('bid_count', 0) > 0:
-                # Check if bid history already exists
+            # Process bid history if we fetched it concurrently
+            if not from_cache and 'bid_history_data' in locals() and bid_history_data and page_data.get('bid_count', 0) > 0:
+                bid_data = parse_bid_history(bid_history_data, lot_id)
+                page_data.update(bid_data)
+                print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
+                self.cache.save_bid_history(lot_id, bid_data['bid_records'])
+            elif not from_cache and lot_uuid and page_data.get('bid_count', 0) > 0:
+                # Fallback: fetch bid history if we didn't get it in the concurrent batch
+                # (This happens when lot_uuid wasn't available before the first API call)
+                print(f" Fetching bid history...")
+                bid_history = await fetch_bid_history(lot_uuid)
+                if bid_history:
+                    bid_data = parse_bid_history(bid_history, lot_id)
+                    page_data.update(bid_data)
+                    print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
+                    self.cache.save_bid_history(lot_id, bid_data['bid_records'])
+            elif from_cache and page_data.get('bid_count', 0) > 0:
+                # Check if cached bid history exists
                 conn = sqlite3.connect(self.cache.db_path)
                 cursor = conn.cursor()
                 cursor.execute("""
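`parse_bid_history` condenses the raw bid records into metrics such as `bid_velocity`, printed above in bids/hour. Its implementation is not part of this diff; a hypothetical sketch of how such a velocity figure can be derived from timestamped bids:

from datetime import datetime

def bid_velocity(bid_times: list[str]) -> float:
    # Hypothetical helper, not the project's parse_bid_history:
    # bids per hour over the span between the first and last bid.
    if len(bid_times) < 2:
        return float(len(bid_times))
    stamps = sorted(datetime.fromisoformat(t) for t in bid_times)
    hours = (stamps[-1] - stamps[0]).total_seconds() / 3600
    return len(stamps) / hours if hours > 0 else float(len(stamps))

times = ['2024-05-01T10:00', '2024-05-01T12:00', '2024-05-01T13:00']
print(f" >> Bid velocity: {bid_velocity(times):.1f} bids/hour")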
@@ -352,37 +422,8 @@ class TroostwijkScraper:
                 """, (lot_id,))
                 has_history = cursor.fetchone()[0] > 0
                 conn.close()
-                if not has_history:
-                    print(f" Fetching bid history...")
-                    bid_history = await fetch_bid_history(lot_uuid)
-                    if bid_history:
-                        bid_data = parse_bid_history(bid_history, lot_id)
-                        page_data.update(bid_data)
-                        print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
-                        self.cache.save_bid_history(lot_id, bid_data['bid_records'])
-                else:
+                if has_history:
                     print(f" Bid history cached")
 
-            # Fetch auction data for viewing/pickup times (skip if already in DB)
-            auction_id = page_data.get('auction_id')
-            if auction_id:
-                # Check if lot already has viewing/pickup times
-                conn = sqlite3.connect(self.cache.db_path)
-                cursor = conn.cursor()
-                cursor.execute("""
-                    SELECT viewing_time, pickup_date FROM lots WHERE lot_id = ?
-                """, (lot_id,))
-                times = cursor.fetchone()
-                conn.close()
-
-                has_times = times and (times[0] or times[1])
-
-                if not has_times:
-                    auction_data = await fetch_auction_data(auction_id)
-                    if auction_data:
-                        auction_times = format_auction_data(auction_data)
-                        page_data.update(auction_times)
         else:
             print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")
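Both from_cache branches decide whether to hit the API by probing previously stored rows in the lots table. A minimal standalone sketch of that check, assuming the schema implied by the queries above (with followers_count as the sentinel for "API data already saved"):

import sqlite3

def has_cached_api_data(db_path: str, lot_id: str) -> bool:
    # Returns True when a lot row already carries API-derived fields,
    # mirroring the followers_count-is-not-NULL check in the diff.
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT followers_count FROM lots WHERE lot_id = ?", (lot_id,)
        ).fetchone()
    finally:
        conn.close()
    return row is not None and row[0] is not None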