performance
 src/scraper.py | 118
@@ -214,10 +214,34 @@ class TroostwijkScraper:
             except:
                 pass
 
-            # Fetch bidding data from GraphQL API
+            # Fetch bidding data from GraphQL API (skip if already complete in DB)
             lot_id = page_data.get('lot_id')
-            print(f" Fetching bidding data from API...")
-            bidding_data = await fetch_lot_bidding_data(lot_id)
+
+            # Check if lot already has complete API data in database
+            import sqlite3
+            conn = sqlite3.connect(self.cache.db_path)
+            cursor = conn.cursor()
+            cursor.execute("""
+                SELECT followers_count, estimated_min_price, current_bid, bid_count
+                FROM lots WHERE lot_id = ?
+            """, (lot_id,))
+            existing = cursor.fetchone()
+            conn.close()
+
+            # Skip API if we have complete data (followers_count indicates API was called)
+            skip_api = existing and existing[0] is not None
+
+            if skip_api:
+                print(f" Using cached API data")
+                bidding_data = None
+                # Load cached data for display
+                page_data['followers_count'] = existing[0]
+                page_data['estimated_min_price'] = existing[1]
+                page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
+                page_data['bid_count'] = existing[3] or 0
+            else:
+                print(f" Fetching bidding data from API...")
+                bidding_data = await fetch_lot_bidding_data(lot_id)
 
             if bidding_data:
                 formatted_data = format_bid_data(bidding_data)
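Note: the skip check above treats a non-NULL followers_count as the marker that the API was already called, and it opens a throwaway SQLite connection inline. The same connect/query/close round trip recurs in the two hunks below; a minimal sketch of factoring it into one helper (the name query_one is hypothetical, not part of this commit):

    import sqlite3

    def query_one(db_path, sql, params=()):
        # Open a short-lived connection, run one query, return the first row.
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            cursor.execute(sql, params)
            return cursor.fetchone()
        finally:
            conn.close()

With such a helper, the lookup above reduces to one call: query_one(self.cache.db_path, "SELECT followers_count, estimated_min_price, current_bid, bid_count FROM lots WHERE lot_id = ?", (lot_id,)).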
@@ -287,25 +311,47 @@ class TroostwijkScraper:
             if not lot_uuid:
                 lot_uuid = lot_details_lot.get('id')
 
-            # Fetch bid history for intelligence
+            # Fetch bid history for intelligence (skip if already in DB)
             if lot_uuid and page_data.get('bid_count', 0) > 0:
-                print(f" Fetching bid history...")
-                bid_history = await fetch_bid_history(lot_uuid)
-                if bid_history:
-                    bid_data = parse_bid_history(bid_history, lot_id)
-                    page_data.update(bid_data)
-                    print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
-
-                    # Save bid history to database
-                    self.cache.save_bid_history(lot_id, bid_data['bid_records'])
+                # Check if bid history already exists
+                conn = sqlite3.connect(self.cache.db_path)
+                cursor = conn.cursor()
+                cursor.execute("""
+                    SELECT COUNT(*) FROM bid_history WHERE lot_id = ?
+                """, (lot_id,))
+                has_history = cursor.fetchone()[0] > 0
+                conn.close()
+
+                if not has_history:
+                    print(f" Fetching bid history...")
+                    bid_history = await fetch_bid_history(lot_uuid)
+                    if bid_history:
+                        bid_data = parse_bid_history(bid_history, lot_id)
+                        page_data.update(bid_data)
+                        print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
+                        self.cache.save_bid_history(lot_id, bid_data['bid_records'])
+                else:
+                    print(f" Bid history cached")
 
-            # Fetch auction data for viewing/pickup times if we have auction_id
+            # Fetch auction data for viewing/pickup times (skip if already in DB)
             auction_id = page_data.get('auction_id')
             if auction_id:
-                auction_data = await fetch_auction_data(auction_id)
-                if auction_data:
-                    auction_times = format_auction_data(auction_data)
-                    page_data.update(auction_times)
+                # Check if lot already has viewing/pickup times
+                conn = sqlite3.connect(self.cache.db_path)
+                cursor = conn.cursor()
+                cursor.execute("""
+                    SELECT viewing_time, pickup_date FROM lots WHERE lot_id = ?
+                """, (lot_id,))
+                times = cursor.fetchone()
+                conn.close()
+
+                has_times = times and (times[0] or times[1])
+
+                if not has_times:
+                    auction_data = await fetch_auction_data(auction_id)
+                    if auction_data:
+                        auction_times = format_auction_data(auction_data)
+                        page_data.update(auction_times)
             else:
                 print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")
 
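Note: the existence check above uses SELECT COUNT(*), which counts every matching row just to answer a yes/no question. A hedged alternative sketch using the same table and column names as the diff (whether bid_history.lot_id carries an index is an assumption; without one, both forms scan the table):

    import sqlite3

    def has_bid_history(db_path, lot_id):
        # Hypothetical helper, not in the commit: existence test that stops early.
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            # LIMIT 1 lets SQLite stop at the first matching row instead of counting all.
            cursor.execute("SELECT 1 FROM bid_history WHERE lot_id = ? LIMIT 1", (lot_id,))
            return cursor.fetchone() is not None
        finally:
            conn.close()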
@@ -318,17 +364,35 @@ class TroostwijkScraper:
             print(f" Images: {len(images)}")
 
             if self.download_images:
-                # Download all images concurrently for this lot
-                import aiohttp
-                async with aiohttp.ClientSession() as session:
-                    download_tasks = [
-                        self._download_image(session, img_url, page_data['lot_id'], i)
-                        for i, img_url in enumerate(images)
-                    ]
-                    results = await asyncio.gather(*download_tasks, return_exceptions=True)
-
-                downloaded_count = sum(1 for r in results if r and not isinstance(r, Exception))
-                print(f" Downloaded: {downloaded_count}/{len(images)} images")
+                # Check which images are already downloaded
+                import sqlite3
+                conn = sqlite3.connect(self.cache.db_path)
+                cursor = conn.cursor()
+                cursor.execute("""
+                    SELECT url FROM images
+                    WHERE lot_id = ? AND downloaded = 1
+                """, (page_data['lot_id'],))
+                already_downloaded = {row[0] for row in cursor.fetchall()}
+                conn.close()
+
+                # Only download missing images
+                images_to_download = [
+                    (i, img_url) for i, img_url in enumerate(images)
+                    if img_url not in already_downloaded
+                ]
+
+                if images_to_download:
+                    import aiohttp
+                    async with aiohttp.ClientSession() as session:
+                        download_tasks = [
+                            self._download_image(session, img_url, page_data['lot_id'], i)
+                            for i, img_url in images_to_download
+                        ]
+                        results = await asyncio.gather(*download_tasks, return_exceptions=True)
+                    downloaded_count = sum(1 for r in results if r and not isinstance(r, Exception))
+                    print(f" Downloaded: {downloaded_count}/{len(images_to_download)} new images")
+                else:
+                    print(f" All {len(images)} images already cached")
 
             return page_data
 
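Note: the rebuilt list above keeps each URL paired with its original enumerate(images) index, so _download_image receives the same per-lot image number whether or not earlier images were skipped, and filenames stay stable across re-runs. A self-contained illustration with stand-in URLs (the example.com names are hypothetical):

    # Stand-in data for the filter added in the hunk above.
    images = ["https://example.com/a.jpg", "https://example.com/b.jpg"]
    already_downloaded = {"https://example.com/a.jpg"}

    images_to_download = [
        (i, img_url) for i, img_url in enumerate(images)
        if img_url not in already_downloaded
    ]
    # b.jpg keeps index 1 even though a.jpg was skipped.
    assert images_to_download == [(1, "https://example.com/b.jpg")]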