performance: skip cached API calls, bid history, and image downloads

commit 7e6629641f
parent c56d63d6fa
Author: Tour
Date:   2025-12-07 08:56:08 +01:00


@@ -214,10 +214,34 @@ class TroostwijkScraper:
             except:
                 pass
 
-        # Fetch bidding data from GraphQL API
+        # Fetch bidding data from GraphQL API (skip if already complete in DB)
         lot_id = page_data.get('lot_id')
-        print(f" Fetching bidding data from API...")
-        bidding_data = await fetch_lot_bidding_data(lot_id)
+
+        # Check if lot already has complete API data in database
+        import sqlite3
+        conn = sqlite3.connect(self.cache.db_path)
+        cursor = conn.cursor()
+        cursor.execute("""
+            SELECT followers_count, estimated_min_price, current_bid, bid_count
+            FROM lots WHERE lot_id = ?
+        """, (lot_id,))
+        existing = cursor.fetchone()
+        conn.close()
+
+        # Skip API if we have complete data (followers_count indicates API was called)
+        skip_api = existing and existing[0] is not None
+
+        if skip_api:
+            print(f" Using cached API data")
+            bidding_data = None
+            # Load cached data for display
+            page_data['followers_count'] = existing[0]
+            page_data['estimated_min_price'] = existing[1]
+            page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
+            page_data['bid_count'] = existing[3] or 0
+        else:
+            print(f" Fetching bidding data from API...")
+            bidding_data = await fetch_lot_bidding_data(lot_id)
 
         if bidding_data:
             formatted_data = format_bid_data(bidding_data)
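
Review note: this change introduces the same connect/read-one-row/close dance three times (bidding data, bid history, viewing times). A small helper would cut the duplication; below is a minimal sketch, assuming a hypothetical get_lot_fields function and reusing the column names from the query above — it is not part of the codebase:

    import sqlite3

    def get_lot_fields(db_path, lot_id, columns):
        """Return the requested columns for one lot, or None if the lot is unknown."""
        # columns come from trusted call sites, so string interpolation is safe here;
        # lot_id stays a bound parameter.
        query = f"SELECT {', '.join(columns)} FROM lots WHERE lot_id = ?"
        conn = sqlite3.connect(db_path)
        try:
            return conn.execute(query, (lot_id,)).fetchone()
        finally:
            conn.close()

    # Usage mirroring the skip_api check above (hypothetical helper):
    # row = get_lot_fields(self.cache.db_path, lot_id,
    #                      ["followers_count", "estimated_min_price", "current_bid", "bid_count"])
    # skip_api = row is not None and row[0] is not None
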
@@ -287,25 +311,47 @@ class TroostwijkScraper:
                     if not lot_uuid:
                         lot_uuid = lot_details_lot.get('id')
 
-                    # Fetch bid history for intelligence
+                    # Fetch bid history for intelligence (skip if already in DB)
                     if lot_uuid and page_data.get('bid_count', 0) > 0:
-                        print(f" Fetching bid history...")
-                        bid_history = await fetch_bid_history(lot_uuid)
-                        if bid_history:
-                            bid_data = parse_bid_history(bid_history, lot_id)
-                            page_data.update(bid_data)
-                            print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
-
-                            # Save bid history to database
-                            self.cache.save_bid_history(lot_id, bid_data['bid_records'])
+                        # Check if bid history already exists
+                        conn = sqlite3.connect(self.cache.db_path)
+                        cursor = conn.cursor()
+                        cursor.execute("""
+                            SELECT COUNT(*) FROM bid_history WHERE lot_id = ?
+                        """, (lot_id,))
+                        has_history = cursor.fetchone()[0] > 0
+                        conn.close()
+
+                        if not has_history:
+                            print(f" Fetching bid history...")
+                            bid_history = await fetch_bid_history(lot_uuid)
+                            if bid_history:
+                                bid_data = parse_bid_history(bid_history, lot_id)
+                                page_data.update(bid_data)
+                                print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
+                                self.cache.save_bid_history(lot_id, bid_data['bid_records'])
+                        else:
+                            print(f" Bid history cached")
 
-            # Fetch auction data for viewing/pickup times if we have auction_id
+            # Fetch auction data for viewing/pickup times (skip if already in DB)
             auction_id = page_data.get('auction_id')
             if auction_id:
-                auction_data = await fetch_auction_data(auction_id)
-                if auction_data:
-                    auction_times = format_auction_data(auction_data)
-                    page_data.update(auction_times)
+                # Check if lot already has viewing/pickup times
+                conn = sqlite3.connect(self.cache.db_path)
+                cursor = conn.cursor()
+                cursor.execute("""
+                    SELECT viewing_time, pickup_date FROM lots WHERE lot_id = ?
+                """, (lot_id,))
+                times = cursor.fetchone()
+                conn.close()
+
+                has_times = times and (times[0] or times[1])
+
+                if not has_times:
+                    auction_data = await fetch_auction_data(auction_id)
+                    if auction_data:
+                        auction_times = format_auction_data(auction_data)
+                        page_data.update(auction_times)
 
         else:
             print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")
@@ -318,17 +364,35 @@ class TroostwijkScraper:
         print(f" Images: {len(images)}")
 
         if self.download_images:
-            # Download all images concurrently for this lot
-            import aiohttp
-            async with aiohttp.ClientSession() as session:
-                download_tasks = [
-                    self._download_image(session, img_url, page_data['lot_id'], i)
-                    for i, img_url in enumerate(images)
-                ]
-                results = await asyncio.gather(*download_tasks, return_exceptions=True)
-
-            downloaded_count = sum(1 for r in results if r and not isinstance(r, Exception))
-            print(f" Downloaded: {downloaded_count}/{len(images)} images")
+            # Check which images are already downloaded
+            import sqlite3
+            conn = sqlite3.connect(self.cache.db_path)
+            cursor = conn.cursor()
+            cursor.execute("""
+                SELECT url FROM images
+                WHERE lot_id = ? AND downloaded = 1
+            """, (page_data['lot_id'],))
+            already_downloaded = {row[0] for row in cursor.fetchall()}
+            conn.close()
+
+            # Only download missing images
+            images_to_download = [
+                (i, img_url) for i, img_url in enumerate(images)
+                if img_url not in already_downloaded
+            ]
+
+            if images_to_download:
+                import aiohttp
+                async with aiohttp.ClientSession() as session:
+                    download_tasks = [
+                        self._download_image(session, img_url, page_data['lot_id'], i)
+                        for i, img_url in images_to_download
+                    ]
+                    results = await asyncio.gather(*download_tasks, return_exceptions=True)
+
+                downloaded_count = sum(1 for r in results if r and not isinstance(r, Exception))
+                print(f" Downloaded: {downloaded_count}/{len(images_to_download)} new images")
+            else:
+                print(f" All {len(images)} images already cached")
 
         return page_data
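
Review note: image downloads still open a fresh aiohttp.ClientSession per lot, so connection pooling and TLS session reuse are lost between lots. The usual aiohttp pattern is one session for the whole crawl; a minimal sketch, with an assumed per-lot entry point (the real method name is not in this diff):

    import aiohttp

    async def crawl_lots(scraper, lot_urls):
        # One session for the whole run: TCP connections and TLS handshakes
        # are pooled across lots instead of torn down after every lot.
        async with aiohttp.ClientSession() as session:
            for url in lot_urls:
                await scraper.crawl_lot(url, session)  # hypothetical signature
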