performance
This commit is contained in:
118
src/scraper.py
118
src/scraper.py
@@ -214,10 +214,34 @@ class TroostwijkScraper:
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Fetch bidding data from GraphQL API
|
# Fetch bidding data from GraphQL API (skip if already complete in DB)
|
||||||
lot_id = page_data.get('lot_id')
|
lot_id = page_data.get('lot_id')
|
||||||
print(f" Fetching bidding data from API...")
|
|
||||||
bidding_data = await fetch_lot_bidding_data(lot_id)
|
# Check if lot already has complete API data in database
|
||||||
|
import sqlite3
|
||||||
|
conn = sqlite3.connect(self.cache.db_path)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
cursor.execute("""
|
||||||
|
SELECT followers_count, estimated_min_price, current_bid, bid_count
|
||||||
|
FROM lots WHERE lot_id = ?
|
||||||
|
""", (lot_id,))
|
||||||
|
existing = cursor.fetchone()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
# Skip API if we have complete data (followers_count indicates API was called)
|
||||||
|
skip_api = existing and existing[0] is not None
|
||||||
|
|
||||||
|
if skip_api:
|
||||||
|
print(f" Using cached API data")
|
||||||
|
bidding_data = None
|
||||||
|
# Load cached data for display
|
||||||
|
page_data['followers_count'] = existing[0]
|
||||||
|
page_data['estimated_min_price'] = existing[1]
|
||||||
|
page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
|
||||||
|
page_data['bid_count'] = existing[3] or 0
|
||||||
|
else:
|
||||||
|
print(f" Fetching bidding data from API...")
|
||||||
|
bidding_data = await fetch_lot_bidding_data(lot_id)
|
||||||
|
|
||||||
if bidding_data:
|
if bidding_data:
|
||||||
formatted_data = format_bid_data(bidding_data)
|
formatted_data = format_bid_data(bidding_data)
|
||||||
@@ -287,25 +311,47 @@ class TroostwijkScraper:
|
|||||||
if not lot_uuid:
|
if not lot_uuid:
|
||||||
lot_uuid = lot_details_lot.get('id')
|
lot_uuid = lot_details_lot.get('id')
|
||||||
|
|
||||||
# Fetch bid history for intelligence
|
# Fetch bid history for intelligence (skip if already in DB)
|
||||||
if lot_uuid and page_data.get('bid_count', 0) > 0:
|
if lot_uuid and page_data.get('bid_count', 0) > 0:
|
||||||
print(f" Fetching bid history...")
|
# Check if bid history already exists
|
||||||
bid_history = await fetch_bid_history(lot_uuid)
|
conn = sqlite3.connect(self.cache.db_path)
|
||||||
if bid_history:
|
cursor = conn.cursor()
|
||||||
bid_data = parse_bid_history(bid_history, lot_id)
|
cursor.execute("""
|
||||||
page_data.update(bid_data)
|
SELECT COUNT(*) FROM bid_history WHERE lot_id = ?
|
||||||
print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
|
""", (lot_id,))
|
||||||
|
has_history = cursor.fetchone()[0] > 0
|
||||||
|
conn.close()
|
||||||
|
|
||||||
# Save bid history to database
|
if not has_history:
|
||||||
self.cache.save_bid_history(lot_id, bid_data['bid_records'])
|
print(f" Fetching bid history...")
|
||||||
|
bid_history = await fetch_bid_history(lot_uuid)
|
||||||
|
if bid_history:
|
||||||
|
bid_data = parse_bid_history(bid_history, lot_id)
|
||||||
|
page_data.update(bid_data)
|
||||||
|
print(f" >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
|
||||||
|
self.cache.save_bid_history(lot_id, bid_data['bid_records'])
|
||||||
|
else:
|
||||||
|
print(f" Bid history cached")
|
||||||
|
|
||||||
# Fetch auction data for viewing/pickup times if we have auction_id
|
# Fetch auction data for viewing/pickup times (skip if already in DB)
|
||||||
auction_id = page_data.get('auction_id')
|
auction_id = page_data.get('auction_id')
|
||||||
if auction_id:
|
if auction_id:
|
||||||
auction_data = await fetch_auction_data(auction_id)
|
# Check if lot already has viewing/pickup times
|
||||||
if auction_data:
|
conn = sqlite3.connect(self.cache.db_path)
|
||||||
auction_times = format_auction_data(auction_data)
|
cursor = conn.cursor()
|
||||||
page_data.update(auction_times)
|
cursor.execute("""
|
||||||
|
SELECT viewing_time, pickup_date FROM lots WHERE lot_id = ?
|
||||||
|
""", (lot_id,))
|
||||||
|
times = cursor.fetchone()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
has_times = times and (times[0] or times[1])
|
||||||
|
|
||||||
|
if not has_times:
|
||||||
|
auction_data = await fetch_auction_data(auction_id)
|
||||||
|
if auction_data:
|
||||||
|
auction_times = format_auction_data(auction_data)
|
||||||
|
page_data.update(auction_times)
|
||||||
else:
|
else:
|
||||||
print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")
|
print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")
|
||||||
|
|
||||||
@@ -318,17 +364,35 @@ class TroostwijkScraper:
|
|||||||
print(f" Images: {len(images)}")
|
print(f" Images: {len(images)}")
|
||||||
|
|
||||||
if self.download_images:
|
if self.download_images:
|
||||||
# Download all images concurrently for this lot
|
# Check which images are already downloaded
|
||||||
import aiohttp
|
import sqlite3
|
||||||
async with aiohttp.ClientSession() as session:
|
conn = sqlite3.connect(self.cache.db_path)
|
||||||
download_tasks = [
|
cursor = conn.cursor()
|
||||||
self._download_image(session, img_url, page_data['lot_id'], i)
|
cursor.execute("""
|
||||||
for i, img_url in enumerate(images)
|
SELECT url FROM images
|
||||||
]
|
WHERE lot_id = ? AND downloaded = 1
|
||||||
results = await asyncio.gather(*download_tasks, return_exceptions=True)
|
""", (page_data['lot_id'],))
|
||||||
|
already_downloaded = {row[0] for row in cursor.fetchall()}
|
||||||
|
conn.close()
|
||||||
|
|
||||||
downloaded_count = sum(1 for r in results if r and not isinstance(r, Exception))
|
# Only download missing images
|
||||||
print(f" Downloaded: {downloaded_count}/{len(images)} images")
|
images_to_download = [
|
||||||
|
(i, img_url) for i, img_url in enumerate(images)
|
||||||
|
if img_url not in already_downloaded
|
||||||
|
]
|
||||||
|
|
||||||
|
if images_to_download:
|
||||||
|
import aiohttp
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
download_tasks = [
|
||||||
|
self._download_image(session, img_url, page_data['lot_id'], i)
|
||||||
|
for i, img_url in images_to_download
|
||||||
|
]
|
||||||
|
results = await asyncio.gather(*download_tasks, return_exceptions=True)
|
||||||
|
downloaded_count = sum(1 for r in results if r and not isinstance(r, Exception))
|
||||||
|
print(f" Downloaded: {downloaded_count}/{len(images_to_download)} new images")
|
||||||
|
else:
|
||||||
|
print(f" All {len(images)} images already cached")
|
||||||
|
|
||||||
return page_data
|
return page_data
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user