From 7e6629641f863eb136bbc505e87339b85d799c04 Mon Sep 17 00:00:00 2001
From: Tour
Date: Sun, 7 Dec 2025 08:56:08 +0100
Subject: [PATCH] performance

---
 src/scraper.py | 118 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 91 insertions(+), 27 deletions(-)

diff --git a/src/scraper.py b/src/scraper.py
index 5745415..97641ea 100644
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -214,10 +214,34 @@ class TroostwijkScraper:
             except:
                 pass
 
-        # Fetch bidding data from GraphQL API
+        # Fetch bidding data from GraphQL API (skip if already complete in DB)
         lot_id = page_data.get('lot_id')
-        print(f"  Fetching bidding data from API...")
-        bidding_data = await fetch_lot_bidding_data(lot_id)
+
+        # Check if lot already has complete API data in database
+        import sqlite3
+        conn = sqlite3.connect(self.cache.db_path)
+        cursor = conn.cursor()
+        cursor.execute("""
+            SELECT followers_count, estimated_min_price, current_bid, bid_count
+            FROM lots WHERE lot_id = ?
+        """, (lot_id,))
+        existing = cursor.fetchone()
+        conn.close()
+
+        # Skip API if we have complete data (followers_count indicates API was called)
+        skip_api = existing and existing[0] is not None
+
+        if skip_api:
+            print(f"  Using cached API data")
+            bidding_data = None
+            # Load cached data for display
+            page_data['followers_count'] = existing[0]
+            page_data['estimated_min_price'] = existing[1]
+            page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
+            page_data['bid_count'] = existing[3] or 0
+        else:
+            print(f"  Fetching bidding data from API...")
+            bidding_data = await fetch_lot_bidding_data(lot_id)
 
         if bidding_data:
             formatted_data = format_bid_data(bidding_data)
@@ -287,25 +311,47 @@ class TroostwijkScraper:
             if not lot_uuid:
                 lot_uuid = lot_details_lot.get('id')
 
-            # Fetch bid history for intelligence
+            # Fetch bid history for intelligence (skip if already in DB)
             if lot_uuid and page_data.get('bid_count', 0) > 0:
-                print(f"  Fetching bid history...")
-                bid_history = await fetch_bid_history(lot_uuid)
-                if bid_history:
-                    bid_data = parse_bid_history(bid_history, lot_id)
-                    page_data.update(bid_data)
-                    print(f"  >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
+                # Check if bid history already exists
+                conn = sqlite3.connect(self.cache.db_path)
+                cursor = conn.cursor()
+                cursor.execute("""
+                    SELECT COUNT(*) FROM bid_history WHERE lot_id = ?
+                """, (lot_id,))
+                has_history = cursor.fetchone()[0] > 0
+                conn.close()
 
-                    # Save bid history to database
-                    self.cache.save_bid_history(lot_id, bid_data['bid_records'])
+                if not has_history:
+                    print(f"  Fetching bid history...")
+                    bid_history = await fetch_bid_history(lot_uuid)
+                    if bid_history:
+                        bid_data = parse_bid_history(bid_history, lot_id)
+                        page_data.update(bid_data)
+                        print(f"  >> Bid velocity: {bid_data['bid_velocity']:.1f} bids/hour")
+                        self.cache.save_bid_history(lot_id, bid_data['bid_records'])
+                else:
+                    print(f"  Bid history cached")
 
-            # Fetch auction data for viewing/pickup times if we have auction_id
+            # Fetch auction data for viewing/pickup times (skip if already in DB)
             auction_id = page_data.get('auction_id')
             if auction_id:
-                auction_data = await fetch_auction_data(auction_id)
-                if auction_data:
-                    auction_times = format_auction_data(auction_data)
-                    page_data.update(auction_times)
+                # Check if lot already has viewing/pickup times
+                conn = sqlite3.connect(self.cache.db_path)
+                cursor = conn.cursor()
+                cursor.execute("""
+                    SELECT viewing_time, pickup_date FROM lots WHERE lot_id = ?
+                """, (lot_id,))
+                times = cursor.fetchone()
+                conn.close()
+
+                has_times = times and (times[0] or times[1])
+
+                if not has_times:
+                    auction_data = await fetch_auction_data(auction_id)
+                    if auction_data:
+                        auction_times = format_auction_data(auction_data)
+                        page_data.update(auction_times)
         else:
             print(f"  Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")
 
@@ -318,17 +364,35 @@ class TroostwijkScraper:
         print(f"  Images: {len(images)}")
 
         if self.download_images:
-            # Download all images concurrently for this lot
-            import aiohttp
-            async with aiohttp.ClientSession() as session:
-                download_tasks = [
-                    self._download_image(session, img_url, page_data['lot_id'], i)
-                    for i, img_url in enumerate(images)
-                ]
-                results = await asyncio.gather(*download_tasks, return_exceptions=True)
+            # Check which images are already downloaded
+            import sqlite3
+            conn = sqlite3.connect(self.cache.db_path)
+            cursor = conn.cursor()
+            cursor.execute("""
+                SELECT url FROM images
+                WHERE lot_id = ? AND downloaded = 1
+            """, (page_data['lot_id'],))
+            already_downloaded = {row[0] for row in cursor.fetchall()}
+            conn.close()
 
-            downloaded_count = sum(1 for r in results if r and not isinstance(r, Exception))
-            print(f"  Downloaded: {downloaded_count}/{len(images)} images")
+            # Only download missing images
+            images_to_download = [
+                (i, img_url) for i, img_url in enumerate(images)
+                if img_url not in already_downloaded
+            ]
+
+            if images_to_download:
+                import aiohttp
+                async with aiohttp.ClientSession() as session:
+                    download_tasks = [
+                        self._download_image(session, img_url, page_data['lot_id'], i)
+                        for i, img_url in images_to_download
+                    ]
+                    results = await asyncio.gather(*download_tasks, return_exceptions=True)
+                    downloaded_count = sum(1 for r in results if r and not isinstance(r, Exception))
+                    print(f"  Downloaded: {downloaded_count}/{len(images_to_download)} new images")
+            else:
+                print(f"  All {len(images)} images already cached")
 
         return page_data
 
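Reviewer note: the patch repeats the same sqlite3 connect/execute/close
sequence against self.cache.db_path in four places. Below is a minimal
sketch of a helper that could fold the three single-row lookups into one
call (the image query returns multiple rows and would need a fetchall
variant); the method name _db_fetchone is hypothetical, not part of the
patch:

    def _db_fetchone(self, query: str, params: tuple):
        # Hypothetical helper: runs one read-only query against the cache
        # DB and returns a single row, replacing the repeated
        # connect/execute/close blocks introduced in this patch.
        import sqlite3
        conn = sqlite3.connect(self.cache.db_path)
        try:
            return conn.execute(query, params).fetchone()
        finally:
            conn.close()

The first skip-check would then read, for example:

    existing = self._db_fetchone(
        "SELECT followers_count, estimated_min_price, current_bid, bid_count"
        " FROM lots WHERE lot_id = ?",
        (lot_id,),
    )
    skip_api = existing is not None and existing[0] is not None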