GraphQL integration, data correctness

This commit is contained in:
Tour
2025-12-07 00:25:25 +01:00
parent 8c5f6016ec
commit 71567fd965
17 changed files with 1037 additions and 13 deletions

View File

@@ -50,6 +50,8 @@ class CacheManager:
url TEXT UNIQUE,
title TEXT,
current_bid TEXT,
starting_bid TEXT,
minimum_bid TEXT,
bid_count INTEGER,
closing_time TEXT,
viewing_time TEXT,
@@ -72,6 +74,15 @@ class CacheManager:
)
""")
# Add new columns to lots table if they don't exist
cursor = conn.execute("PRAGMA table_info(lots)")
columns = {row[1] for row in cursor.fetchall()}
if 'starting_bid' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN starting_bid TEXT")
if 'minimum_bid' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN minimum_bid TEXT")
# Remove duplicates before creating unique index
# Keep the row with the smallest id (first occurrence) for each (lot_id, url) pair
conn.execute("""
@@ -165,15 +176,18 @@ class CacheManager:
with sqlite3.connect(self.db_path) as conn:
conn.execute("""
INSERT OR REPLACE INTO lots
(lot_id, auction_id, url, title, current_bid, bid_count, closing_time,
viewing_time, pickup_date, location, description, category, scraped_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
(lot_id, auction_id, url, title, current_bid, starting_bid, minimum_bid,
bid_count, closing_time, viewing_time, pickup_date, location, description,
category, scraped_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
lot_data['lot_id'],
lot_data.get('auction_id', ''),
lot_data['url'],
lot_data['title'],
lot_data.get('current_bid', ''),
lot_data.get('starting_bid', ''),
lot_data.get('minimum_bid', ''),
lot_data.get('bid_count', 0),
lot_data.get('closing_time', ''),
lot_data.get('viewing_time', ''),

138
src/graphql_client.py Normal file
View File

@@ -0,0 +1,138 @@
#!/usr/bin/env python3
"""
GraphQL client for fetching lot bidding data from Troostwijk API
"""
import aiohttp
from typing import Dict, Optional
GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"
LOT_BIDDING_QUERY = """
query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
estimatedFullPrice {
saleTerm
}
lot {
id
displayId
auctionId
currentBidAmount {
cents
currency
}
initialAmount {
cents
currency
}
nextMinimalBid {
cents
currency
}
nextBidStepInCents
vat
markupPercentage
biddingStatus
bidsCount
startDate
endDate
assignedExplicitly
minimumBidAmountMet
}
}
}
"""
async def fetch_lot_bidding_data(lot_display_id: str) -> Optional[Dict]:
    """
    Fetch lot bidding data from the Troostwijk GraphQL API.

    Args:
        lot_display_id: The lot display ID (e.g., "A1-28505-5")

    Returns:
        The ``lotDetails`` dict from the API response, or None when the
        request fails, times out, or the response carries no ``lot`` data.
    """
    payload = {
        "query": LOT_BIDDING_QUERY,
        "variables": {
            "lotDisplayId": lot_display_id,
            "locale": "nl",       # Dutch locale, matching the site default
            "platform": "TWK",    # Troostwijk platform identifier
        },
    }
    # Use an explicit ClientTimeout object: passing a bare int for
    # `timeout` is deprecated in aiohttp.
    timeout = aiohttp.ClientTimeout(total=30)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.post(GRAPHQL_ENDPOINT, json=payload) as response:
                if response.status != 200:
                    print(f" GraphQL API error: {response.status}")
                    return None
                data = await response.json()
                # `or {}` guards GraphQL error responses where "data" is
                # present but explicitly null (plain .get(..., {}) would
                # return None and the chained .get() would raise).
                lot_details = (data.get('data') or {}).get('lotDetails') or {}
                if lot_details.get('lot'):
                    return lot_details
                return None
    except Exception as e:
        # Broad catch is deliberate: any network/JSON failure degrades to
        # "no API data" and the caller falls back to HTML-scraped values.
        print(f" GraphQL request failed: {e}")
        return None
def format_bid_data(lot_details: Dict) -> Dict:
    """
    Format GraphQL lot details into the scraper's flat dict format.

    Args:
        lot_details: Raw ``lotDetails`` payload from the GraphQL API.

    Returns:
        Dict with formatted bid data: "<CUR> <amount>" strings for the
        bid fields, a local-time closing timestamp string, and the raw
        bid count / status / VAT fields.
    """
    from datetime import datetime

    # `or {}` guards an explicit `'lot': None` in the payload, which the
    # previous `.get('lot', {})` default did not cover.
    lot = lot_details.get('lot') or {}

    def format_cents(amount_obj):
        """Render a {'cents': int, 'currency': str} object as 'CUR 12.34'."""
        if not amount_obj or not isinstance(amount_obj, dict):
            return None
        cents = amount_obj.get('cents')
        if cents is None:
            return None
        currency = amount_obj.get('currency', 'EUR')
        # EUR and non-EUR amounts share the same "<currency> <amount>"
        # rendering (the old EUR-specific branch produced identical text).
        return f"{currency} {cents / 100:.2f}"

    def format_timestamp(ts):
        """Unix seconds -> 'YYYY-MM-DD HH:MM:SS' local time; '' if absent/invalid."""
        if not ts:
            return ''
        try:
            # Timestamps are already in seconds.
            return datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
        except (OverflowError, OSError, ValueError, TypeError):
            # Out-of-range or non-numeric timestamps degrade to '' instead
            # of the old bare `except:` that also swallowed KeyboardInterrupt.
            return ''

    return {
        'current_bid': format_cents(lot.get('currentBidAmount')) or "No bids",
        'starting_bid': format_cents(lot.get('initialAmount')) or "",
        'minimum_bid': format_cents(lot.get('nextMinimalBid')) or "",
        'bid_count': lot.get('bidsCount', 0),
        'closing_time': format_timestamp(lot.get('endDate')),
        'bidding_status': lot.get('biddingStatus', ''),
        'vat_percentage': lot.get('vat', 0),
    }

View File

@@ -19,6 +19,7 @@ from config import (
)
from cache import CacheManager
from parse import DataParser
from graphql_client import fetch_lot_bidding_data, format_bid_data
class TroostwijkScraper:
"""Main scraper class for Troostwijk Auctions"""
@@ -176,29 +177,44 @@ class TroostwijkScraper:
self.visited_lots.add(url)
if page_data.get('type') == 'auction':
print(f" Type: AUCTION")
print(f" Title: {page_data.get('title', 'N/A')[:60]}...")
print(f" Location: {page_data.get('location', 'N/A')}")
print(f" Lots: {page_data.get('lots_count', 0)}")
print(f" Type: AUCTION")
print(f" Title: {page_data.get('title', 'N/A')[:60]}...")
print(f" Location: {page_data.get('location', 'N/A')}")
print(f" Lots: {page_data.get('lots_count', 0)}")
self.cache.save_auction(page_data)
elif page_data.get('type') == 'lot':
print(f" Type: LOT")
print(f" Title: {page_data.get('title', 'N/A')[:60]}...")
print(f" → Bid: {page_data.get('current_bid', 'N/A')}")
print(f" → Location: {page_data.get('location', 'N/A')}")
print(f" Type: LOT")
print(f" Title: {page_data.get('title', 'N/A')[:60]}...")
# Fetch bidding data from GraphQL API
lot_id = page_data.get('lot_id')
print(f" Fetching bidding data from API...")
bidding_data = await fetch_lot_bidding_data(lot_id)
if bidding_data:
formatted_data = format_bid_data(bidding_data)
# Update page_data with real bidding info
page_data.update(formatted_data)
print(f" Bid: {page_data.get('current_bid', 'N/A')}")
print(f" Bid Count: {page_data.get('bid_count', 0)}")
print(f" Closing: {page_data.get('closing_time', 'N/A')}")
else:
print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")
print(f" Location: {page_data.get('location', 'N/A')}")
self.cache.save_lot(page_data)
images = page_data.get('images', [])
if images:
self.cache.save_images(page_data['lot_id'], images)
print(f" Images: {len(images)}")
print(f" Images: {len(images)}")
if self.download_images:
for i, img_url in enumerate(images):
local_path = await self._download_image(img_url, page_data['lot_id'], i)
if local_path:
print(f" Downloaded: {Path(local_path).name}")
print(f" Downloaded: {Path(local_path).name}")
return page_data