Integrate GraphQL bidding data, improve data correctness
src/cache.py (20 lines changed)
@@ -50,6 +50,8 @@ class CacheManager:
                     url TEXT UNIQUE,
                     title TEXT,
                     current_bid TEXT,
+                    starting_bid TEXT,
+                    minimum_bid TEXT,
                     bid_count INTEGER,
                     closing_time TEXT,
                     viewing_time TEXT,
@@ -72,6 +74,15 @@ class CacheManager:
                 )
             """)
 
+            # Add new columns to lots table if they don't exist
+            cursor = conn.execute("PRAGMA table_info(lots)")
+            columns = {row[1] for row in cursor.fetchall()}
+
+            if 'starting_bid' not in columns:
+                conn.execute("ALTER TABLE lots ADD COLUMN starting_bid TEXT")
+            if 'minimum_bid' not in columns:
+                conn.execute("ALTER TABLE lots ADD COLUMN minimum_bid TEXT")
+
             # Remove duplicates before creating unique index
             # Keep the row with the smallest id (first occurrence) for each (lot_id, url) pair
             conn.execute("""
@@ -165,15 +176,18 @@ class CacheManager:
         with sqlite3.connect(self.db_path) as conn:
             conn.execute("""
                 INSERT OR REPLACE INTO lots
-                (lot_id, auction_id, url, title, current_bid, bid_count, closing_time,
-                 viewing_time, pickup_date, location, description, category, scraped_at)
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                (lot_id, auction_id, url, title, current_bid, starting_bid, minimum_bid,
+                 bid_count, closing_time, viewing_time, pickup_date, location, description,
+                 category, scraped_at)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
             """, (
                 lot_data['lot_id'],
                 lot_data.get('auction_id', ''),
                 lot_data['url'],
                 lot_data['title'],
                 lot_data.get('current_bid', ''),
                 lot_data.get('starting_bid', ''),
                 lot_data.get('minimum_bid', ''),
                 lot_data.get('bid_count', 0),
                 lot_data.get('closing_time', ''),
                 lot_data.get('viewing_time', ''),
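The statement that follows the "Remove duplicates before creating unique index" comment is cut off in this view. Purely as a hedged sketch of what that cleanup step typically looks like, assuming the lots table has an integer id column (implied by the "smallest id" comment) and using an invented index name idx_lots_lot_id_url and placeholder db_path:

import sqlite3

db_path = "cache.db"  # placeholder; the class uses self.db_path

with sqlite3.connect(db_path) as conn:
    # Keep only the smallest-id row per (lot_id, url) pair (assumed dedup rule)
    conn.execute("""
        DELETE FROM lots
        WHERE id NOT IN (
            SELECT MIN(id) FROM lots GROUP BY lot_id, url
        )
    """)
    # Then enforce uniqueness going forward (index name is hypothetical)
    conn.execute("""
        CREATE UNIQUE INDEX IF NOT EXISTS idx_lots_lot_id_url
        ON lots (lot_id, url)
    """)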
src/graphql_client.py (new file, 138 lines)
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+"""
+GraphQL client for fetching lot bidding data from Troostwijk API
+"""
+import aiohttp
+from typing import Dict, Optional
+
+GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"
+
+LOT_BIDDING_QUERY = """
+query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
+  lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
+    estimatedFullPrice {
+      saleTerm
+    }
+    lot {
+      id
+      displayId
+      auctionId
+      currentBidAmount {
+        cents
+        currency
+      }
+      initialAmount {
+        cents
+        currency
+      }
+      nextMinimalBid {
+        cents
+        currency
+      }
+      nextBidStepInCents
+      vat
+      markupPercentage
+      biddingStatus
+      bidsCount
+      startDate
+      endDate
+      assignedExplicitly
+      minimumBidAmountMet
+    }
+  }
+}
+"""
+
+
+async def fetch_lot_bidding_data(lot_display_id: str) -> Optional[Dict]:
+    """
+    Fetch lot bidding data from GraphQL API
+
+    Args:
+        lot_display_id: The lot display ID (e.g., "A1-28505-5")
+
+    Returns:
+        Dict with bidding data or None if request fails
+    """
+    variables = {
+        "lotDisplayId": lot_display_id,
+        "locale": "nl",
+        "platform": "TWK"
+    }
+
+    payload = {
+        "query": LOT_BIDDING_QUERY,
+        "variables": variables
+    }
+
+    try:
+        async with aiohttp.ClientSession() as session:
+            async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
+                if response.status == 200:
+                    data = await response.json()
+                    lot_details = data.get('data', {}).get('lotDetails', {})
+
+                    if lot_details and lot_details.get('lot'):
+                        return lot_details
+                    return None
+                else:
+                    print(f" GraphQL API error: {response.status}")
+                    return None
+    except Exception as e:
+        print(f" GraphQL request failed: {e}")
+        return None
+
+
+def format_bid_data(lot_details: Dict) -> Dict:
+    """
+    Format GraphQL lot details into scraper format
+
+    Args:
+        lot_details: Raw lot details from GraphQL API
+
+    Returns:
+        Dict with formatted bid data
+    """
+    lot = lot_details.get('lot', {})
+
+    current_bid_amount = lot.get('currentBidAmount')
+    initial_amount = lot.get('initialAmount')
+    next_minimal_bid = lot.get('nextMinimalBid')
+
+    # Format currency amounts
+    def format_cents(amount_obj):
+        if not amount_obj or not isinstance(amount_obj, dict):
+            return None
+        cents = amount_obj.get('cents')
+        currency = amount_obj.get('currency', 'EUR')
+        if cents is None:
+            return None
+        return f"EUR {cents / 100:.2f}" if currency == 'EUR' else f"{currency} {cents / 100:.2f}"
+
+    current_bid = format_cents(current_bid_amount) or "No bids"
+    starting_bid = format_cents(initial_amount) or ""
+    minimum_bid = format_cents(next_minimal_bid) or ""
+
+    # Format timestamps (Unix timestamps in seconds)
+    start_date = lot.get('startDate')
+    end_date = lot.get('endDate')
+
+    def format_timestamp(ts):
+        if ts:
+            from datetime import datetime
+            try:
+                # Timestamps are already in seconds
+                return datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
+            except:
+                return ''
+        return ''
+
+    return {
+        'current_bid': current_bid,
+        'starting_bid': starting_bid,
+        'minimum_bid': minimum_bid,
+        'bid_count': lot.get('bidsCount', 0),
+        'closing_time': format_timestamp(end_date),
+        'bidding_status': lot.get('biddingStatus', ''),
+        'vat_percentage': lot.get('vat', 0),
+    }
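For reference, a minimal standalone usage sketch of the two new helpers (not part of the commit): it fetches the raw lotDetails payload and prints the normalized dict. The lot display ID is the example from the docstring, and driving the call with asyncio.run is an illustrative assumption.

import asyncio

from graphql_client import fetch_lot_bidding_data, format_bid_data


async def main():
    # "A1-28505-5" is the example display ID from the docstring above
    lot_details = await fetch_lot_bidding_data("A1-28505-5")
    if lot_details:
        # Normalized keys: current_bid, starting_bid, minimum_bid, bid_count, closing_time, ...
        print(format_bid_data(lot_details))
    else:
        print("No bidding data returned")


if __name__ == "__main__":
    asyncio.run(main())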
@@ -19,6 +19,7 @@ from config import (
 )
 from cache import CacheManager
 from parse import DataParser
+from graphql_client import fetch_lot_bidding_data, format_bid_data
 
 class TroostwijkScraper:
     """Main scraper class for Troostwijk Auctions"""
@@ -176,29 +177,44 @@ class TroostwijkScraper:
         self.visited_lots.add(url)
 
         if page_data.get('type') == 'auction':
-            print(f" → Type: AUCTION")
-            print(f" → Title: {page_data.get('title', 'N/A')[:60]}...")
-            print(f" → Location: {page_data.get('location', 'N/A')}")
-            print(f" → Lots: {page_data.get('lots_count', 0)}")
+            print(f" Type: AUCTION")
+            print(f" Title: {page_data.get('title', 'N/A')[:60]}...")
+            print(f" Location: {page_data.get('location', 'N/A')}")
+            print(f" Lots: {page_data.get('lots_count', 0)}")
             self.cache.save_auction(page_data)
 
         elif page_data.get('type') == 'lot':
-            print(f" → Type: LOT")
-            print(f" → Title: {page_data.get('title', 'N/A')[:60]}...")
-            print(f" → Bid: {page_data.get('current_bid', 'N/A')}")
-            print(f" → Location: {page_data.get('location', 'N/A')}")
+            print(f" Type: LOT")
+            print(f" Title: {page_data.get('title', 'N/A')[:60]}...")
+
+            # Fetch bidding data from GraphQL API
+            lot_id = page_data.get('lot_id')
+            print(f" Fetching bidding data from API...")
+            bidding_data = await fetch_lot_bidding_data(lot_id)
+
+            if bidding_data:
+                formatted_data = format_bid_data(bidding_data)
+                # Update page_data with real bidding info
+                page_data.update(formatted_data)
+                print(f" Bid: {page_data.get('current_bid', 'N/A')}")
+                print(f" Bid Count: {page_data.get('bid_count', 0)}")
+                print(f" Closing: {page_data.get('closing_time', 'N/A')}")
+            else:
+                print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")
+
+            print(f" Location: {page_data.get('location', 'N/A')}")
             self.cache.save_lot(page_data)
 
             images = page_data.get('images', [])
             if images:
                 self.cache.save_images(page_data['lot_id'], images)
-                print(f" → Images: {len(images)}")
+                print(f" Images: {len(images)}")
 
                 if self.download_images:
                     for i, img_url in enumerate(images):
                         local_path = await self._download_image(img_url, page_data['lot_id'], i)
                         if local_path:
-                            print(f" ✓ Downloaded: {Path(local_path).name}")
+                            print(f" Downloaded: {Path(local_path).name}")
 
         return page_data
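The lot branch above prefers API data over HTML-scraped values: when the GraphQL call succeeds, page_data.update(formatted_data) lets the API fields overwrite whatever the HTML parser produced, and on failure the HTML values are kept. A tiny illustration of that precedence, with invented values:

# Invented values, for illustration only
page_data = {"lot_id": "A1-28505-5", "current_bid": "EUR 100.00", "bid_count": 3}
formatted_data = {"current_bid": "EUR 125.00", "bid_count": 7,
                  "closing_time": "2025-01-01 12:00:00"}

page_data.update(formatted_data)  # API values win on overlapping keys
print(page_data["current_bid"], page_data["bid_count"])  # EUR 125.00 7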