Enrich data

This commit is contained in:
Tour
2025-12-07 01:26:48 +01:00
parent bb7f4bbe9d
commit d09ee5574f
14 changed files with 1221 additions and 7 deletions

121
src/bid_history_client.py Normal file
View File

@@ -0,0 +1,121 @@
#!/usr/bin/env python3
"""
Client for fetching bid history from Troostwijk REST API
"""
import aiohttp
from typing import Dict, List, Optional
from datetime import datetime
BID_HISTORY_ENDPOINT = "https://shared-api.tbauctions.com/bidmanagement/lots/{lot_uuid}/bidding-history"
async def fetch_bid_history(lot_uuid: str, page_size: int = 100) -> Optional[List[Dict]]:
    """
    Fetch the complete bid history for a lot, paging through the REST API.

    Args:
        lot_uuid: The lot UUID (from GraphQL response)
        page_size: Number of bids per page

    Returns:
        List of bid dictionaries, or None when nothing could be fetched.
        If a later page fails, the bids collected so far are returned.
    """
    url = BID_HISTORY_ENDPOINT.format(lot_uuid=lot_uuid)
    collected: List[Dict] = []
    page = 1
    try:
        async with aiohttp.ClientSession() as session:
            while True:
                query = {"pageNumber": page, "pageSize": page_size}
                async with session.get(url, params=query, timeout=30) as response:
                    if response.status != 200:
                        # Total failure on the first page; partial results after.
                        return None if page == 1 else collected
                    payload = await response.json()
                    collected.extend(payload.get('results', []))
                    # 'hasNext' drives pagination; stop when the API says so.
                    if not payload.get('hasNext', False):
                        break
                    page += 1
        return collected or None
    except Exception as e:
        print(f" Bid history fetch failed: {e}")
        return None
def parse_bid_history(bid_history: List[Dict], lot_id: str) -> Dict:
    """
    Parse raw bid history into database-ready records plus summary metrics.

    Args:
        bid_history: Raw bid history entries from the REST API.
        lot_id: The lot display ID (e.g., "A1-28505-5").

    Returns:
        Dict with:
            bid_records: list of flat dicts ready for DB insertion
            first_bid_time / last_bid_time: 'YYYY-MM-DD HH:MM:SS' or None
            bid_velocity: bids per hour over the first-to-last span, 2 decimals
    """
    if not bid_history:
        return {
            'bid_records': [],
            'first_bid_time': None,
            'last_bid_time': None,
            'bid_velocity': 0.0
        }

    bid_records = []
    for bid in bid_history:
        # Amounts arrive in cents; normalize to whole currency units.
        cents = bid.get('currentBid', {}).get('cents', 0)
        bid_records.append({
            'lot_id': lot_id,
            'bid_amount': cents / 100.0 if cents else 0.0,
            'bid_time': bid.get('createdAt', ''),
            'is_autobid': bid.get('autoBid', False),
            'bidder_id': bid.get('buyerId', ''),
            'bidder_number': bid.get('buyerNumber', 0)
        })

    bid_times = []
    for record in bid_records:
        try:
            # ISO timestamp like "2025-12-04T17:17:45.694698Z"; fromisoformat
            # (pre-3.11) rejects a trailing 'Z', so swap in an explicit offset.
            bid_times.append(
                datetime.fromisoformat(record['bid_time'].replace('Z', '+00:00'))
            )
        except (ValueError, TypeError, AttributeError):
            # Skip missing/unparseable timestamps rather than failing the lot.
            pass

    first_bid_time = None
    last_bid_time = None
    bid_velocity = 0.0
    if bid_times:
        bid_times.sort()
        first_bid_time = bid_times[0].strftime('%Y-%m-%d %H:%M:%S')
        last_bid_time = bid_times[-1].strftime('%Y-%m-%d %H:%M:%S')
        if len(bid_times) > 1:
            # Velocity = number of bids divided by the span in hours.
            span_hours = (bid_times[-1] - bid_times[0]).total_seconds() / 3600
            if span_hours > 0:
                bid_velocity = len(bid_times) / span_hours

    return {
        'bid_records': bid_records,
        'first_bid_time': first_bid_time,
        'last_bid_time': last_bid_time,
        'bid_velocity': round(bid_velocity, 2)
    }

View File

@@ -82,6 +82,63 @@ class CacheManager:
conn.execute("ALTER TABLE lots ADD COLUMN starting_bid TEXT")
if 'minimum_bid' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN minimum_bid TEXT")
if 'status' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN status TEXT")
if 'brand' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN brand TEXT")
if 'model' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN model TEXT")
if 'attributes_json' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN attributes_json TEXT")
# Bidding intelligence fields
if 'first_bid_time' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN first_bid_time TEXT")
if 'last_bid_time' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN last_bid_time TEXT")
if 'bid_velocity' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN bid_velocity REAL")
if 'bid_increment' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN bid_increment REAL")
# Valuation intelligence fields
if 'year_manufactured' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN year_manufactured INTEGER")
if 'condition_score' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN condition_score REAL")
if 'condition_description' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN condition_description TEXT")
if 'serial_number' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN serial_number TEXT")
if 'manufacturer' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN manufacturer TEXT")
if 'damage_description' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN damage_description TEXT")
# Create bid_history table
conn.execute("""
CREATE TABLE IF NOT EXISTS bid_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
lot_id TEXT NOT NULL,
bid_amount REAL NOT NULL,
bid_time TEXT NOT NULL,
is_autobid INTEGER DEFAULT 0,
bidder_id TEXT,
bidder_number INTEGER,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
)
""")
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_bid_history_lot_time
ON bid_history(lot_id, bid_time)
""")
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_bid_history_bidder
ON bid_history(bidder_id)
""")
# Remove duplicates before creating unique index
# Keep the row with the smallest id (first occurrence) for each (lot_id, url) pair
@@ -178,8 +235,12 @@ class CacheManager:
INSERT OR REPLACE INTO lots
(lot_id, auction_id, url, title, current_bid, starting_bid, minimum_bid,
bid_count, closing_time, viewing_time, pickup_date, location, description,
category, scraped_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
category, status, brand, model, attributes_json,
first_bid_time, last_bid_time, bid_velocity, bid_increment,
year_manufactured, condition_score, condition_description,
serial_number, manufacturer, damage_description,
scraped_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
lot_data['lot_id'],
lot_data.get('auction_id', ''),
@@ -195,10 +256,49 @@ class CacheManager:
lot_data.get('location', ''),
lot_data.get('description', ''),
lot_data.get('category', ''),
lot_data.get('status', ''),
lot_data.get('brand', ''),
lot_data.get('model', ''),
lot_data.get('attributes_json', ''),
lot_data.get('first_bid_time'),
lot_data.get('last_bid_time'),
lot_data.get('bid_velocity'),
lot_data.get('bid_increment'),
lot_data.get('year_manufactured'),
lot_data.get('condition_score'),
lot_data.get('condition_description', ''),
lot_data.get('serial_number', ''),
lot_data.get('manufacturer', ''),
lot_data.get('damage_description', ''),
lot_data['scraped_at']
))
conn.commit()
def save_bid_history(self, lot_id: str, bid_records: List[Dict]):
    """
    Replace the stored bid history for a lot.

    Args:
        lot_id: Lot display ID whose existing rows are cleared first.
        bid_records: Records as produced by parse_bid_history(); each must
            contain lot_id, bid_amount, bid_time, is_autobid, bidder_id
            and bidder_number. An empty list is a no-op.
    """
    if not bid_records:
        return
    # Build all rows up front so a malformed record raises BEFORE we
    # delete the existing history for this lot.
    rows = [
        (
            record['lot_id'],
            record['bid_amount'],
            record['bid_time'],
            1 if record['is_autobid'] else 0,  # store booleans as 0/1
            record['bidder_id'],
            record['bidder_number'],
        )
        for record in bid_records
    ]
    with sqlite3.connect(self.db_path) as conn:
        # Full refresh: drop previously stored bids for this lot.
        conn.execute("DELETE FROM bid_history WHERE lot_id = ?", (lot_id,))
        conn.executemany(
            """
            INSERT INTO bid_history
            (lot_id, bid_amount, bid_time, is_autobid, bidder_id, bidder_number)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            rows,
        )
        conn.commit()
def save_images(self, lot_id: str, image_urls: List[str]):
"""Save image URLs for a lot (prevents duplicates via unique constraint)"""
with sqlite3.connect(self.db_path) as conn:

View File

@@ -7,6 +7,27 @@ from typing import Dict, Optional
GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"
AUCTION_QUERY = """
query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
auction(id: $auctionId, locale: $locale, platform: $platform) {
id
displayId
viewingDays {
startDate
endDate
city
countryCode
}
collectionDays {
startDate
endDate
city
countryCode
}
}
}
"""
LOT_BIDDING_QUERY = """
query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
@@ -44,6 +65,42 @@ query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platfo
"""
async def fetch_auction_data(auction_id: str) -> Optional[Dict]:
    """
    Fetch auction data (viewing/pickup times) from the GraphQL API.

    Args:
        auction_id: The auction UUID

    Returns:
        Dict with auction data, or None when the request fails or the
        response carries no auction object.
    """
    payload = {
        "query": AUCTION_QUERY,
        "variables": {
            "auctionId": auction_id,
            "locale": "nl",
            "platform": "TWK",
        },
    }
    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
                if response.status != 200:
                    return None
                body = await response.json()
                auction = body.get('data', {}).get('auction', {})
                return auction if auction else None
    except Exception:
        # Best effort: any network/parse failure just yields "no data".
        return None
async def fetch_lot_bidding_data(lot_display_id: str) -> Optional[Dict]:
"""
Fetch lot bidding data from GraphQL API
@@ -127,6 +184,15 @@ def format_bid_data(lot_details: Dict) -> Dict:
return ''
return ''
# Format status from minimumBidAmountMet
minimum_bid_met = lot.get('minimumBidAmountMet', '')
status_map = {
'NO_MINIMUM_BID_AMOUNT': 'Geen Minimumprijs',
'MINIMUM_BID_AMOUNT_NOT_MET': 'Minimumprijs nog niet gehaald',
'MINIMUM_BID_AMOUNT_MET': 'Minimumprijs gehaald'
}
status = status_map.get(minimum_bid_met, '')
return {
'current_bid': current_bid,
'starting_bid': starting_bid,
@@ -135,4 +201,209 @@ def format_bid_data(lot_details: Dict) -> Dict:
'closing_time': format_timestamp(end_date),
'bidding_status': lot.get('biddingStatus', ''),
'vat_percentage': lot.get('vat', 0),
'status': status,
'auction_id': lot.get('auctionId', ''),
}
def format_auction_data(auction: Dict) -> Dict:
    """
    Extract human-readable viewing/pickup times from auction data.

    Args:
        auction: Auction data from the GraphQL API, with 'viewingDays' and
            'collectionDays' lists of {startDate, endDate, city, countryCode}
            entries (dates are unix timestamps).

    Returns:
        Dict with 'viewing_time' and 'pickup_date' strings; '' when the
        data is missing or malformed.
    """
    from datetime import datetime

    def format_days(days_list):
        # Render only the FIRST day entry, e.g. "vr 05 dec 2025 van 09:00 tot 12:00".
        if not days_list or not isinstance(days_list, list):
            return ''
        first_day = days_list[0]
        start_ts = first_day.get('startDate')
        end_ts = first_day.get('endDate')
        # Coalesce None so .upper() below cannot raise outside the try.
        city = first_day.get('city') or ''
        country = (first_day.get('countryCode') or '').upper()
        if not start_ts or not end_ts:
            return ''
        try:
            # NOTE(review): timestamps are rendered in the machine's local
            # timezone (datetime.fromtimestamp) — confirm that is intended.
            start_dt = datetime.fromtimestamp(start_ts)
            end_dt = datetime.fromtimestamp(end_ts)
            days_nl = ['ma', 'di', 'wo', 'do', 'vr', 'za', 'zo']
            months_nl = ['jan', 'feb', 'mrt', 'apr', 'mei', 'jun',
                         'jul', 'aug', 'sep', 'okt', 'nov', 'dec']
            day_name = days_nl[start_dt.weekday()]
            month_name = months_nl[start_dt.month - 1]
            time_str = (
                f"{day_name} {start_dt.day:02d} {month_name} {start_dt.year} "
                f"van {start_dt.strftime('%H:%M')} tot {end_dt.strftime('%H:%M')}"
            )
            if city:
                location = f"{city}, {country}" if country else city
                return f"{time_str}\n{location}"
            return time_str
        except (TypeError, ValueError, OverflowError, OSError):
            # Malformed timestamps: degrade to "no info" rather than crash.
            return ''

    return {
        'viewing_time': format_days(auction.get('viewingDays', [])),
        'pickup_date': format_days(auction.get('collectionDays', []))
    }
def extract_attributes_from_lot_json(lot_json: Dict) -> Dict:
    """
    Extract brand, model, and the raw attribute list from a lot JSON object.

    Args:
        lot_json: The lot object from __NEXT_DATA__.

    Returns:
        Dict with 'brand', 'model', and 'attributes_json' (the attribute
        list serialized as JSON, or '' when there are no attributes).
    """
    import json  # hoisted to the top of the function instead of mid-body

    attributes = lot_json.get('attributes', [])
    if not isinstance(attributes, list):
        return {'brand': '', 'model': '', 'attributes_json': ''}

    brand = ''
    model = ''
    # Scan name/value pairs; when several attributes match, the last one wins.
    for attr in attributes:
        if not isinstance(attr, dict):
            continue
        name = str(attr.get('name') or '').lower()  # tolerate a None/non-str name
        value = attr.get('value', '')
        if name in ('brand', 'merk', 'fabrikant', 'manufacturer'):
            brand = value
        elif name in ('model', 'type'):
            model = value

    return {
        'brand': brand,
        'model': model,
        'attributes_json': json.dumps(attributes) if attributes else ''
    }
def extract_enriched_attributes(lot_json: Dict, page_data: Dict) -> Dict:
    """
    Extract enriched valuation attributes from lot data.

    Args:
        lot_json: The lot object from __NEXT_DATA__.
        page_data: Already parsed page data (title, description, brand).

    Returns:
        Dict with year_manufactured, condition_description, condition_score,
        serial_number, manufacturer and damage_description. Missing numeric
        values are None; missing strings are ''.
    """
    import re

    attributes = lot_json.get('attributes', [])
    title = page_data.get('title', '')
    description = page_data.get('description', '')

    year_manufactured = None
    condition_description = ''
    condition_score = None
    serial_number = ''
    manufacturer = ''
    damage_description = ''

    # Condition keyword -> score (0-10). Ordered most-specific-first so that
    # e.g. "als nieuw" is matched before its substring "nieuw" (the previous
    # dict iterated 'nieuw' first and wrongly scored "als nieuw" as 10.0).
    condition_scores = [
        ('als nieuw', 9.5), ('like new', 9.5),
        ('zeer goed', 8.0), ('very good', 8.0),
        ('uitstekend', 9.0), ('excellent', 9.0),
        ('nieuw', 10.0), ('new', 10.0),
        ('redelijk', 6.0), ('fair', 6.0),
        ('matig', 5.0), ('moderate', 5.0),
        ('slecht', 3.0), ('poor', 3.0),
        ('defective', 1.0), ('defect', 1.0),
        ('goed', 7.0), ('good', 7.0),
    ]

    # Pass 1: structured attributes from the lot JSON.
    for attr in attributes:
        if not isinstance(attr, dict):
            continue
        name = str(attr.get('name') or '').lower()
        value = str(attr.get('value', ''))
        if name in ('jaar', 'year', 'bouwjaar', 'productiejaar'):
            # Pull the first 4-digit run out of free-form values like "ca. 2018".
            year_match = re.search(r'\d{4}', value)
            if year_match:
                year_manufactured = int(year_match.group())
        elif name in ('conditie', 'condition', 'staat'):
            condition_description = value
            lowered = value.lower()
            for key, score in condition_scores:
                if key in lowered:
                    condition_score = score
                    break
        elif name in ('serienummer', 'serial', 'serial number', 'artikelnummer'):
            serial_number = value
        elif name in ('fabrikant', 'manufacturer', 'merk', 'brand'):
            manufacturer = value

    # Pass 2: fall back to a plausible 4-digit year (19xx/20xx) in the title.
    if not year_manufactured:
        year_match = re.search(r'\b(19|20)\d{2}\b', title)
        if year_match:
            year_manufactured = int(year_match.group())

    # Pass 3: capture the first sentence of the description mentioning damage.
    damage_keywords = ['schade', 'damage', 'beschadigd', 'damaged', 'defect', 'broken', 'kapot']
    if description:
        lowered_desc = description.lower()
        for keyword in damage_keywords:
            if keyword in lowered_desc:
                for sentence in description.split('.'):
                    if keyword in sentence.lower():
                        damage_description = sentence.strip()
                        break
                break

    # Pass 4: condition/appearance flags from __NEXT_DATA__, used only when
    # the attributes gave no condition ("NOT_CHECKED" means no information).
    if not condition_description:
        lot_condition = lot_json.get('condition', '')
        if lot_condition and lot_condition != 'NOT_CHECKED':
            condition_description = lot_condition
        lot_appearance = lot_json.get('appearance', '')
        if lot_appearance and lot_appearance != 'NOT_CHECKED':
            if condition_description:
                condition_description += f", {lot_appearance}"
            else:
                condition_description = lot_appearance

    return {
        'year_manufactured': year_manufactured,
        'condition_description': condition_description,
        'condition_score': condition_score,
        'serial_number': serial_number,
        'manufacturer': manufacturer or page_data.get('brand', ''),  # Fallback to brand
        'damage_description': damage_description
    }

View File

@@ -19,7 +19,13 @@ from config import (
)
from cache import CacheManager
from parse import DataParser
from graphql_client import fetch_lot_bidding_data, format_bid_data
from graphql_client import (
fetch_lot_bidding_data, format_bid_data,
fetch_auction_data, format_auction_data,
extract_attributes_from_lot_json,
extract_enriched_attributes
)
from bid_history_client import fetch_bid_history, parse_bid_history
class TroostwijkScraper:
"""Main scraper class for Troostwijk Auctions"""
@@ -183,6 +189,31 @@ class TroostwijkScraper:
print(f" Type: LOT")
print(f" Title: {page_data.get('title', 'N/A')[:60]}...")
# Extract ALL data from __NEXT_DATA__ lot object
import json
import re
lot_json = None
lot_uuid = None
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
if match:
try:
data = json.loads(match.group(1))
lot_json = data.get('props', {}).get('pageProps', {}).get('lot', {})
if lot_json:
# Basic attributes
attrs = extract_attributes_from_lot_json(lot_json)
page_data.update(attrs)
# Enriched attributes (year, condition, etc.)
enriched = extract_enriched_attributes(lot_json, page_data)
page_data.update(enriched)
# Get lot UUID for bid history
lot_uuid = lot_json.get('id')
except:
pass
# Fetch bidding data from GraphQL API
lot_id = page_data.get('lot_id')
print(f" Fetching bidding data from API...")
@@ -190,11 +221,39 @@ class TroostwijkScraper:
if bidding_data:
formatted_data = format_bid_data(bidding_data)
# Update page_data with real bidding info
page_data.update(formatted_data)
print(f" Bid: {page_data.get('current_bid', 'N/A')}")
print(f" Bid Count: {page_data.get('bid_count', 0)}")
print(f" Closing: {page_data.get('closing_time', 'N/A')}")
print(f" Status: {page_data.get('status', 'N/A')}")
# Extract bid increment from nextBidStepInCents
lot_details_lot = bidding_data.get('lot', {})
next_step_cents = lot_details_lot.get('nextBidStepInCents')
if next_step_cents:
page_data['bid_increment'] = next_step_cents / 100.0
# Get lot UUID if not already extracted
if not lot_uuid:
lot_uuid = lot_details_lot.get('id')
# Fetch bid history for intelligence
if lot_uuid and page_data.get('bid_count', 0) > 0:
print(f" Fetching bid history...")
bid_history = await fetch_bid_history(lot_uuid)
if bid_history:
bid_data = parse_bid_history(bid_history, lot_id)
page_data.update(bid_data)
print(f" Bid velocity: {bid_data['bid_velocity']} bids/hour")
# Save bid history to database
self.cache.save_bid_history(lot_id, bid_data['bid_records'])
# Fetch auction data for viewing/pickup times if we have auction_id
auction_id = page_data.get('auction_id')
if auction_id:
auction_data = await fetch_auction_data(auction_id)
if auction_data:
auction_times = format_auction_data(auction_data)
page_data.update(auction_times)
else:
print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")