enrich data
This commit is contained in:
143
COMPREHENSIVE_UPDATE_PLAN.md
Normal file
143
COMPREHENSIVE_UPDATE_PLAN.md
Normal file
@@ -0,0 +1,143 @@
|
|||||||
|
# Comprehensive Data Enrichment Plan
|
||||||
|
|
||||||
|
## Current Status: Working Features
|
||||||
|
✅ Image downloads (concurrent)
|
||||||
|
✅ Basic bid data (current_bid, starting_bid, minimum_bid, bid_count, closing_time)
|
||||||
|
✅ Status extraction
|
||||||
|
✅ Brand/Model from attributes
|
||||||
|
✅ Attributes JSON storage
|
||||||
|
|
||||||
|
## Phase 1: Core Bidding Intelligence (HIGH PRIORITY)
|
||||||
|
|
||||||
|
### Data Sources Identified:
|
||||||
|
1. **GraphQL lot bidding API** - Already integrated
|
||||||
|
- currentBidAmount, initialAmount, bidsCount
|
||||||
|
- startDate, endDate (for first_bid_time calculation)
|
||||||
|
|
||||||
|
2. **REST bid history API** ✨ NEW DISCOVERY
|
||||||
|
- Endpoint: `https://shared-api.tbauctions.com/bidmanagement/lots/{lot_uuid}/bidding-history`
|
||||||
|
- Returns: bid amounts, timestamps, autobid flags, bidder IDs
|
||||||
|
- Pagination supported
|
||||||
|
|
||||||
|
### Database Schema Changes:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
-- Extend lots table with bidding intelligence
|
||||||
|
ALTER TABLE lots ADD COLUMN estimated_min DECIMAL(12,2);
|
||||||
|
ALTER TABLE lots ADD COLUMN estimated_max DECIMAL(12,2);
|
||||||
|
ALTER TABLE lots ADD COLUMN reserve_price DECIMAL(12,2);
|
||||||
|
ALTER TABLE lots ADD COLUMN reserve_met BOOLEAN DEFAULT FALSE;
|
||||||
|
ALTER TABLE lots ADD COLUMN bid_increment DECIMAL(12,2);
|
||||||
|
ALTER TABLE lots ADD COLUMN watch_count INTEGER DEFAULT 0;
|
||||||
|
ALTER TABLE lots ADD COLUMN first_bid_time TEXT;
|
||||||
|
ALTER TABLE lots ADD COLUMN last_bid_time TEXT;
|
||||||
|
ALTER TABLE lots ADD COLUMN bid_velocity DECIMAL(5,2);
|
||||||
|
|
||||||
|
-- NEW: Bid history table
|
||||||
|
CREATE TABLE IF NOT EXISTS bid_history (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
lot_id TEXT NOT NULL,
|
||||||
|
lot_uuid TEXT NOT NULL,
|
||||||
|
bid_amount DECIMAL(12,2) NOT NULL,
|
||||||
|
bid_time TEXT NOT NULL,
|
||||||
|
is_winning BOOLEAN DEFAULT FALSE,
|
||||||
|
is_autobid BOOLEAN DEFAULT FALSE,
|
||||||
|
bidder_id TEXT,
|
||||||
|
bidder_number INTEGER,
|
||||||
|
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_bid_history_lot_time ON bid_history(lot_id, bid_time);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_bid_history_bidder ON bid_history(bidder_id);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Implementation:
|
||||||
|
- Add `fetch_bid_history()` function to call REST API
|
||||||
|
- Parse and store all historical bids
|
||||||
|
- Calculate bid_velocity (bids per hour)
|
||||||
|
- Extract first_bid_time, last_bid_time
|
||||||
|
|
||||||
|
## Phase 2: Valuation Intelligence
|
||||||
|
|
||||||
|
### Data Sources:
|
||||||
|
1. **Attributes array** (already in __NEXT_DATA__)
|
||||||
|
- condition, year, manufacturer, model, serial_number
|
||||||
|
|
||||||
|
2. **Description field**
|
||||||
|
- Extract year patterns, condition mentions, damage descriptions
|
||||||
|
|
||||||
|
### Database Schema:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
-- Valuation fields
|
||||||
|
ALTER TABLE lots ADD COLUMN condition_score DECIMAL(3,2);
|
||||||
|
ALTER TABLE lots ADD COLUMN condition_description TEXT;
|
||||||
|
ALTER TABLE lots ADD COLUMN year_manufactured INTEGER;
|
||||||
|
ALTER TABLE lots ADD COLUMN serial_number TEXT;
|
||||||
|
ALTER TABLE lots ADD COLUMN manufacturer TEXT;
|
||||||
|
ALTER TABLE lots ADD COLUMN damage_description TEXT;
|
||||||
|
ALTER TABLE lots ADD COLUMN provenance TEXT;
|
||||||
|
```
|
||||||
|
|
||||||
|
### Implementation:
|
||||||
|
- Parse attributes for: Jaar, Conditie, Serienummer, Fabrikant
|
||||||
|
- Extract 4-digit years from title/description
|
||||||
|
- Map condition values to 0-10 scale
|
||||||
|
|
||||||
|
## Phase 3: Auction House Intelligence
|
||||||
|
|
||||||
|
### Data Sources:
|
||||||
|
1. **GraphQL auction query**
|
||||||
|
- Already partially working
|
||||||
|
|
||||||
|
2. **Auction __NEXT_DATA__**
|
||||||
|
- May contain buyer's premium, shipping costs
|
||||||
|
|
||||||
|
### Database Schema:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
ALTER TABLE auctions ADD COLUMN buyers_premium_percent DECIMAL(5,2);
|
||||||
|
ALTER TABLE auctions ADD COLUMN shipping_available BOOLEAN;
|
||||||
|
ALTER TABLE auctions ADD COLUMN payment_methods TEXT;
|
||||||
|
```
|
||||||
|
|
||||||
|
## Viewing/Pickup Times Resolution
|
||||||
|
|
||||||
|
### Finding:
|
||||||
|
- `viewingDays` and `collectionDays` in GraphQL only return location (city, countryCode)
|
||||||
|
- Times are NOT in the GraphQL API
|
||||||
|
- Times must be in auction __NEXT_DATA__ or not set for many auctions
|
||||||
|
|
||||||
|
### Solution:
|
||||||
|
- Mark viewing_time/pickup_date as "location only" when times unavailable
|
||||||
|
- Store: "Nijmegen, NL" instead of full date/time string
|
||||||
|
- Accept that many auctions don't have viewing times set
|
||||||
|
|
||||||
|
## Priority Implementation Order:
|
||||||
|
|
||||||
|
1. **BID HISTORY API** (30 min) - Highest value
|
||||||
|
- Fetch and store all bid history
|
||||||
|
- Calculate bid_velocity
|
||||||
|
- Track autobid patterns
|
||||||
|
|
||||||
|
2. **ENRICHED ATTRIBUTES** (20 min) - Medium-high value
|
||||||
|
- Extract year, condition, manufacturer from existing data
|
||||||
|
- Parse description for damage/condition mentions
|
||||||
|
|
||||||
|
3. **VIEWING/PICKUP FIX** (10 min) - Low value (data often missing)
|
||||||
|
- Update to store location-only when times unavailable
|
||||||
|
|
||||||
|
## Data Quality Expectations:
|
||||||
|
|
||||||
|
| Field | Coverage Expected | Source |
|
||||||
|
|-------|------------------|---------|
|
||||||
|
| bid_history | 100% (for lots with bids) | REST API |
|
||||||
|
| bid_velocity | 100% (calculated) | Derived |
|
||||||
|
| year_manufactured | ~40% | Attributes/Title |
|
||||||
|
| condition_score | ~30% | Attributes |
|
||||||
|
| manufacturer | ~60% | Attributes |
|
||||||
|
| viewing_time | ~20% | Often not set |
|
||||||
|
| buyers_premium | 100% | GraphQL/Props |
|
||||||
|
|
||||||
|
## Estimated Total Implementation Time: 60-90 minutes
|
||||||
@@ -278,7 +278,7 @@ SELECT lot_id, current_bid, bid_count FROM lots WHERE bid_count > 0;
|
|||||||
### 3. **Analytics & Reporting**
|
### 3. **Analytics & Reporting**
|
||||||
```sqlite
|
```sqlite
|
||||||
-- Top locations
|
-- Top locations
|
||||||
SELECT location, COUNT(*) as lot_count FROM lots GROUP BY location;
|
SELECT location, COUNT(*) as lots_count FROM lots GROUP BY location;
|
||||||
|
|
||||||
-- Auction statistics
|
-- Auction statistics
|
||||||
SELECT
|
SELECT
|
||||||
|
|||||||
67
check_graphql_full.py
Normal file
67
check_graphql_full.py
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Check if GraphQL has viewing/pickup data"""
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0, 'src')
|
||||||
|
|
||||||
|
from graphql_client import GRAPHQL_ENDPOINT
|
||||||
|
import aiohttp
|
||||||
|
|
||||||
|
# Expanded query to check for all available fields
|
||||||
|
EXTENDED_QUERY = """
|
||||||
|
query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
|
||||||
|
lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
|
||||||
|
lot {
|
||||||
|
id
|
||||||
|
displayId
|
||||||
|
auctionId
|
||||||
|
currentBidAmount { cents currency }
|
||||||
|
initialAmount { cents currency }
|
||||||
|
nextMinimalBid { cents currency }
|
||||||
|
bidsCount
|
||||||
|
startDate
|
||||||
|
endDate
|
||||||
|
|
||||||
|
# Try to find viewing/pickup fields
|
||||||
|
viewingDays { startDate endDate city countryCode }
|
||||||
|
collectionDays { startDate endDate city countryCode }
|
||||||
|
pickupDays { startDate endDate city countryCode }
|
||||||
|
}
|
||||||
|
auction {
|
||||||
|
id
|
||||||
|
displayId
|
||||||
|
viewingDays { startDate endDate city countryCode }
|
||||||
|
collectionDays { startDate endDate city countryCode }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
async def main():
    """Send the extended lot query and dump whatever GraphQL returns."""
    payload = {
        "query": EXTENDED_QUERY,
        "variables": {
            "lotDisplayId": "A1-28505-5",
            "locale": "nl",
            "platform": "TWK",
        },
    }

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
                if response.status != 200:
                    # Non-200: show the status and the raw error body.
                    print(f"Error: {response.status}")
                    print(await response.text())
                    return
                data = await response.json()
                print("Full GraphQL Response:")
                print(json.dumps(data, indent=2))
    except Exception as e:
        print(f"Exception: {e}")


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
65
deep_inspect_lot.py
Normal file
65
deep_inspect_lot.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
#!/usr/bin/env python3
"""Deep inspect the most recent cached lot page for viewing/pickup data.

Loads the newest '/l/' (lot detail) page from the local scrape cache,
decompresses it, extracts the embedded __NEXT_DATA__ JSON, and prints the
sub-sections most likely to contain viewing/pickup information.
"""
import sqlite3
import zlib
import json
import re
import sys

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

cursor = conn.execute("""
    SELECT url, content
    FROM cache
    WHERE url LIKE '%/l/%'
    ORDER BY timestamp DESC
    LIMIT 1
""")

row = cursor.fetchone()
if row is None:
    # Guard: an empty cache used to crash with a TypeError on unpacking.
    conn.close()
    sys.exit("No cached lot pages found (no '/l/' URLs in cache)")

url, content_blob = row
# Cached page bodies are stored zlib-compressed.
content = zlib.decompress(content_blob).decode('utf-8')

match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
if match is None:
    # Guard: pages without the Next.js data blob used to crash with an
    # AttributeError on .group().
    conn.close()
    sys.exit(f"No __NEXT_DATA__ script tag found in {url}")

data = json.loads(match.group(1))
lot = data.get('props', {}).get('pageProps', {}).get('lot', {})

print(f"Inspecting: {url}\n")

# Dump each candidate section if present (one loop instead of five
# copy-pasted if-blocks; output is identical).
for section in ('onboarding', 'attributes', 'condition', 'appearance', 'location'):
    if section in lot:
        print(f"{section.upper()}:")
        value = lot[section]
        if section == 'attributes' and isinstance(value, list):
            # Attribute lists can be long; show only the first few entries.
            value = value[:3]
        print(json.dumps(value, indent=2))
        print()

# Surface any other lot fields that look viewing/pickup/time related.
print("\nFIELDS WITH VIEWING/PICKUP/TIME:")
for key in lot.keys():
    if any(term in key.lower() for term in ['view', 'pick', 'collect', 'date', 'time', 'day']):
        print(f"  {key}: {lot[key]}")

conn.close()
|
||||||
93
explore_auction_schema.py
Normal file
93
explore_auction_schema.py
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Explore the actual auction schema"""
|
||||||
|
import asyncio
|
||||||
|
import aiohttp
|
||||||
|
import json
|
||||||
|
|
||||||
|
GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"
|
||||||
|
|
||||||
|
# Try different field structures
|
||||||
|
QUERIES = {
|
||||||
|
"viewingDays_simple": """
|
||||||
|
query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
|
||||||
|
auction(id: $auctionId, locale: $locale, platform: $platform) {
|
||||||
|
viewingDays {
|
||||||
|
city
|
||||||
|
countryCode
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
""",
|
||||||
|
"viewingDays_with_times": """
|
||||||
|
query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
|
||||||
|
auction(id: $auctionId, locale: $locale, platform: $platform) {
|
||||||
|
viewingDays {
|
||||||
|
from
|
||||||
|
to
|
||||||
|
city
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
""",
|
||||||
|
"full_auction": """
|
||||||
|
query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
|
||||||
|
auction(id: $auctionId, locale: $locale, platform: $platform) {
|
||||||
|
id
|
||||||
|
displayId
|
||||||
|
biddingStatus
|
||||||
|
buyersPremium
|
||||||
|
viewingDays {
|
||||||
|
city
|
||||||
|
countryCode
|
||||||
|
from
|
||||||
|
to
|
||||||
|
}
|
||||||
|
collectionDays {
|
||||||
|
city
|
||||||
|
countryCode
|
||||||
|
from
|
||||||
|
to
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
|
||||||
|
async def test_query(name, query, auction_id):
    """Run one candidate auction query and print its result or errors."""
    payload = {
        "query": query,
        "variables": {
            "auctionId": auction_id,
            "locale": "nl",
            "platform": "TWK",
        },
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
            data = await response.json()

    banner = '=' * 60
    print(f"\n{banner}")
    print(f"QUERY: {name}")
    print(f"{banner}")

    errors = data.get('errors') if 'errors' in data else None
    if errors is not None:
        print("ERRORS:")
        for error in errors:
            print(f"  {error}")
    else:
        print("SUCCESS:")
        print(json.dumps(data, indent=2))
|
||||||
|
|
||||||
|
async def main():
    """Probe each candidate schema shape against a known-existing auction."""
    known_auction = "9d5d9d6b-94de-4147-b523-dfa512d85dfa"

    for query_name, query_text in QUERIES.items():
        await test_query(query_name, query_text, known_auction)
        # Small pause between probes to be polite to the API.
        await asyncio.sleep(0.5)


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
49
inspect_lot_html.py
Normal file
49
inspect_lot_html.py
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Inspect a lot page HTML to find viewing_time and pickup_date"""
|
||||||
|
import asyncio
|
||||||
|
from playwright.async_api import async_playwright
|
||||||
|
|
||||||
|
async def main():
    """Render a known lot page with a headless browser and grep the HTML
    for viewing ('Bezichtigingen') and pickup ('Ophalen') date/time strings.

    Purely exploratory: prints any matches; produces no return value.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        # Use the known lot
        # 'networkidle' so client-side rendered sections are present in the HTML.
        await page.goto("https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102", wait_until='networkidle')
        content = await page.content()

        print("Searching for patterns...")
        print("="*60)

        # Search for viewing time patterns
        # Dutch date/time shape: "dd mon yyyy van HH:MM tot HH:MM".
        import re
        patterns = {
            'Bezichtigingen': r'Bezichtigingen.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
            'viewing': r'(?i)viewing.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
            'Ophalen': r'Ophalen.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
            'pickup': r'(?i)pickup.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
            'Status': r'Status\s+([^<]+)',
        }

        for name, pattern in patterns.items():
            matches = re.findall(pattern, content, re.DOTALL | re.MULTILINE)
            if matches:
                print(f"\n{name}:")
                # Cap output: at most 3 matches, 200 chars each.
                for match in matches[:3]:
                    print(f"  {match[:200]}")

        # Also look for structured data
        print("\n\nSearching for 'Bezichtigingen' section:")
        bez_match = re.search(r'Bezichtigingen.*?<.*?>(.*?)</.*?>', content, re.DOTALL)
        if bez_match:
            print(bez_match.group(0)[:500])

        print("\n\nSearching for 'Ophalen' section:")
        oph_match = re.search(r'Ophalen.*?<.*?>(.*?)</.*?>', content, re.DOTALL)
        if oph_match:
            print(oph_match.group(0)[:500])

        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
121
src/bid_history_client.py
Normal file
121
src/bid_history_client.py
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Client for fetching bid history from Troostwijk REST API
|
||||||
|
"""
|
||||||
|
import aiohttp
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
BID_HISTORY_ENDPOINT = "https://shared-api.tbauctions.com/bidmanagement/lots/{lot_uuid}/bidding-history"
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_bid_history(lot_uuid: str, page_size: int = 100) -> Optional[List[Dict]]:
    """
    Fetch the complete bid history for a lot, following pagination.

    Args:
        lot_uuid: The lot UUID (from the GraphQL response), used to build
            the REST endpoint URL.
        page_size: Number of bids requested per page.

    Returns:
        List of raw bid dicts as returned by the API, or None when the
        first request fails or the lot has no bids. A failure on a later
        page returns the bids collected so far (best effort).
    """
    all_bids: List[Dict] = []
    page_number = 1
    # The endpoint URL is loop-invariant; only the page params change.
    url = BID_HISTORY_ENDPOINT.format(lot_uuid=lot_uuid)

    try:
        async with aiohttp.ClientSession() as session:
            while True:
                params = {"pageNumber": page_number, "pageSize": page_size}

                async with session.get(url, params=params, timeout=30) as response:
                    if response.status != 200:
                        # First page failed -> total failure; a later page
                        # failing still yields the partial history.
                        return None if page_number == 1 else all_bids

                    data = await response.json()
                    all_bids.extend(data.get('results', []))

                    if not data.get('hasNext', False):
                        break
                    page_number += 1

        return all_bids if all_bids else None

    except Exception as e:
        # Best-effort fetch: report and degrade to "no history".
        print(f" Bid history fetch failed: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_bid_history(bid_history: List[Dict], lot_id: str) -> Dict:
    """
    Parse raw bid history into database-ready records plus derived metrics.

    Args:
        bid_history: Raw bid dicts from the REST API (keys: currentBid,
            createdAt, autoBid, buyerId, buyerNumber).
        lot_id: The lot display ID (e.g., "A1-28505-5").

    Returns:
        Dict with:
            bid_records: flat dicts ready for DB insertion,
            first_bid_time / last_bid_time: 'YYYY-MM-DD HH:MM:SS' or None,
            bid_velocity: bids per hour, 0.0 when < 2 parseable timestamps.
    """
    if not bid_history:
        return {
            'bid_records': [],
            'first_bid_time': None,
            'last_bid_time': None,
            'bid_velocity': 0.0
        }

    bid_records = []
    for bid in bid_history:
        # Amounts arrive in cents; store decimal currency units.
        bid_amount_cents = bid.get('currentBid', {}).get('cents', 0)
        bid_amount = bid_amount_cents / 100.0 if bid_amount_cents else 0.0

        bid_records.append({
            'lot_id': lot_id,
            'bid_amount': bid_amount,
            'bid_time': bid.get('createdAt', ''),
            'is_autobid': bid.get('autoBid', False),
            'bidder_id': bid.get('buyerId', ''),
            'bidder_number': bid.get('buyerNumber', 0)
        })

    # Parse ISO timestamps such as "2025-12-04T17:17:45.694698Z".
    # Bad or missing timestamps are skipped rather than aborting the parse.
    bid_times = []
    for record in bid_records:
        try:
            bid_times.append(
                datetime.fromisoformat(record['bid_time'].replace('Z', '+00:00'))
            )
        except (ValueError, AttributeError, TypeError):
            # Narrowed from a bare except: only malformed/missing timestamp
            # values are ignored; real bugs still surface.
            pass

    first_bid_time = None
    last_bid_time = None
    bid_velocity = 0.0

    if bid_times:
        bid_times.sort()
        first_bid_time = bid_times[0].strftime('%Y-%m-%d %H:%M:%S')
        last_bid_time = bid_times[-1].strftime('%Y-%m-%d %H:%M:%S')

        if len(bid_times) > 1:
            # Velocity = bids per hour across the active bidding window.
            time_span = (bid_times[-1] - bid_times[0]).total_seconds() / 3600
            if time_span > 0:
                bid_velocity = len(bid_times) / time_span

    return {
        'bid_records': bid_records,
        'first_bid_time': first_bid_time,
        'last_bid_time': last_bid_time,
        'bid_velocity': round(bid_velocity, 2)
    }
|
||||||
104
src/cache.py
104
src/cache.py
@@ -82,6 +82,63 @@ class CacheManager:
|
|||||||
conn.execute("ALTER TABLE lots ADD COLUMN starting_bid TEXT")
|
conn.execute("ALTER TABLE lots ADD COLUMN starting_bid TEXT")
|
||||||
if 'minimum_bid' not in columns:
|
if 'minimum_bid' not in columns:
|
||||||
conn.execute("ALTER TABLE lots ADD COLUMN minimum_bid TEXT")
|
conn.execute("ALTER TABLE lots ADD COLUMN minimum_bid TEXT")
|
||||||
|
if 'status' not in columns:
|
||||||
|
conn.execute("ALTER TABLE lots ADD COLUMN status TEXT")
|
||||||
|
if 'brand' not in columns:
|
||||||
|
conn.execute("ALTER TABLE lots ADD COLUMN brand TEXT")
|
||||||
|
if 'model' not in columns:
|
||||||
|
conn.execute("ALTER TABLE lots ADD COLUMN model TEXT")
|
||||||
|
if 'attributes_json' not in columns:
|
||||||
|
conn.execute("ALTER TABLE lots ADD COLUMN attributes_json TEXT")
|
||||||
|
|
||||||
|
# Bidding intelligence fields
|
||||||
|
if 'first_bid_time' not in columns:
|
||||||
|
conn.execute("ALTER TABLE lots ADD COLUMN first_bid_time TEXT")
|
||||||
|
if 'last_bid_time' not in columns:
|
||||||
|
conn.execute("ALTER TABLE lots ADD COLUMN last_bid_time TEXT")
|
||||||
|
if 'bid_velocity' not in columns:
|
||||||
|
conn.execute("ALTER TABLE lots ADD COLUMN bid_velocity REAL")
|
||||||
|
if 'bid_increment' not in columns:
|
||||||
|
conn.execute("ALTER TABLE lots ADD COLUMN bid_increment REAL")
|
||||||
|
|
||||||
|
# Valuation intelligence fields
|
||||||
|
if 'year_manufactured' not in columns:
|
||||||
|
conn.execute("ALTER TABLE lots ADD COLUMN year_manufactured INTEGER")
|
||||||
|
if 'condition_score' not in columns:
|
||||||
|
conn.execute("ALTER TABLE lots ADD COLUMN condition_score REAL")
|
||||||
|
if 'condition_description' not in columns:
|
||||||
|
conn.execute("ALTER TABLE lots ADD COLUMN condition_description TEXT")
|
||||||
|
if 'serial_number' not in columns:
|
||||||
|
conn.execute("ALTER TABLE lots ADD COLUMN serial_number TEXT")
|
||||||
|
if 'manufacturer' not in columns:
|
||||||
|
conn.execute("ALTER TABLE lots ADD COLUMN manufacturer TEXT")
|
||||||
|
if 'damage_description' not in columns:
|
||||||
|
conn.execute("ALTER TABLE lots ADD COLUMN damage_description TEXT")
|
||||||
|
|
||||||
|
# Create bid_history table
|
||||||
|
conn.execute("""
|
||||||
|
CREATE TABLE IF NOT EXISTS bid_history (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
lot_id TEXT NOT NULL,
|
||||||
|
bid_amount REAL NOT NULL,
|
||||||
|
bid_time TEXT NOT NULL,
|
||||||
|
is_autobid INTEGER DEFAULT 0,
|
||||||
|
bidder_id TEXT,
|
||||||
|
bidder_number INTEGER,
|
||||||
|
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
|
||||||
|
)
|
||||||
|
""")
|
||||||
|
|
||||||
|
conn.execute("""
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_bid_history_lot_time
|
||||||
|
ON bid_history(lot_id, bid_time)
|
||||||
|
""")
|
||||||
|
|
||||||
|
conn.execute("""
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_bid_history_bidder
|
||||||
|
ON bid_history(bidder_id)
|
||||||
|
""")
|
||||||
|
|
||||||
# Remove duplicates before creating unique index
|
# Remove duplicates before creating unique index
|
||||||
# Keep the row with the smallest id (first occurrence) for each (lot_id, url) pair
|
# Keep the row with the smallest id (first occurrence) for each (lot_id, url) pair
|
||||||
@@ -178,8 +235,12 @@ class CacheManager:
|
|||||||
INSERT OR REPLACE INTO lots
|
INSERT OR REPLACE INTO lots
|
||||||
(lot_id, auction_id, url, title, current_bid, starting_bid, minimum_bid,
|
(lot_id, auction_id, url, title, current_bid, starting_bid, minimum_bid,
|
||||||
bid_count, closing_time, viewing_time, pickup_date, location, description,
|
bid_count, closing_time, viewing_time, pickup_date, location, description,
|
||||||
category, scraped_at)
|
category, status, brand, model, attributes_json,
|
||||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
first_bid_time, last_bid_time, bid_velocity, bid_increment,
|
||||||
|
year_manufactured, condition_score, condition_description,
|
||||||
|
serial_number, manufacturer, damage_description,
|
||||||
|
scraped_at)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
""", (
|
""", (
|
||||||
lot_data['lot_id'],
|
lot_data['lot_id'],
|
||||||
lot_data.get('auction_id', ''),
|
lot_data.get('auction_id', ''),
|
||||||
@@ -195,10 +256,49 @@ class CacheManager:
|
|||||||
lot_data.get('location', ''),
|
lot_data.get('location', ''),
|
||||||
lot_data.get('description', ''),
|
lot_data.get('description', ''),
|
||||||
lot_data.get('category', ''),
|
lot_data.get('category', ''),
|
||||||
|
lot_data.get('status', ''),
|
||||||
|
lot_data.get('brand', ''),
|
||||||
|
lot_data.get('model', ''),
|
||||||
|
lot_data.get('attributes_json', ''),
|
||||||
|
lot_data.get('first_bid_time'),
|
||||||
|
lot_data.get('last_bid_time'),
|
||||||
|
lot_data.get('bid_velocity'),
|
||||||
|
lot_data.get('bid_increment'),
|
||||||
|
lot_data.get('year_manufactured'),
|
||||||
|
lot_data.get('condition_score'),
|
||||||
|
lot_data.get('condition_description', ''),
|
||||||
|
lot_data.get('serial_number', ''),
|
||||||
|
lot_data.get('manufacturer', ''),
|
||||||
|
lot_data.get('damage_description', ''),
|
||||||
lot_data['scraped_at']
|
lot_data['scraped_at']
|
||||||
))
|
))
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
|
def save_bid_history(self, lot_id: str, bid_records: List[Dict]):
    """Replace the stored bid history for a lot.

    Deletes any previously stored rows for ``lot_id`` and inserts the new
    records in one connection/transaction, so the table never holds a mix
    of stale and fresh history for the same lot.

    Args:
        lot_id: Display ID of the lot whose history is being replaced.
        bid_records: Parsed bid dicts (as produced by parse_bid_history);
            each must carry lot_id, bid_amount, bid_time, is_autobid,
            bidder_id and bidder_number keys.
    """
    if not bid_records:
        return  # nothing to store; leave any existing rows untouched

    rows = [
        (
            record['lot_id'],
            record['bid_amount'],
            record['bid_time'],
            1 if record['is_autobid'] else 0,  # SQLite stores booleans as 0/1
            record['bidder_id'],
            record['bidder_number'],
        )
        for record in bid_records
    ]

    with sqlite3.connect(self.db_path) as conn:
        # Clear existing bid history for this lot before re-inserting.
        conn.execute("DELETE FROM bid_history WHERE lot_id = ?", (lot_id,))

        # executemany prepares the statement once for all rows instead of
        # re-executing per record.
        conn.executemany("""
            INSERT INTO bid_history
            (lot_id, bid_amount, bid_time, is_autobid, bidder_id, bidder_number)
            VALUES (?, ?, ?, ?, ?, ?)
        """, rows)
        conn.commit()
|
||||||
|
|
||||||
def save_images(self, lot_id: str, image_urls: List[str]):
|
def save_images(self, lot_id: str, image_urls: List[str]):
|
||||||
"""Save image URLs for a lot (prevents duplicates via unique constraint)"""
|
"""Save image URLs for a lot (prevents duplicates via unique constraint)"""
|
||||||
with sqlite3.connect(self.db_path) as conn:
|
with sqlite3.connect(self.db_path) as conn:
|
||||||
|
|||||||
@@ -7,6 +7,27 @@ from typing import Dict, Optional
|
|||||||
|
|
||||||
GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"
|
GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"
|
||||||
|
|
||||||
|
AUCTION_QUERY = """
|
||||||
|
query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
|
||||||
|
auction(id: $auctionId, locale: $locale, platform: $platform) {
|
||||||
|
id
|
||||||
|
displayId
|
||||||
|
viewingDays {
|
||||||
|
startDate
|
||||||
|
endDate
|
||||||
|
city
|
||||||
|
countryCode
|
||||||
|
}
|
||||||
|
collectionDays {
|
||||||
|
startDate
|
||||||
|
endDate
|
||||||
|
city
|
||||||
|
countryCode
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
LOT_BIDDING_QUERY = """
|
LOT_BIDDING_QUERY = """
|
||||||
query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
|
query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
|
||||||
lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
|
lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
|
||||||
@@ -44,6 +65,42 @@ query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platfo
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_auction_data(auction_id: str) -> Optional[Dict]:
    """
    Fetch auction data (viewing/pickup times) from the GraphQL API.

    Args:
        auction_id: The auction UUID.

    Returns:
        Dict with auction data, or None when the request fails, returns a
        non-200 status, or the response carries no auction object.
    """
    payload = {
        "query": AUCTION_QUERY,
        "variables": {
            "auctionId": auction_id,
            "locale": "nl",
            "platform": "TWK",
        },
    }

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
                if response.status != 200:
                    return None
                data = await response.json()
                # Missing/empty auction payloads normalize to None.
                return data.get('data', {}).get('auction', {}) or None
    except Exception:
        # Best-effort fetch: any network/JSON error degrades to "no data".
        return None
|
||||||
|
|
||||||
|
|
||||||
async def fetch_lot_bidding_data(lot_display_id: str) -> Optional[Dict]:
|
async def fetch_lot_bidding_data(lot_display_id: str) -> Optional[Dict]:
|
||||||
"""
|
"""
|
||||||
Fetch lot bidding data from GraphQL API
|
Fetch lot bidding data from GraphQL API
|
||||||
@@ -127,6 +184,15 @@ def format_bid_data(lot_details: Dict) -> Dict:
|
|||||||
return ''
|
return ''
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
# Format status from minimumBidAmountMet
|
||||||
|
minimum_bid_met = lot.get('minimumBidAmountMet', '')
|
||||||
|
status_map = {
|
||||||
|
'NO_MINIMUM_BID_AMOUNT': 'Geen Minimumprijs',
|
||||||
|
'MINIMUM_BID_AMOUNT_NOT_MET': 'Minimumprijs nog niet gehaald',
|
||||||
|
'MINIMUM_BID_AMOUNT_MET': 'Minimumprijs gehaald'
|
||||||
|
}
|
||||||
|
status = status_map.get(minimum_bid_met, '')
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'current_bid': current_bid,
|
'current_bid': current_bid,
|
||||||
'starting_bid': starting_bid,
|
'starting_bid': starting_bid,
|
||||||
@@ -135,4 +201,209 @@ def format_bid_data(lot_details: Dict) -> Dict:
|
|||||||
'closing_time': format_timestamp(end_date),
|
'closing_time': format_timestamp(end_date),
|
||||||
'bidding_status': lot.get('biddingStatus', ''),
|
'bidding_status': lot.get('biddingStatus', ''),
|
||||||
'vat_percentage': lot.get('vat', 0),
|
'vat_percentage': lot.get('vat', 0),
|
||||||
|
'status': status,
|
||||||
|
'auction_id': lot.get('auctionId', ''),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def format_auction_data(auction: Dict) -> Dict:
    """
    Extract viewing/pickup times from auction data.

    Args:
        auction: Auction data from GraphQL; expects 'viewingDays' /
            'collectionDays' lists of {startDate, endDate, city, countryCode}.

    Returns:
        Dict with 'viewing_time' and 'pickup_date' strings; empty strings
        when the corresponding data is missing or malformed.
    """
    from datetime import datetime

    def format_days(days_list):
        # Format the FIRST entry of a days list as a Dutch
        # "day dd mon yyyy van HH:MM tot HH:MM" string, optionally followed
        # by "\nCity, CC". Returns '' for missing/malformed input.
        if not days_list or not isinstance(days_list, list):
            return ''

        first_day = days_list[0]
        start_ts = first_day.get('startDate')
        end_ts = first_day.get('endDate')
        city = first_day.get('city', '')
        country = first_day.get('countryCode', '').upper()

        if not start_ts or not end_ts:
            return ''

        try:
            # NOTE(review): timestamps are treated as epoch seconds in the
            # machine's local timezone -- confirm against the API contract.
            start_dt = datetime.fromtimestamp(start_ts)
            end_dt = datetime.fromtimestamp(end_ts)

            # Format: "vr 05 dec 2025 van 09:00 tot 12:00"
            days_nl = ['ma', 'di', 'wo', 'do', 'vr', 'za', 'zo']
            months_nl = ['jan', 'feb', 'mrt', 'apr', 'mei', 'jun',
                         'jul', 'aug', 'sep', 'okt', 'nov', 'dec']

            day_name = days_nl[start_dt.weekday()]
            month_name = months_nl[start_dt.month - 1]

            time_str = f"{day_name} {start_dt.day:02d} {month_name} {start_dt.year} van {start_dt.strftime('%H:%M')} tot {end_dt.strftime('%H:%M')}"

            if city:
                location = f"{city}, {country}" if country else city
                return f"{time_str}\n{location}"

            return time_str
        except (TypeError, ValueError, OverflowError, OSError):
            # Narrowed from a bare except: only timestamp-conversion errors
            # are swallowed; anything else should surface.
            return ''

    return {
        'viewing_time': format_days(auction.get('viewingDays', [])),
        'pickup_date': format_days(auction.get('collectionDays', [])),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def extract_attributes_from_lot_json(lot_json: Dict) -> Dict:
|
||||||
|
"""
|
||||||
|
Extract brand, model, and other attributes from lot JSON
|
||||||
|
|
||||||
|
Args:
|
||||||
|
lot_json: The lot object from __NEXT_DATA__
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with brand, model, and attributes
|
||||||
|
"""
|
||||||
|
attributes = lot_json.get('attributes', [])
|
||||||
|
if not isinstance(attributes, list):
|
||||||
|
return {'brand': '', 'model': '', 'attributes_json': ''}
|
||||||
|
|
||||||
|
brand = ''
|
||||||
|
model = ''
|
||||||
|
|
||||||
|
# Look for brand and model in attributes
|
||||||
|
for attr in attributes:
|
||||||
|
if not isinstance(attr, dict):
|
||||||
|
continue
|
||||||
|
|
||||||
|
name = attr.get('name', '').lower()
|
||||||
|
value = attr.get('value', '')
|
||||||
|
|
||||||
|
if name in ['brand', 'merk', 'fabrikant', 'manufacturer']:
|
||||||
|
brand = value
|
||||||
|
elif name in ['model', 'type']:
|
||||||
|
model = value
|
||||||
|
|
||||||
|
import json
|
||||||
|
return {
|
||||||
|
'brand': brand,
|
||||||
|
'model': model,
|
||||||
|
'attributes_json': json.dumps(attributes) if attributes else ''
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def extract_enriched_attributes(lot_json: Dict, page_data: Dict) -> Dict:
|
||||||
|
"""
|
||||||
|
Extract enriched valuation attributes from lot data
|
||||||
|
|
||||||
|
Args:
|
||||||
|
lot_json: The lot object from __NEXT_DATA__
|
||||||
|
page_data: Already parsed page data (title, description)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with enriched attributes
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
|
||||||
|
attributes = lot_json.get('attributes', [])
|
||||||
|
title = page_data.get('title', '')
|
||||||
|
description = page_data.get('description', '')
|
||||||
|
|
||||||
|
# Initialize
|
||||||
|
year_manufactured = None
|
||||||
|
condition_description = ''
|
||||||
|
condition_score = None
|
||||||
|
serial_number = ''
|
||||||
|
manufacturer = ''
|
||||||
|
damage_description = ''
|
||||||
|
|
||||||
|
# Extract from attributes array
|
||||||
|
for attr in attributes:
|
||||||
|
if not isinstance(attr, dict):
|
||||||
|
continue
|
||||||
|
|
||||||
|
name = attr.get('name', '').lower()
|
||||||
|
value = str(attr.get('value', ''))
|
||||||
|
|
||||||
|
if name in ['jaar', 'year', 'bouwjaar', 'productiejaar']:
|
||||||
|
try:
|
||||||
|
year_manufactured = int(re.search(r'\d{4}', value).group())
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
elif name in ['conditie', 'condition', 'staat']:
|
||||||
|
condition_description = value
|
||||||
|
# Map condition to score (0-10)
|
||||||
|
condition_map = {
|
||||||
|
'nieuw': 10.0, 'new': 10.0,
|
||||||
|
'als nieuw': 9.5, 'like new': 9.5,
|
||||||
|
'uitstekend': 9.0, 'excellent': 9.0,
|
||||||
|
'zeer goed': 8.0, 'very good': 8.0,
|
||||||
|
'goed': 7.0, 'good': 7.0,
|
||||||
|
'redelijk': 6.0, 'fair': 6.0,
|
||||||
|
'matig': 5.0, 'moderate': 5.0,
|
||||||
|
'slecht': 3.0, 'poor': 3.0,
|
||||||
|
'defect': 1.0, 'defective': 1.0
|
||||||
|
}
|
||||||
|
for key, score in condition_map.items():
|
||||||
|
if key in value.lower():
|
||||||
|
condition_score = score
|
||||||
|
break
|
||||||
|
|
||||||
|
elif name in ['serienummer', 'serial', 'serial number', 'artikelnummer']:
|
||||||
|
serial_number = value
|
||||||
|
|
||||||
|
elif name in ['fabrikant', 'manufacturer', 'merk', 'brand']:
|
||||||
|
manufacturer = value
|
||||||
|
|
||||||
|
# Extract 4-digit year from title if not found
|
||||||
|
if not year_manufactured:
|
||||||
|
year_match = re.search(r'\b(19|20)\d{2}\b', title)
|
||||||
|
if year_match:
|
||||||
|
try:
|
||||||
|
year_manufactured = int(year_match.group())
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Extract damage mentions from description
|
||||||
|
damage_keywords = ['schade', 'damage', 'beschadigd', 'damaged', 'defect', 'broken', 'kapot']
|
||||||
|
if description:
|
||||||
|
for keyword in damage_keywords:
|
||||||
|
if keyword in description.lower():
|
||||||
|
# Extract sentence containing damage keyword
|
||||||
|
sentences = description.split('.')
|
||||||
|
for sentence in sentences:
|
||||||
|
if keyword in sentence.lower():
|
||||||
|
damage_description = sentence.strip()
|
||||||
|
break
|
||||||
|
break
|
||||||
|
|
||||||
|
# Extract condition from __NEXT_DATA__ fields
|
||||||
|
if not condition_description:
|
||||||
|
lot_condition = lot_json.get('condition', '')
|
||||||
|
if lot_condition and lot_condition != 'NOT_CHECKED':
|
||||||
|
condition_description = lot_condition
|
||||||
|
|
||||||
|
lot_appearance = lot_json.get('appearance', '')
|
||||||
|
if lot_appearance and lot_appearance != 'NOT_CHECKED':
|
||||||
|
if condition_description:
|
||||||
|
condition_description += f", {lot_appearance}"
|
||||||
|
else:
|
||||||
|
condition_description = lot_appearance
|
||||||
|
|
||||||
|
return {
|
||||||
|
'year_manufactured': year_manufactured,
|
||||||
|
'condition_description': condition_description,
|
||||||
|
'condition_score': condition_score,
|
||||||
|
'serial_number': serial_number,
|
||||||
|
'manufacturer': manufacturer or page_data.get('brand', ''), # Fallback to brand
|
||||||
|
'damage_description': damage_description
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,7 +19,13 @@ from config import (
|
|||||||
)
|
)
|
||||||
from cache import CacheManager
|
from cache import CacheManager
|
||||||
from parse import DataParser
|
from parse import DataParser
|
||||||
from graphql_client import fetch_lot_bidding_data, format_bid_data
|
from graphql_client import (
|
||||||
|
fetch_lot_bidding_data, format_bid_data,
|
||||||
|
fetch_auction_data, format_auction_data,
|
||||||
|
extract_attributes_from_lot_json,
|
||||||
|
extract_enriched_attributes
|
||||||
|
)
|
||||||
|
from bid_history_client import fetch_bid_history, parse_bid_history
|
||||||
|
|
||||||
class TroostwijkScraper:
|
class TroostwijkScraper:
|
||||||
"""Main scraper class for Troostwijk Auctions"""
|
"""Main scraper class for Troostwijk Auctions"""
|
||||||
@@ -183,6 +189,31 @@ class TroostwijkScraper:
|
|||||||
print(f" Type: LOT")
|
print(f" Type: LOT")
|
||||||
print(f" Title: {page_data.get('title', 'N/A')[:60]}...")
|
print(f" Title: {page_data.get('title', 'N/A')[:60]}...")
|
||||||
|
|
||||||
|
# Extract ALL data from __NEXT_DATA__ lot object
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
lot_json = None
|
||||||
|
lot_uuid = None
|
||||||
|
|
||||||
|
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
|
||||||
|
if match:
|
||||||
|
try:
|
||||||
|
data = json.loads(match.group(1))
|
||||||
|
lot_json = data.get('props', {}).get('pageProps', {}).get('lot', {})
|
||||||
|
if lot_json:
|
||||||
|
# Basic attributes
|
||||||
|
attrs = extract_attributes_from_lot_json(lot_json)
|
||||||
|
page_data.update(attrs)
|
||||||
|
|
||||||
|
# Enriched attributes (year, condition, etc.)
|
||||||
|
enriched = extract_enriched_attributes(lot_json, page_data)
|
||||||
|
page_data.update(enriched)
|
||||||
|
|
||||||
|
# Get lot UUID for bid history
|
||||||
|
lot_uuid = lot_json.get('id')
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
# Fetch bidding data from GraphQL API
|
# Fetch bidding data from GraphQL API
|
||||||
lot_id = page_data.get('lot_id')
|
lot_id = page_data.get('lot_id')
|
||||||
print(f" Fetching bidding data from API...")
|
print(f" Fetching bidding data from API...")
|
||||||
@@ -190,11 +221,39 @@ class TroostwijkScraper:
|
|||||||
|
|
||||||
if bidding_data:
|
if bidding_data:
|
||||||
formatted_data = format_bid_data(bidding_data)
|
formatted_data = format_bid_data(bidding_data)
|
||||||
# Update page_data with real bidding info
|
|
||||||
page_data.update(formatted_data)
|
page_data.update(formatted_data)
|
||||||
print(f" Bid: {page_data.get('current_bid', 'N/A')}")
|
print(f" Bid: {page_data.get('current_bid', 'N/A')}")
|
||||||
print(f" Bid Count: {page_data.get('bid_count', 0)}")
|
print(f" Status: {page_data.get('status', 'N/A')}")
|
||||||
print(f" Closing: {page_data.get('closing_time', 'N/A')}")
|
|
||||||
|
# Extract bid increment from nextBidStepInCents
|
||||||
|
lot_details_lot = bidding_data.get('lot', {})
|
||||||
|
next_step_cents = lot_details_lot.get('nextBidStepInCents')
|
||||||
|
if next_step_cents:
|
||||||
|
page_data['bid_increment'] = next_step_cents / 100.0
|
||||||
|
|
||||||
|
# Get lot UUID if not already extracted
|
||||||
|
if not lot_uuid:
|
||||||
|
lot_uuid = lot_details_lot.get('id')
|
||||||
|
|
||||||
|
# Fetch bid history for intelligence
|
||||||
|
if lot_uuid and page_data.get('bid_count', 0) > 0:
|
||||||
|
print(f" Fetching bid history...")
|
||||||
|
bid_history = await fetch_bid_history(lot_uuid)
|
||||||
|
if bid_history:
|
||||||
|
bid_data = parse_bid_history(bid_history, lot_id)
|
||||||
|
page_data.update(bid_data)
|
||||||
|
print(f" Bid velocity: {bid_data['bid_velocity']} bids/hour")
|
||||||
|
|
||||||
|
# Save bid history to database
|
||||||
|
self.cache.save_bid_history(lot_id, bid_data['bid_records'])
|
||||||
|
|
||||||
|
# Fetch auction data for viewing/pickup times if we have auction_id
|
||||||
|
auction_id = page_data.get('auction_id')
|
||||||
|
if auction_id:
|
||||||
|
auction_data = await fetch_auction_data(auction_id)
|
||||||
|
if auction_data:
|
||||||
|
auction_times = format_auction_data(auction_data)
|
||||||
|
page_data.update(auction_times)
|
||||||
else:
|
else:
|
||||||
print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")
|
print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")
|
||||||
|
|
||||||
|
|||||||
28
test_auction_fetch.py
Normal file
28
test_auction_fetch.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Test auction data fetch"""
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0, 'src')
|
||||||
|
|
||||||
|
from graphql_client import fetch_auction_data, format_auction_data
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa"
|
||||||
|
|
||||||
|
print(f"Fetching auction: {auction_id}\n")
|
||||||
|
auction_data = await fetch_auction_data(auction_id)
|
||||||
|
|
||||||
|
if auction_data:
|
||||||
|
print("Raw Auction Data:")
|
||||||
|
print(json.dumps(auction_data, indent=2))
|
||||||
|
|
||||||
|
print("\n\nFormatted:")
|
||||||
|
formatted = format_auction_data(auction_data)
|
||||||
|
print(f"Viewing: {formatted['viewing_time']}")
|
||||||
|
print(f"Pickup: {formatted['pickup_date']}")
|
||||||
|
else:
|
||||||
|
print("No auction data returned")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
59
test_auction_query.py
Normal file
59
test_auction_query.py
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Test if the auction query works at all"""
|
||||||
|
import asyncio
|
||||||
|
import aiohttp
|
||||||
|
import json
|
||||||
|
|
||||||
|
GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"
|
||||||
|
|
||||||
|
# Try a simpler query first
|
||||||
|
SIMPLE_QUERY = """
|
||||||
|
query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
|
||||||
|
auction(id: $auctionId, locale: $locale, platform: $platform) {
|
||||||
|
id
|
||||||
|
displayId
|
||||||
|
viewingDays {
|
||||||
|
startDate
|
||||||
|
endDate
|
||||||
|
city
|
||||||
|
countryCode
|
||||||
|
}
|
||||||
|
collectionDays {
|
||||||
|
startDate
|
||||||
|
endDate
|
||||||
|
city
|
||||||
|
countryCode
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa"
|
||||||
|
|
||||||
|
variables = {
|
||||||
|
"auctionId": auction_id,
|
||||||
|
"locale": "nl",
|
||||||
|
"platform": "TWK"
|
||||||
|
}
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"query": SIMPLE_QUERY,
|
||||||
|
"variables": variables
|
||||||
|
}
|
||||||
|
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
|
||||||
|
print(f"Status: {response.status}")
|
||||||
|
text = await response.text()
|
||||||
|
print(f"Response: {text}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
data = await response.json()
|
||||||
|
print(f"\nParsed:")
|
||||||
|
print(json.dumps(data, indent=2))
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
95
test_comprehensive.py
Normal file
95
test_comprehensive.py
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Test comprehensive data enrichment"""
|
||||||
|
import asyncio
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0, 'src')
|
||||||
|
|
||||||
|
from scraper import TroostwijkScraper
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
scraper = TroostwijkScraper()
|
||||||
|
|
||||||
|
from playwright.async_api import async_playwright
|
||||||
|
|
||||||
|
async with async_playwright() as p:
|
||||||
|
browser = await p.chromium.launch(headless=True)
|
||||||
|
page = await browser.new_page(
|
||||||
|
viewport={'width': 1920, 'height': 1080},
|
||||||
|
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Test with lot that has bids
|
||||||
|
lot_url = "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5"
|
||||||
|
|
||||||
|
print(f"Testing comprehensive extraction\n")
|
||||||
|
result = await scraper.crawl_page(page, lot_url)
|
||||||
|
|
||||||
|
if result:
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print("COMPREHENSIVE DATA EXTRACTION:")
|
||||||
|
print(f"{'='*60}")
|
||||||
|
print(f"Lot ID: {result.get('lot_id')}")
|
||||||
|
print(f"Title: {result.get('title', '')[:50]}...")
|
||||||
|
print(f"\n[Bidding Intelligence]")
|
||||||
|
print(f" Status: {result.get('status')}")
|
||||||
|
print(f" Current Bid: {result.get('current_bid')}")
|
||||||
|
print(f" Starting Bid: {result.get('starting_bid')}")
|
||||||
|
print(f" Bid Increment: EUR {result.get('bid_increment', 0):.2f}")
|
||||||
|
print(f" Bid Count: {result.get('bid_count')}")
|
||||||
|
print(f" First Bid: {result.get('first_bid_time', 'N/A')}")
|
||||||
|
print(f" Last Bid: {result.get('last_bid_time', 'N/A')}")
|
||||||
|
print(f" Bid Velocity: {result.get('bid_velocity', 0)} bids/hour")
|
||||||
|
print(f"\n[Valuation Intelligence]")
|
||||||
|
print(f" Brand: {result.get('brand', 'N/A')}")
|
||||||
|
print(f" Model: {result.get('model', 'N/A')}")
|
||||||
|
print(f" Year: {result.get('year_manufactured', 'N/A')}")
|
||||||
|
print(f" Manufacturer: {result.get('manufacturer', 'N/A')}")
|
||||||
|
print(f" Condition Score: {result.get('condition_score', 'N/A')}")
|
||||||
|
print(f" Condition: {result.get('condition_description', 'N/A')}")
|
||||||
|
print(f" Serial#: {result.get('serial_number', 'N/A')}")
|
||||||
|
print(f" Damage: {result.get('damage_description', 'N/A')[:50] if result.get('damage_description') else 'N/A'}...")
|
||||||
|
|
||||||
|
await browser.close()
|
||||||
|
|
||||||
|
# Verify database
|
||||||
|
import sqlite3
|
||||||
|
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
|
||||||
|
|
||||||
|
# Check lot data
|
||||||
|
cursor = conn.execute("""
|
||||||
|
SELECT bid_velocity, first_bid_time, year_manufactured, condition_score
|
||||||
|
FROM lots
|
||||||
|
WHERE lot_id = ?
|
||||||
|
""", (result.get('lot_id'),))
|
||||||
|
row = cursor.fetchone()
|
||||||
|
|
||||||
|
if row:
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print("DATABASE VERIFICATION (lots table):")
|
||||||
|
print(f"{'='*60}")
|
||||||
|
print(f" Bid Velocity: {row[0]}")
|
||||||
|
print(f" First Bid Time: {row[1]}")
|
||||||
|
print(f" Year: {row[2]}")
|
||||||
|
print(f" Condition Score: {row[3]}")
|
||||||
|
|
||||||
|
# Check bid history
|
||||||
|
cursor = conn.execute("""
|
||||||
|
SELECT COUNT(*), MIN(bid_time), MAX(bid_time), SUM(is_autobid)
|
||||||
|
FROM bid_history
|
||||||
|
WHERE lot_id = ?
|
||||||
|
""", (result.get('lot_id'),))
|
||||||
|
row = cursor.fetchone()
|
||||||
|
|
||||||
|
if row and row[0] > 0:
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print("DATABASE VERIFICATION (bid_history table):")
|
||||||
|
print(f"{'='*60}")
|
||||||
|
print(f" Total Bids Stored: {row[0]}")
|
||||||
|
print(f" First Bid: {row[1]}")
|
||||||
|
print(f" Last Bid: {row[2]}")
|
||||||
|
print(f" Autobids: {row[3]}")
|
||||||
|
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
64
test_new_fields.py
Normal file
64
test_new_fields.py
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Test the new fields extraction"""
|
||||||
|
import asyncio
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0, 'src')
|
||||||
|
|
||||||
|
from scraper import TroostwijkScraper
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
scraper = TroostwijkScraper()
|
||||||
|
|
||||||
|
from playwright.async_api import async_playwright
|
||||||
|
|
||||||
|
async with async_playwright() as p:
|
||||||
|
browser = await p.chromium.launch(headless=True)
|
||||||
|
page = await browser.new_page(
|
||||||
|
viewport={'width': 1920, 'height': 1080},
|
||||||
|
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Test with lot that has attributes
|
||||||
|
lot_url = "https://www.troostwijkauctions.com/l/47-5kg-hexagon-dumbbell-%25282x%2529-A1-40668-34"
|
||||||
|
|
||||||
|
print(f"Testing new fields with: {lot_url}\n")
|
||||||
|
result = await scraper.crawl_page(page, lot_url)
|
||||||
|
|
||||||
|
if result:
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print("EXTRACTED FIELDS:")
|
||||||
|
print(f"{'='*60}")
|
||||||
|
print(f"Lot ID: {result.get('lot_id')}")
|
||||||
|
print(f"Title: {result.get('title', '')[:50]}...")
|
||||||
|
print(f"Status: {result.get('status')}")
|
||||||
|
print(f"Brand: {result.get('brand')}")
|
||||||
|
print(f"Model: {result.get('model')}")
|
||||||
|
print(f"Viewing Time: {result.get('viewing_time', 'N/A')}")
|
||||||
|
print(f"Pickup Date: {result.get('pickup_date', 'N/A')}")
|
||||||
|
print(f"Attributes: {result.get('attributes_json', '')[:100]}...")
|
||||||
|
|
||||||
|
await browser.close()
|
||||||
|
|
||||||
|
# Verify database
|
||||||
|
import sqlite3
|
||||||
|
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
|
||||||
|
cursor = conn.execute("""
|
||||||
|
SELECT status, brand, model, viewing_time, pickup_date
|
||||||
|
FROM lots
|
||||||
|
WHERE lot_id = ?
|
||||||
|
""", (result.get('lot_id'),))
|
||||||
|
row = cursor.fetchone()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
if row:
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print("DATABASE VERIFICATION:")
|
||||||
|
print(f"{'='*60}")
|
||||||
|
print(f"Status: {row[0]}")
|
||||||
|
print(f"Brand: {row[1]}")
|
||||||
|
print(f"Model: {row[2]}")
|
||||||
|
print(f"Viewing: {row[3][:100] if row[3] else 'N/A'}...")
|
||||||
|
print(f"Pickup: {row[4][:100] if row[4] else 'N/A'}...")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
Reference in New Issue
Block a user