enrich data
This commit is contained in:
143
COMPREHENSIVE_UPDATE_PLAN.md
Normal file
143
COMPREHENSIVE_UPDATE_PLAN.md
Normal file
@@ -0,0 +1,143 @@
|
||||
# Comprehensive Data Enrichment Plan
|
||||
|
||||
## Current Status: Working Features
|
||||
✅ Image downloads (concurrent)
|
||||
✅ Basic bid data (current_bid, starting_bid, minimum_bid, bid_count, closing_time)
|
||||
✅ Status extraction
|
||||
✅ Brand/Model from attributes
|
||||
✅ Attributes JSON storage
|
||||
|
||||
## Phase 1: Core Bidding Intelligence (HIGH PRIORITY)
|
||||
|
||||
### Data Sources Identified:
|
||||
1. **GraphQL lot bidding API** - Already integrated
|
||||
- currentBidAmount, initialAmount, bidsCount
|
||||
- startDate, endDate (for first_bid_time calculation)
|
||||
|
||||
2. **REST bid history API** ✨ NEW DISCOVERY
|
||||
- Endpoint: `https://shared-api.tbauctions.com/bidmanagement/lots/{lot_uuid}/bidding-history`
|
||||
- Returns: bid amounts, timestamps, autobid flags, bidder IDs
|
||||
- Pagination supported
|
||||
|
||||
### Database Schema Changes:
|
||||
|
||||
```sql
|
||||
-- Extend lots table with bidding intelligence
|
||||
ALTER TABLE lots ADD COLUMN estimated_min DECIMAL(12,2);
|
||||
ALTER TABLE lots ADD COLUMN estimated_max DECIMAL(12,2);
|
||||
ALTER TABLE lots ADD COLUMN reserve_price DECIMAL(12,2);
|
||||
ALTER TABLE lots ADD COLUMN reserve_met BOOLEAN DEFAULT FALSE;
|
||||
ALTER TABLE lots ADD COLUMN bid_increment DECIMAL(12,2);
|
||||
ALTER TABLE lots ADD COLUMN watch_count INTEGER DEFAULT 0;
|
||||
ALTER TABLE lots ADD COLUMN first_bid_time TEXT;
|
||||
ALTER TABLE lots ADD COLUMN last_bid_time TEXT;
|
||||
ALTER TABLE lots ADD COLUMN bid_velocity DECIMAL(5,2);
|
||||
|
||||
-- NEW: Bid history table
|
||||
CREATE TABLE IF NOT EXISTS bid_history (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
lot_id TEXT NOT NULL,
|
||||
lot_uuid TEXT NOT NULL,
|
||||
bid_amount DECIMAL(12,2) NOT NULL,
|
||||
bid_time TEXT NOT NULL,
|
||||
is_winning BOOLEAN DEFAULT FALSE,
|
||||
is_autobid BOOLEAN DEFAULT FALSE,
|
||||
bidder_id TEXT,
|
||||
bidder_number INTEGER,
|
||||
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_bid_history_lot_time ON bid_history(lot_id, bid_time);
|
||||
CREATE INDEX IF NOT EXISTS idx_bid_history_bidder ON bid_history(bidder_id);
|
||||
```
|
||||
|
||||
### Implementation:
|
||||
- Add `fetch_bid_history()` function to call REST API
|
||||
- Parse and store all historical bids
|
||||
- Calculate bid_velocity (bids per hour)
|
||||
- Extract first_bid_time, last_bid_time
|
||||
|
||||
## Phase 2: Valuation Intelligence
|
||||
|
||||
### Data Sources:
|
||||
1. **Attributes array** (already in __NEXT_DATA__)
|
||||
- condition, year, manufacturer, model, serial_number
|
||||
|
||||
2. **Description field**
|
||||
- Extract year patterns, condition mentions, damage descriptions
|
||||
|
||||
### Database Schema:
|
||||
|
||||
```sql
|
||||
-- Valuation fields
|
||||
ALTER TABLE lots ADD COLUMN condition_score DECIMAL(3,2);
|
||||
ALTER TABLE lots ADD COLUMN condition_description TEXT;
|
||||
ALTER TABLE lots ADD COLUMN year_manufactured INTEGER;
|
||||
ALTER TABLE lots ADD COLUMN serial_number TEXT;
|
||||
ALTER TABLE lots ADD COLUMN manufacturer TEXT;
|
||||
ALTER TABLE lots ADD COLUMN damage_description TEXT;
|
||||
ALTER TABLE lots ADD COLUMN provenance TEXT;
|
||||
```
|
||||
|
||||
### Implementation:
|
||||
- Parse attributes for: Jaar, Conditie, Serienummer, Fabrikant
|
||||
- Extract 4-digit years from title/description
|
||||
- Map condition values to 0-10 scale
|
||||
|
||||
## Phase 3: Auction House Intelligence
|
||||
|
||||
### Data Sources:
|
||||
1. **GraphQL auction query**
|
||||
- Already partially working
|
||||
|
||||
2. **Auction __NEXT_DATA__**
|
||||
- May contain buyer's premium, shipping costs
|
||||
|
||||
### Database Schema:
|
||||
|
||||
```sql
|
||||
ALTER TABLE auctions ADD COLUMN buyers_premium_percent DECIMAL(5,2);
|
||||
ALTER TABLE auctions ADD COLUMN shipping_available BOOLEAN;
|
||||
ALTER TABLE auctions ADD COLUMN payment_methods TEXT;
|
||||
```
|
||||
|
||||
## Viewing/Pickup Times Resolution
|
||||
|
||||
### Finding:
|
||||
- `viewingDays` and `collectionDays` in GraphQL only return location (city, countryCode)
|
||||
- Times are NOT in the GraphQL API
|
||||
- Times must be in auction __NEXT_DATA__ or not set for many auctions
|
||||
|
||||
### Solution:
|
||||
- Mark viewing_time/pickup_date as "location only" when times unavailable
|
||||
- Store: "Nijmegen, NL" instead of full date/time string
|
||||
- Accept that many auctions don't have viewing times set
|
||||
|
||||
## Priority Implementation Order:
|
||||
|
||||
1. **BID HISTORY API** (30 min) - Highest value
|
||||
- Fetch and store all bid history
|
||||
- Calculate bid_velocity
|
||||
- Track autobid patterns
|
||||
|
||||
2. **ENRICHED ATTRIBUTES** (20 min) - Medium-high value
|
||||
- Extract year, condition, manufacturer from existing data
|
||||
- Parse description for damage/condition mentions
|
||||
|
||||
3. **VIEWING/PICKUP FIX** (10 min) - Low value (data often missing)
|
||||
- Update to store location-only when times unavailable
|
||||
|
||||
## Data Quality Expectations:
|
||||
|
||||
| Field | Coverage Expected | Source |
|
||||
|-------|------------------|---------|
|
||||
| bid_history | 100% (for lots with bids) | REST API |
|
||||
| bid_velocity | 100% (calculated) | Derived |
|
||||
| year_manufactured | ~40% | Attributes/Title |
|
||||
| condition_score | ~30% | Attributes |
|
||||
| manufacturer | ~60% | Attributes |
|
||||
| viewing_time | ~20% | Often not set |
|
||||
| buyers_premium | 100% | GraphQL/Props |
|
||||
|
||||
## Estimated Total Implementation Time: 60-90 minutes
|
||||
@@ -278,7 +278,7 @@ SELECT lot_id, current_bid, bid_count FROM lots WHERE bid_count > 0;
|
||||
### 3. **Analytics & Reporting**
|
||||
```sqlite
|
||||
-- Top locations
|
||||
SELECT location, COUNT(*) as lot_count FROM lots GROUP BY location;
|
||||
SELECT location, COUNT(*) as lots_count FROM lots GROUP BY location;
|
||||
|
||||
-- Auction statistics
|
||||
SELECT
|
||||
|
||||
67
check_graphql_full.py
Normal file
67
check_graphql_full.py
Normal file
@@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Check if GraphQL has viewing/pickup data"""
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
sys.path.insert(0, 'src')
|
||||
|
||||
from graphql_client import GRAPHQL_ENDPOINT
|
||||
import aiohttp
|
||||
|
||||
# Expanded query to check for all available fields
|
||||
EXTENDED_QUERY = """
|
||||
query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
|
||||
lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
|
||||
lot {
|
||||
id
|
||||
displayId
|
||||
auctionId
|
||||
currentBidAmount { cents currency }
|
||||
initialAmount { cents currency }
|
||||
nextMinimalBid { cents currency }
|
||||
bidsCount
|
||||
startDate
|
||||
endDate
|
||||
|
||||
# Try to find viewing/pickup fields
|
||||
viewingDays { startDate endDate city countryCode }
|
||||
collectionDays { startDate endDate city countryCode }
|
||||
pickupDays { startDate endDate city countryCode }
|
||||
}
|
||||
auction {
|
||||
id
|
||||
displayId
|
||||
viewingDays { startDate endDate city countryCode }
|
||||
collectionDays { startDate endDate city countryCode }
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
async def main():
    """Query the GraphQL API for one known lot and dump the raw response."""
    payload = {
        "query": EXTENDED_QUERY,
        "variables": {
            "lotDisplayId": "A1-28505-5",
            "locale": "nl",
            "platform": "TWK",
        },
    }

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
                if response.status != 200:
                    # Non-200: show status and raw body for debugging.
                    print(f"Error: {response.status}")
                    print(await response.text())
                    return
                data = await response.json()
                print("Full GraphQL Response:")
                print(json.dumps(data, indent=2))
    except Exception as e:
        print(f"Exception: {e}")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
65
deep_inspect_lot.py
Normal file
65
deep_inspect_lot.py
Normal file
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env python3
"""Deep inspect lot JSON for viewing/pickup data"""
import sqlite3
import zlib
import json
import re

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

cursor = conn.execute("""
    SELECT url, content
    FROM cache
    WHERE url LIKE '%/l/%'
    ORDER BY timestamp DESC
    LIMIT 1
""")

# Single most-recent cached lot page.
url, content_blob = cursor.fetchone()
content = zlib.decompress(content_blob).decode('utf-8')

# Pull the Next.js page payload out of the HTML.
next_data = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
page_json = json.loads(next_data.group(1))
lot = page_json.get('props', {}).get('pageProps', {}).get('lot', {})

print(f"Inspecting: {url}\n")

# Dump each interesting sub-object when present.
if 'onboarding' in lot:
    print("ONBOARDING:")
    print(json.dumps(lot['onboarding'], indent=2))
    print()

if 'attributes' in lot:
    print("ATTRIBUTES:")
    attrs = lot['attributes']
    sample = attrs[:3] if isinstance(attrs, list) else attrs
    print(json.dumps(sample, indent=2))
    print()

if 'condition' in lot:
    print("CONDITION:")
    print(json.dumps(lot['condition'], indent=2))
    print()

if 'appearance' in lot:
    print("APPEARANCE:")
    print(json.dumps(lot['appearance'], indent=2))
    print()

if 'location' in lot:
    print("LOCATION:")
    print(json.dumps(lot['location'], indent=2))
    print()

# Surface any other keys that look like viewing/pickup/time data.
print("\nFIELDS WITH VIEWING/PICKUP/TIME:")
for key in lot.keys():
    if any(term in key.lower() for term in ['view', 'pick', 'collect', 'date', 'time', 'day']):
        print(f" {key}: {lot[key]}")

conn.close()
|
||||
93
explore_auction_schema.py
Normal file
93
explore_auction_schema.py
Normal file
@@ -0,0 +1,93 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Explore the actual auction schema"""
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import json
|
||||
|
||||
GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"
|
||||
|
||||
# Try different field structures
|
||||
QUERIES = {
|
||||
"viewingDays_simple": """
|
||||
query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
|
||||
auction(id: $auctionId, locale: $locale, platform: $platform) {
|
||||
viewingDays {
|
||||
city
|
||||
countryCode
|
||||
}
|
||||
}
|
||||
}
|
||||
""",
|
||||
"viewingDays_with_times": """
|
||||
query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
|
||||
auction(id: $auctionId, locale: $locale, platform: $platform) {
|
||||
viewingDays {
|
||||
from
|
||||
to
|
||||
city
|
||||
}
|
||||
}
|
||||
}
|
||||
""",
|
||||
"full_auction": """
|
||||
query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
|
||||
auction(id: $auctionId, locale: $locale, platform: $platform) {
|
||||
id
|
||||
displayId
|
||||
biddingStatus
|
||||
buyersPremium
|
||||
viewingDays {
|
||||
city
|
||||
countryCode
|
||||
from
|
||||
to
|
||||
}
|
||||
collectionDays {
|
||||
city
|
||||
countryCode
|
||||
from
|
||||
to
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
}
|
||||
|
||||
async def test_query(name, query, auction_id):
    """Run one candidate query against the GraphQL endpoint and print the outcome."""
    payload = {
        "query": query,
        "variables": {
            "auctionId": auction_id,
            "locale": "nl",
            "platform": "TWK",
        },
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
            data = await response.json()

    # Banner so consecutive probe results are easy to tell apart.
    print(f"\n{'='*60}")
    print(f"QUERY: {name}")
    print(f"{'='*60}")

    if 'errors' in data:
        print("ERRORS:")
        for error in data['errors']:
            print(f" {error}")
    else:
        print("SUCCESS:")
        print(json.dumps(data, indent=2))
|
||||
|
||||
async def main():
    """Probe each candidate query shape against a known auction UUID."""
    # Test with the auction we know exists
    auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa"

    for name, query in QUERIES.items():
        await test_query(name, query, auction_id)
        # Small pause between probes to avoid hammering the endpoint.
        await asyncio.sleep(0.5)


if __name__ == "__main__":
    asyncio.run(main())
|
||||
49
inspect_lot_html.py
Normal file
49
inspect_lot_html.py
Normal file
@@ -0,0 +1,49 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Inspect a lot page HTML to find viewing_time and pickup_date"""
|
||||
import asyncio
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
async def main():
    """Render one known lot page and grep its HTML for viewing/pickup text."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        # Use the known lot
        await page.goto("https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102", wait_until='networkidle')
        content = await page.content()

        print("Searching for patterns...")
        print("="*60)

        import re
        # Dutch date-range shape: "05 dec 2025 van 09:00 tot 12:00".
        patterns = {
            'Bezichtigingen': r'Bezichtigingen.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
            'viewing': r'(?i)viewing.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
            'Ophalen': r'Ophalen.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
            'pickup': r'(?i)pickup.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
            'Status': r'Status\s+([^<]+)',
        }

        for name, pattern in patterns.items():
            hits = re.findall(pattern, content, re.DOTALL | re.MULTILINE)
            if hits:
                print(f"\n{name}:")
                for hit in hits[:3]:
                    print(f" {hit[:200]}")

        # Also look for structured data around the two section headers.
        print("\n\nSearching for 'Bezichtigingen' section:")
        section = re.search(r'Bezichtigingen.*?<.*?>(.*?)</.*?>', content, re.DOTALL)
        if section:
            print(section.group(0)[:500])

        print("\n\nSearching for 'Ophalen' section:")
        section = re.search(r'Ophalen.*?<.*?>(.*?)</.*?>', content, re.DOTALL)
        if section:
            print(section.group(0)[:500])

        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
121
src/bid_history_client.py
Normal file
121
src/bid_history_client.py
Normal file
@@ -0,0 +1,121 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Client for fetching bid history from Troostwijk REST API
|
||||
"""
|
||||
import aiohttp
|
||||
from typing import Dict, List, Optional
|
||||
from datetime import datetime
|
||||
|
||||
BID_HISTORY_ENDPOINT = "https://shared-api.tbauctions.com/bidmanagement/lots/{lot_uuid}/bidding-history"
|
||||
|
||||
|
||||
async def fetch_bid_history(lot_uuid: str, page_size: int = 100) -> Optional[List[Dict]]:
    """
    Fetch the complete bid history for a lot, paging through the REST API.

    Args:
        lot_uuid: The lot UUID (from the GraphQL response)
        page_size: Number of bids requested per page

    Returns:
        List of raw bid dictionaries, or None when the first request fails
        or the lot has no bids. If a *later* page fails, the bids collected
        so far are returned (partial history beats none).
    """
    all_bids: List[Dict] = []
    page_number = 1

    try:
        async with aiohttp.ClientSession() as session:
            # Loop until the API reports no further pages.
            while True:
                url = BID_HISTORY_ENDPOINT.format(lot_uuid=lot_uuid)
                params = {"pageNumber": page_number, "pageSize": page_size}

                async with session.get(url, params=params, timeout=30) as response:
                    if response.status != 200:
                        # First page failing => lot unknown/inaccessible.
                        # Mid-pagination failure => keep what we have.
                        return None if page_number == 1 else all_bids

                    data = await response.json()
                    all_bids.extend(data.get('results', []))

                    if not data.get('hasNext', False):
                        break
                    page_number += 1

        return all_bids if all_bids else None

    except Exception as e:
        # Network/timeout problems are non-fatal for the scrape as a whole.
        print(f" Bid history fetch failed: {e}")
        return None
|
||||
|
||||
|
||||
def parse_bid_history(bid_history: List[Dict], lot_id: str) -> Dict:
    """
    Parse raw bid history into database-ready records plus derived metrics.

    Args:
        bid_history: Raw bid entries from the REST API
        lot_id: The lot display ID (e.g., "A1-28505-5")

    Returns:
        Dict with:
            bid_records: row dicts for the bid_history table
            first_bid_time / last_bid_time: 'YYYY-MM-DD HH:MM:SS' or None
            bid_velocity: bids per hour over the first-to-last-bid span,
                rounded to 2 decimals (0.0 for fewer than two parseable bids)
    """
    if not bid_history:
        return {
            'bid_records': [],
            'first_bid_time': None,
            'last_bid_time': None,
            'bid_velocity': 0.0
        }

    bid_records = []
    for bid in bid_history:
        # Amounts arrive in cents; store euros as a float.
        cents = bid.get('currentBid', {}).get('cents', 0)
        bid_records.append({
            'lot_id': lot_id,
            'bid_amount': cents / 100.0 if cents else 0.0,
            'bid_time': bid.get('createdAt', ''),
            'is_autobid': bid.get('autoBid', False),
            'bidder_id': bid.get('buyerId', ''),
            'bidder_number': bid.get('buyerNumber', 0)
        })

    # Parse ISO timestamps (e.g. "2025-12-04T17:17:45.694698Z"); skip
    # malformed/missing values instead of aborting the whole lot.
    bid_times = []
    for record in bid_records:
        try:
            bid_times.append(
                datetime.fromisoformat(record['bid_time'].replace('Z', '+00:00'))
            )
        except ValueError:
            pass

    first_bid_time = None
    last_bid_time = None
    bid_velocity = 0.0

    if bid_times:
        bid_times.sort()
        first_bid_time = bid_times[0].strftime('%Y-%m-%d %H:%M:%S')
        last_bid_time = bid_times[-1].strftime('%Y-%m-%d %H:%M:%S')

        if len(bid_times) > 1:
            # Velocity = bids per hour across the observed span.
            span_hours = (bid_times[-1] - bid_times[0]).total_seconds() / 3600
            if span_hours > 0:
                bid_velocity = len(bid_times) / span_hours

    return {
        'bid_records': bid_records,
        'first_bid_time': first_bid_time,
        'last_bid_time': last_bid_time,
        'bid_velocity': round(bid_velocity, 2)
    }
|
||||
104
src/cache.py
104
src/cache.py
@@ -82,6 +82,63 @@ class CacheManager:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN starting_bid TEXT")
|
||||
if 'minimum_bid' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN minimum_bid TEXT")
|
||||
if 'status' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN status TEXT")
|
||||
if 'brand' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN brand TEXT")
|
||||
if 'model' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN model TEXT")
|
||||
if 'attributes_json' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN attributes_json TEXT")
|
||||
|
||||
# Bidding intelligence fields
|
||||
if 'first_bid_time' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN first_bid_time TEXT")
|
||||
if 'last_bid_time' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN last_bid_time TEXT")
|
||||
if 'bid_velocity' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN bid_velocity REAL")
|
||||
if 'bid_increment' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN bid_increment REAL")
|
||||
|
||||
# Valuation intelligence fields
|
||||
if 'year_manufactured' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN year_manufactured INTEGER")
|
||||
if 'condition_score' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN condition_score REAL")
|
||||
if 'condition_description' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN condition_description TEXT")
|
||||
if 'serial_number' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN serial_number TEXT")
|
||||
if 'manufacturer' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN manufacturer TEXT")
|
||||
if 'damage_description' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN damage_description TEXT")
|
||||
|
||||
# Create bid_history table
|
||||
conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS bid_history (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
lot_id TEXT NOT NULL,
|
||||
bid_amount REAL NOT NULL,
|
||||
bid_time TEXT NOT NULL,
|
||||
is_autobid INTEGER DEFAULT 0,
|
||||
bidder_id TEXT,
|
||||
bidder_number INTEGER,
|
||||
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
|
||||
)
|
||||
""")
|
||||
|
||||
conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_bid_history_lot_time
|
||||
ON bid_history(lot_id, bid_time)
|
||||
""")
|
||||
|
||||
conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_bid_history_bidder
|
||||
ON bid_history(bidder_id)
|
||||
""")
|
||||
|
||||
# Remove duplicates before creating unique index
|
||||
# Keep the row with the smallest id (first occurrence) for each (lot_id, url) pair
|
||||
@@ -178,8 +235,12 @@ class CacheManager:
|
||||
INSERT OR REPLACE INTO lots
|
||||
(lot_id, auction_id, url, title, current_bid, starting_bid, minimum_bid,
|
||||
bid_count, closing_time, viewing_time, pickup_date, location, description,
|
||||
category, scraped_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
category, status, brand, model, attributes_json,
|
||||
first_bid_time, last_bid_time, bid_velocity, bid_increment,
|
||||
year_manufactured, condition_score, condition_description,
|
||||
serial_number, manufacturer, damage_description,
|
||||
scraped_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""", (
|
||||
lot_data['lot_id'],
|
||||
lot_data.get('auction_id', ''),
|
||||
@@ -195,10 +256,49 @@ class CacheManager:
|
||||
lot_data.get('location', ''),
|
||||
lot_data.get('description', ''),
|
||||
lot_data.get('category', ''),
|
||||
lot_data.get('status', ''),
|
||||
lot_data.get('brand', ''),
|
||||
lot_data.get('model', ''),
|
||||
lot_data.get('attributes_json', ''),
|
||||
lot_data.get('first_bid_time'),
|
||||
lot_data.get('last_bid_time'),
|
||||
lot_data.get('bid_velocity'),
|
||||
lot_data.get('bid_increment'),
|
||||
lot_data.get('year_manufactured'),
|
||||
lot_data.get('condition_score'),
|
||||
lot_data.get('condition_description', ''),
|
||||
lot_data.get('serial_number', ''),
|
||||
lot_data.get('manufacturer', ''),
|
||||
lot_data.get('damage_description', ''),
|
||||
lot_data['scraped_at']
|
||||
))
|
||||
conn.commit()
|
||||
|
||||
def save_bid_history(self, lot_id: str, bid_records: List[Dict]):
    """Replace the stored bid history for *lot_id* with *bid_records*."""
    if not bid_records:
        return

    with sqlite3.connect(self.db_path) as conn:
        # Full refresh: drop the lot's old history, then re-insert everything.
        conn.execute("DELETE FROM bid_history WHERE lot_id = ?", (lot_id,))

        rows = [
            (
                record['lot_id'],
                record['bid_amount'],
                record['bid_time'],
                1 if record['is_autobid'] else 0,  # SQLite has no BOOLEAN
                record['bidder_id'],
                record['bidder_number'],
            )
            for record in bid_records
        ]
        conn.executemany("""
            INSERT INTO bid_history
            (lot_id, bid_amount, bid_time, is_autobid, bidder_id, bidder_number)
            VALUES (?, ?, ?, ?, ?, ?)
        """, rows)
        conn.commit()
|
||||
|
||||
def save_images(self, lot_id: str, image_urls: List[str]):
|
||||
"""Save image URLs for a lot (prevents duplicates via unique constraint)"""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
|
||||
@@ -7,6 +7,27 @@ from typing import Dict, Optional
|
||||
|
||||
GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"
|
||||
|
||||
AUCTION_QUERY = """
|
||||
query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
|
||||
auction(id: $auctionId, locale: $locale, platform: $platform) {
|
||||
id
|
||||
displayId
|
||||
viewingDays {
|
||||
startDate
|
||||
endDate
|
||||
city
|
||||
countryCode
|
||||
}
|
||||
collectionDays {
|
||||
startDate
|
||||
endDate
|
||||
city
|
||||
countryCode
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
LOT_BIDDING_QUERY = """
|
||||
query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
|
||||
lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
|
||||
@@ -44,6 +65,42 @@ query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platfo
|
||||
"""
|
||||
|
||||
|
||||
async def fetch_auction_data(auction_id: str) -> Optional[Dict]:
    """
    Fetch auction data (viewing/pickup times) from the GraphQL API.

    Args:
        auction_id: The auction UUID

    Returns:
        Dict with auction data, or None when the request fails or the
        API returns no auction object.
    """
    payload = {
        "query": AUCTION_QUERY,
        "variables": {
            "auctionId": auction_id,
            "locale": "nl",
            "platform": "TWK",
        },
    }

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
                if response.status != 200:
                    return None
                data = await response.json()
                # Empty/missing auction object is treated as "not found".
                auction = data.get('data', {}).get('auction', {})
                return auction if auction else None
    except Exception:
        # Network errors are non-fatal; the caller falls back to page data.
        return None
|
||||
|
||||
|
||||
async def fetch_lot_bidding_data(lot_display_id: str) -> Optional[Dict]:
|
||||
"""
|
||||
Fetch lot bidding data from GraphQL API
|
||||
@@ -127,6 +184,15 @@ def format_bid_data(lot_details: Dict) -> Dict:
|
||||
return ''
|
||||
return ''
|
||||
|
||||
# Format status from minimumBidAmountMet
|
||||
minimum_bid_met = lot.get('minimumBidAmountMet', '')
|
||||
status_map = {
|
||||
'NO_MINIMUM_BID_AMOUNT': 'Geen Minimumprijs',
|
||||
'MINIMUM_BID_AMOUNT_NOT_MET': 'Minimumprijs nog niet gehaald',
|
||||
'MINIMUM_BID_AMOUNT_MET': 'Minimumprijs gehaald'
|
||||
}
|
||||
status = status_map.get(minimum_bid_met, '')
|
||||
|
||||
return {
|
||||
'current_bid': current_bid,
|
||||
'starting_bid': starting_bid,
|
||||
@@ -135,4 +201,209 @@ def format_bid_data(lot_details: Dict) -> Dict:
|
||||
'closing_time': format_timestamp(end_date),
|
||||
'bidding_status': lot.get('biddingStatus', ''),
|
||||
'vat_percentage': lot.get('vat', 0),
|
||||
'status': status,
|
||||
'auction_id': lot.get('auctionId', ''),
|
||||
}
|
||||
|
||||
|
||||
def format_auction_data(auction: Dict) -> Dict:
    """
    Extract viewing/pickup times from auction data.

    Args:
        auction: Auction data from GraphQL; 'viewingDays' / 'collectionDays'
            are expected to be lists of {startDate, endDate, city, countryCode}.

    Returns:
        Dict with 'viewing_time' and 'pickup_date' as Dutch-formatted
        strings like "vr 05 dec 2025 van 09:00 tot 12:00" plus an optional
        "City, CC" second line; empty string when data is missing/unparseable.
    """
    from datetime import datetime

    # Dutch day/month abbreviations for the display format.
    days_nl = ['ma', 'di', 'wo', 'do', 'vr', 'za', 'zo']
    months_nl = ['jan', 'feb', 'mrt', 'apr', 'mei', 'jun',
                 'jul', 'aug', 'sep', 'okt', 'nov', 'dec']

    def format_days(days_list):
        # Only the first entry is formatted.
        if not days_list or not isinstance(days_list, list):
            return ''

        first_day = days_list[0]
        start_ts = first_day.get('startDate')
        end_ts = first_day.get('endDate')
        city = first_day.get('city', '')
        country = first_day.get('countryCode', '').upper()

        if not start_ts or not end_ts:
            return ''

        try:
            # Assumes startDate/endDate are Unix timestamps in seconds —
            # TODO confirm against the API (ms would overflow/shift dates).
            start_dt = datetime.fromtimestamp(start_ts)
            end_dt = datetime.fromtimestamp(end_ts)

            day_name = days_nl[start_dt.weekday()]
            month_name = months_nl[start_dt.month - 1]

            time_str = f"{day_name} {start_dt.day:02d} {month_name} {start_dt.year} van {start_dt.strftime('%H:%M')} tot {end_dt.strftime('%H:%M')}"

            if city:
                location = f"{city}, {country}" if country else city
                return f"{time_str}\n{location}"

            return time_str
        except (TypeError, ValueError, OSError, OverflowError):
            # Malformed timestamps yield an empty string, not a crash.
            return ''

    return {
        'viewing_time': format_days(auction.get('viewingDays', [])),
        'pickup_date': format_days(auction.get('collectionDays', []))
    }
|
||||
|
||||
|
||||
def extract_attributes_from_lot_json(lot_json: Dict) -> Dict:
    """
    Pull brand/model plus the raw attribute list out of a lot's JSON.

    Args:
        lot_json: The lot object from __NEXT_DATA__

    Returns:
        Dict with 'brand', 'model', and 'attributes_json' (serialized
        attribute list, or '' when there are no attributes)
    """
    import json

    attributes = lot_json.get('attributes', [])
    if not isinstance(attributes, list):
        return {'brand': '', 'model': '', 'attributes_json': ''}

    brand = ''
    model = ''
    brand_names = ('brand', 'merk', 'fabrikant', 'manufacturer')
    model_names = ('model', 'type')

    # Scan the attribute list; a later match overwrites an earlier one.
    for attr in attributes:
        if not isinstance(attr, dict):
            continue

        key = attr.get('name', '').lower()
        if key in brand_names:
            brand = attr.get('value', '')
        elif key in model_names:
            model = attr.get('value', '')

    return {
        'brand': brand,
        'model': model,
        'attributes_json': json.dumps(attributes) if attributes else ''
    }
|
||||
|
||||
|
||||
def extract_enriched_attributes(lot_json: Dict, page_data: Dict) -> Dict:
    """
    Extract enriched valuation attributes from lot data.

    Args:
        lot_json: The lot object from __NEXT_DATA__
        page_data: Already parsed page data (title, description, brand)

    Returns:
        Dict with year_manufactured, condition_description, condition_score,
        serial_number, manufacturer and damage_description
    """
    import re

    attributes = lot_json.get('attributes', [])
    title = page_data.get('title', '')
    description = page_data.get('description', '')

    # Initialize
    year_manufactured = None
    condition_description = ''
    condition_score = None
    serial_number = ''
    manufacturer = ''
    damage_description = ''

    # Condition keywords mapped to a 0-10 score. Matching is done
    # longest-key-first so "als nieuw" (9.5) is not shadowed by its
    # substring "nieuw" (10.0) — dict-order matching had that bug.
    condition_map = {
        'nieuw': 10.0, 'new': 10.0,
        'als nieuw': 9.5, 'like new': 9.5,
        'uitstekend': 9.0, 'excellent': 9.0,
        'zeer goed': 8.0, 'very good': 8.0,
        'goed': 7.0, 'good': 7.0,
        'redelijk': 6.0, 'fair': 6.0,
        'matig': 5.0, 'moderate': 5.0,
        'slecht': 3.0, 'poor': 3.0,
        'defect': 1.0, 'defective': 1.0
    }
    condition_keys = sorted(condition_map, key=len, reverse=True)

    # Extract from attributes array
    for attr in attributes:
        if not isinstance(attr, dict):
            continue

        name = attr.get('name', '').lower()
        value = str(attr.get('value', ''))

        if name in ['jaar', 'year', 'bouwjaar', 'productiejaar']:
            year_match = re.search(r'\d{4}', value)
            if year_match:
                year_manufactured = int(year_match.group())

        elif name in ['conditie', 'condition', 'staat']:
            condition_description = value
            for key in condition_keys:
                if key in value.lower():
                    condition_score = condition_map[key]
                    break

        elif name in ['serienummer', 'serial', 'serial number', 'artikelnummer']:
            serial_number = value

        elif name in ['fabrikant', 'manufacturer', 'merk', 'brand']:
            manufacturer = value

    # Fall back to a 4-digit year (19xx/20xx) in the title.
    if not year_manufactured:
        year_match = re.search(r'\b(19|20)\d{2}\b', title)
        if year_match:
            year_manufactured = int(year_match.group())

    # Pull the first sentence mentioning a damage keyword from the description.
    damage_keywords = ['schade', 'damage', 'beschadigd', 'damaged', 'defect', 'broken', 'kapot']
    if description:
        lowered = description.lower()
        for keyword in damage_keywords:
            if keyword in lowered:
                for sentence in description.split('.'):
                    if keyword in sentence.lower():
                        damage_description = sentence.strip()
                        break
                break

    # Fall back to raw condition/appearance flags from __NEXT_DATA__.
    if not condition_description:
        lot_condition = lot_json.get('condition', '')
        if lot_condition and lot_condition != 'NOT_CHECKED':
            condition_description = lot_condition

        lot_appearance = lot_json.get('appearance', '')
        if lot_appearance and lot_appearance != 'NOT_CHECKED':
            if condition_description:
                condition_description += f", {lot_appearance}"
            else:
                condition_description = lot_appearance

    return {
        'year_manufactured': year_manufactured,
        'condition_description': condition_description,
        'condition_score': condition_score,
        'serial_number': serial_number,
        'manufacturer': manufacturer or page_data.get('brand', ''),  # Fallback to brand
        'damage_description': damage_description
    }
|
||||
|
||||
@@ -19,7 +19,13 @@ from config import (
|
||||
)
|
||||
from cache import CacheManager
|
||||
from parse import DataParser
|
||||
from graphql_client import fetch_lot_bidding_data, format_bid_data
|
||||
from graphql_client import (
|
||||
fetch_lot_bidding_data, format_bid_data,
|
||||
fetch_auction_data, format_auction_data,
|
||||
extract_attributes_from_lot_json,
|
||||
extract_enriched_attributes
|
||||
)
|
||||
from bid_history_client import fetch_bid_history, parse_bid_history
|
||||
|
||||
class TroostwijkScraper:
|
||||
"""Main scraper class for Troostwijk Auctions"""
|
||||
@@ -183,6 +189,31 @@ class TroostwijkScraper:
|
||||
print(f" Type: LOT")
|
||||
print(f" Title: {page_data.get('title', 'N/A')[:60]}...")
|
||||
|
||||
# Extract ALL data from __NEXT_DATA__ lot object
|
||||
import json
|
||||
import re
|
||||
lot_json = None
|
||||
lot_uuid = None
|
||||
|
||||
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
|
||||
if match:
|
||||
try:
|
||||
data = json.loads(match.group(1))
|
||||
lot_json = data.get('props', {}).get('pageProps', {}).get('lot', {})
|
||||
if lot_json:
|
||||
# Basic attributes
|
||||
attrs = extract_attributes_from_lot_json(lot_json)
|
||||
page_data.update(attrs)
|
||||
|
||||
# Enriched attributes (year, condition, etc.)
|
||||
enriched = extract_enriched_attributes(lot_json, page_data)
|
||||
page_data.update(enriched)
|
||||
|
||||
# Get lot UUID for bid history
|
||||
lot_uuid = lot_json.get('id')
|
||||
except:
|
||||
pass
|
||||
|
||||
# Fetch bidding data from GraphQL API
|
||||
lot_id = page_data.get('lot_id')
|
||||
print(f" Fetching bidding data from API...")
|
||||
@@ -190,11 +221,39 @@ class TroostwijkScraper:
|
||||
|
||||
if bidding_data:
|
||||
formatted_data = format_bid_data(bidding_data)
|
||||
# Update page_data with real bidding info
|
||||
page_data.update(formatted_data)
|
||||
print(f" Bid: {page_data.get('current_bid', 'N/A')}")
|
||||
print(f" Bid Count: {page_data.get('bid_count', 0)}")
|
||||
print(f" Closing: {page_data.get('closing_time', 'N/A')}")
|
||||
print(f" Status: {page_data.get('status', 'N/A')}")
|
||||
|
||||
# Extract bid increment from nextBidStepInCents
|
||||
lot_details_lot = bidding_data.get('lot', {})
|
||||
next_step_cents = lot_details_lot.get('nextBidStepInCents')
|
||||
if next_step_cents:
|
||||
page_data['bid_increment'] = next_step_cents / 100.0
|
||||
|
||||
# Get lot UUID if not already extracted
|
||||
if not lot_uuid:
|
||||
lot_uuid = lot_details_lot.get('id')
|
||||
|
||||
# Fetch bid history for intelligence
|
||||
if lot_uuid and page_data.get('bid_count', 0) > 0:
|
||||
print(f" Fetching bid history...")
|
||||
bid_history = await fetch_bid_history(lot_uuid)
|
||||
if bid_history:
|
||||
bid_data = parse_bid_history(bid_history, lot_id)
|
||||
page_data.update(bid_data)
|
||||
print(f" Bid velocity: {bid_data['bid_velocity']} bids/hour")
|
||||
|
||||
# Save bid history to database
|
||||
self.cache.save_bid_history(lot_id, bid_data['bid_records'])
|
||||
|
||||
# Fetch auction data for viewing/pickup times if we have auction_id
|
||||
auction_id = page_data.get('auction_id')
|
||||
if auction_id:
|
||||
auction_data = await fetch_auction_data(auction_id)
|
||||
if auction_data:
|
||||
auction_times = format_auction_data(auction_data)
|
||||
page_data.update(auction_times)
|
||||
else:
|
||||
print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")
|
||||
|
||||
|
||||
28
test_auction_fetch.py
Normal file
28
test_auction_fetch.py
Normal file
@@ -0,0 +1,28 @@
|
||||
#!/usr/bin/env python3
"""Test auction data fetch"""
import asyncio
import json
import sys

sys.path.insert(0, 'src')

from graphql_client import fetch_auction_data, format_auction_data


async def main():
    """Fetch a single known auction and print both raw and formatted data.

    Manual smoke test: exercises fetch_auction_data() against the live API,
    then runs the result through format_auction_data() to show the
    viewing/pickup fields the scraper stores.
    """
    auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa"

    print(f"Fetching auction: {auction_id}\n")
    data = await fetch_auction_data(auction_id)

    # Guard clause instead of nested if/else: bail out early on failure.
    if not data:
        print("No auction data returned")
        return

    print("Raw Auction Data:")
    print(json.dumps(data, indent=2))

    print("\n\nFormatted:")
    formatted = format_auction_data(data)
    print(f"Viewing: {formatted['viewing_time']}")
    print(f"Pickup: {formatted['pickup_date']}")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
59
test_auction_query.py
Normal file
59
test_auction_query.py
Normal file
@@ -0,0 +1,59 @@
|
||||
#!/usr/bin/env python3
"""Test if the auction query works at all"""
import asyncio
import aiohttp
import json

GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"

# Try a simpler query first
SIMPLE_QUERY = """
query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
  auction(id: $auctionId, locale: $locale, platform: $platform) {
    id
    displayId
    viewingDays {
      startDate
      endDate
      city
      countryCode
    }
    collectionDays {
      startDate
      endDate
      city
      countryCode
    }
  }
}
"""


async def main():
    """POST the minimal auction query to the storefront GraphQL endpoint.

    Prints the HTTP status, the raw response body, and (when the body is
    valid JSON) a pretty-printed parse — a quick check that the query shape
    and variables are accepted by the API.
    """
    auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa"

    variables = {
        "auctionId": auction_id,
        "locale": "nl",
        "platform": "TWK"
    }

    payload = {
        "query": SIMPLE_QUERY,
        "variables": variables
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
            print(f"Status: {response.status}")
            text = await response.text()
            print(f"Response: {text}")

            # BUG FIX: the original used a bare `except:` (which also swallows
            # KeyboardInterrupt/SystemExit) and re-read the body via
            # response.json(). Parse the text we already have and catch only
            # a malformed-JSON failure; anything else should surface.
            try:
                data = json.loads(text)
            except json.JSONDecodeError:
                pass  # non-JSON body (e.g. HTML error page) — raw text was already printed
            else:
                print(f"\nParsed:")
                print(json.dumps(data, indent=2))


if __name__ == "__main__":
    asyncio.run(main())
|
||||
95
test_comprehensive.py
Normal file
95
test_comprehensive.py
Normal file
@@ -0,0 +1,95 @@
|
||||
#!/usr/bin/env python3
"""Test comprehensive data enrichment"""
import asyncio
import sys

sys.path.insert(0, 'src')

from scraper import TroostwijkScraper


async def main():
    """Crawl one lot end-to-end and verify the enriched data reaches the DB.

    Exercises the full pipeline (bidding intelligence, valuation fields,
    bid history) on a lot known to have bids, then re-reads the `lots` and
    `bid_history` tables to confirm persistence.
    """
    scraper = TroostwijkScraper()

    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        )

        # Test with lot that has bids
        lot_url = "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5"

        print(f"Testing comprehensive extraction\n")
        result = await scraper.crawl_page(page, lot_url)

        if result:
            print(f"\n{'='*60}")
            print("COMPREHENSIVE DATA EXTRACTION:")
            print(f"{'='*60}")
            print(f"Lot ID: {result.get('lot_id')}")
            print(f"Title: {result.get('title', '')[:50]}...")
            print(f"\n[Bidding Intelligence]")
            print(f" Status: {result.get('status')}")
            print(f" Current Bid: {result.get('current_bid')}")
            print(f" Starting Bid: {result.get('starting_bid')}")
            print(f" Bid Increment: EUR {result.get('bid_increment', 0):.2f}")
            print(f" Bid Count: {result.get('bid_count')}")
            print(f" First Bid: {result.get('first_bid_time', 'N/A')}")
            print(f" Last Bid: {result.get('last_bid_time', 'N/A')}")
            print(f" Bid Velocity: {result.get('bid_velocity', 0)} bids/hour")
            print(f"\n[Valuation Intelligence]")
            print(f" Brand: {result.get('brand', 'N/A')}")
            print(f" Model: {result.get('model', 'N/A')}")
            print(f" Year: {result.get('year_manufactured', 'N/A')}")
            print(f" Manufacturer: {result.get('manufacturer', 'N/A')}")
            print(f" Condition Score: {result.get('condition_score', 'N/A')}")
            print(f" Condition: {result.get('condition_description', 'N/A')}")
            print(f" Serial#: {result.get('serial_number', 'N/A')}")
            print(f" Damage: {result.get('damage_description', 'N/A')[:50] if result.get('damage_description') else 'N/A'}...")

        await browser.close()

    # BUG FIX: the original ran the DB verification unconditionally, so a
    # failed crawl (result is None) raised AttributeError on result.get().
    # Skip verification when there is nothing to verify.
    if not result:
        return

    # Verify database
    import sqlite3
    conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

    # Check lot data
    cursor = conn.execute("""
        SELECT bid_velocity, first_bid_time, year_manufactured, condition_score
        FROM lots
        WHERE lot_id = ?
    """, (result.get('lot_id'),))
    row = cursor.fetchone()

    if row:
        print(f"\n{'='*60}")
        print("DATABASE VERIFICATION (lots table):")
        print(f"{'='*60}")
        print(f" Bid Velocity: {row[0]}")
        print(f" First Bid Time: {row[1]}")
        print(f" Year: {row[2]}")
        print(f" Condition Score: {row[3]}")

    # Check bid history
    cursor = conn.execute("""
        SELECT COUNT(*), MIN(bid_time), MAX(bid_time), SUM(is_autobid)
        FROM bid_history
        WHERE lot_id = ?
    """, (result.get('lot_id'),))
    row = cursor.fetchone()

    if row and row[0] > 0:
        print(f"\n{'='*60}")
        print("DATABASE VERIFICATION (bid_history table):")
        print(f"{'='*60}")
        print(f" Total Bids Stored: {row[0]}")
        print(f" First Bid: {row[1]}")
        print(f" Last Bid: {row[2]}")
        print(f" Autobids: {row[3]}")

    conn.close()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
64
test_new_fields.py
Normal file
64
test_new_fields.py
Normal file
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env python3
"""Test the new fields extraction"""
import asyncio
import sys

sys.path.insert(0, 'src')

from scraper import TroostwijkScraper


async def main():
    """Crawl one attribute-rich lot and confirm the new fields persist.

    Checks status/brand/model/viewing_time/pickup_date both on the in-memory
    crawl result and in the `lots` table of the cache database.
    """
    scraper = TroostwijkScraper()

    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        )

        # Test with lot that has attributes
        lot_url = "https://www.troostwijkauctions.com/l/47-5kg-hexagon-dumbbell-%25282x%2529-A1-40668-34"

        print(f"Testing new fields with: {lot_url}\n")
        result = await scraper.crawl_page(page, lot_url)

        if result:
            print(f"\n{'='*60}")
            print("EXTRACTED FIELDS:")
            print(f"{'='*60}")
            print(f"Lot ID: {result.get('lot_id')}")
            print(f"Title: {result.get('title', '')[:50]}...")
            print(f"Status: {result.get('status')}")
            print(f"Brand: {result.get('brand')}")
            print(f"Model: {result.get('model')}")
            print(f"Viewing Time: {result.get('viewing_time', 'N/A')}")
            print(f"Pickup Date: {result.get('pickup_date', 'N/A')}")
            print(f"Attributes: {result.get('attributes_json', '')[:100]}...")

        await browser.close()

    # BUG FIX: the original queried the DB with result.get('lot_id')
    # unconditionally; when crawl_page returns None that raised
    # AttributeError. Bail out before touching the database.
    if not result:
        return

    # Verify database
    import sqlite3
    conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
    cursor = conn.execute("""
        SELECT status, brand, model, viewing_time, pickup_date
        FROM lots
        WHERE lot_id = ?
    """, (result.get('lot_id'),))
    row = cursor.fetchone()
    conn.close()

    if row:
        print(f"\n{'='*60}")
        print("DATABASE VERIFICATION:")
        print(f"{'='*60}")
        print(f"Status: {row[0]}")
        print(f"Brand: {row[1]}")
        print(f"Model: {row[2]}")
        print(f"Viewing: {row[3][:100] if row[3] else 'N/A'}...")
        print(f"Pickup: {row[4][:100] if row[4] else 'N/A'}...")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user