enrich data

This commit is contained in:
Tour
2025-12-07 16:26:30 +01:00
parent fd69faebcc
commit b1905164bd
40 changed files with 76 additions and 3605 deletions

View File

@@ -1,54 +0,0 @@
#!/usr/bin/env python3
"""Check for Apollo state or other embedded data"""
import asyncio
import json
import re
from playwright.async_api import async_playwright
async def main():
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
page = await browser.new_page()
await page.goto("https://www.troostwijkauctions.com/a/woonunits-generatoren-reinigingsmachines-en-zakelijke-goederen-A1-37889", wait_until='networkidle')
content = await page.content()
# Look for embedded data structures
patterns = [
(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', "NEXT_DATA"),
(r'window\.__APOLLO_STATE__\s*=\s*({.+?});', "APOLLO_STATE"),
(r'"lots"\s*:\s*\[(.+?)\]', "LOTS_ARRAY"),
]
for pattern, name in patterns:
match = re.search(pattern, content, re.DOTALL)
if match:
print(f"\n{'='*60}")
print(f"FOUND: {name}")
print(f"{'='*60}")
try:
if name == "LOTS_ARRAY":
print(f"Preview: {match.group(1)[:500]}")
else:
data = json.loads(match.group(1))
print(json.dumps(data, indent=2)[:2000])
except:
print(f"Preview: {match.group(1)[:1000]}")
# Also check for any script tags with "lot" and "bid" and "end"
print(f"\n{'='*60}")
print("SEARCHING FOR LOT DATA IN ALL SCRIPTS")
print(f"{'='*60}")
scripts = re.findall(r'<script[^>]*>(.+?)</script>', content, re.DOTALL)
for i, script in enumerate(scripts):
if all(term in script.lower() for term in ['lot', 'bid', 'end']):
print(f"\nScript #{i} (first 500 chars):")
print(script[:500])
if i > 3: # Limit output
break
await browser.close()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,54 +0,0 @@
#!/usr/bin/env python3
"""Check current data quality in cache.db"""
import sqlite3
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
print("=" * 60)
print("CURRENT DATA QUALITY CHECK")
print("=" * 60)
# Check lots table
print("\n[*] Sample Lot Data:")
cursor = conn.execute("""
SELECT lot_id, current_bid, bid_count, closing_time
FROM lots
LIMIT 10
""")
for row in cursor:
print(f" Lot: {row[0]}")
print(f" Current Bid: {row[1]}")
print(f" Bid Count: {row[2]}")
print(f" Closing Time: {row[3]}")
# Check auctions table
print("\n[*] Sample Auction Data:")
cursor = conn.execute("""
SELECT auction_id, title, closing_time, first_lot_closing_time
FROM auctions
LIMIT 5
""")
for row in cursor:
print(f" Auction: {row[0]}")
print(f" Title: {row[1][:50]}...")
print(f" Closing Time: {row[2] if len(row) > 2 else 'N/A'}")
print(f" First Lot Closing: {row[3]}")
# Data completeness stats
print("\n[*] Data Completeness:")
cursor = conn.execute("""
SELECT
COUNT(*) as total,
SUM(CASE WHEN current_bid IS NULL OR current_bid = '' THEN 1 ELSE 0 END) as missing_current_bid,
SUM(CASE WHEN closing_time IS NULL OR closing_time = '' THEN 1 ELSE 0 END) as missing_closing_time,
SUM(CASE WHEN bid_count IS NULL OR bid_count = 0 THEN 1 ELSE 0 END) as zero_bid_count
FROM lots
""")
row = cursor.fetchone()
print(f" Total lots: {row[0]:,}")
print(f" Missing current_bid: {row[1]:,} ({100*row[1]/row[0]:.1f}%)")
print(f" Missing closing_time: {row[2]:,} ({100*row[2]/row[0]:.1f}%)")
print(f" Zero bid_count: {row[3]:,} ({100*row[3]/row[0]:.1f}%)")
conn.close()
print("\n" + "=" * 60)

View File

@@ -1,67 +0,0 @@
#!/usr/bin/env python3
"""Check if GraphQL has viewing/pickup data"""
import asyncio
import json
import sys
sys.path.insert(0, 'src')
from graphql_client import GRAPHQL_ENDPOINT
import aiohttp
# Expanded query to check for all available fields
EXTENDED_QUERY = """
query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
lot {
id
displayId
auctionId
currentBidAmount { cents currency }
initialAmount { cents currency }
nextMinimalBid { cents currency }
bidsCount
startDate
endDate
# Try to find viewing/pickup fields
viewingDays { startDate endDate city countryCode }
collectionDays { startDate endDate city countryCode }
pickupDays { startDate endDate city countryCode }
}
auction {
id
displayId
viewingDays { startDate endDate city countryCode }
collectionDays { startDate endDate city countryCode }
}
}
}
"""
async def main():
variables = {
"lotDisplayId": "A1-28505-5",
"locale": "nl",
"platform": "TWK"
}
payload = {
"query": EXTENDED_QUERY,
"variables": variables
}
try:
async with aiohttp.ClientSession() as session:
async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
if response.status == 200:
data = await response.json()
print("Full GraphQL Response:")
print(json.dumps(data, indent=2))
else:
print(f"Error: {response.status}")
print(await response.text())
except Exception as e:
print(f"Exception: {e}")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,72 +0,0 @@
"""Check how lots link to auctions"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
from cache import CacheManager
import sqlite3
import zlib
import json
import re
cache = CacheManager()
conn = sqlite3.connect(cache.db_path)
cursor = conn.cursor()
# Get a lot page from cache
cursor.execute("SELECT url, content FROM cache WHERE url LIKE '%/l/%' LIMIT 1")
url, content_blob = cursor.fetchone()
content = zlib.decompress(content_blob).decode('utf-8')
# Extract __NEXT_DATA__
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
data = json.loads(match.group(1))
props = data.get('props', {}).get('pageProps', {})
print("PageProps keys:", list(props.keys()))
lot = props.get('lot', {})
print("\nLot data:")
print(f" displayId: {lot.get('displayId')}")
print(f" auctionId (UUID): {lot.get('auctionId')}")
# Check if auction data is also included
auction = props.get('auction')
if auction:
print("\nAuction data IS included in lot page!")
print(f" Auction displayId: {auction.get('displayId')}")
print(f" Auction id (UUID): {auction.get('id')}")
print(f" Auction name: {auction.get('name', '')[:60]}")
else:
print("\nAuction data NOT included in lot page")
print("Need to look up auction by UUID")
# Check if we can find the auction by UUID
lot_auction_uuid = lot.get('auctionId')
if lot_auction_uuid:
# Try to find auction page with this UUID
cursor.execute("""
SELECT url, content FROM cache
WHERE url LIKE '%/a/%'
LIMIT 10
""")
found_match = False
for auction_url, auction_content_blob in cursor.fetchall():
auction_content = zlib.decompress(auction_content_blob).decode('utf-8')
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', auction_content, re.DOTALL)
if match:
auction_data = json.loads(match.group(1))
auction_obj = auction_data.get('props', {}).get('pageProps', {}).get('auction', {})
if auction_obj.get('id') == lot_auction_uuid:
print(f"\n✓ Found matching auction!")
print(f" Auction displayId: {auction_obj.get('displayId')}")
print(f" Auction UUID: {auction_obj.get('id')}")
print(f" Auction URL: {auction_url}")
found_match = True
break
if not found_match:
print(f"\n✗ Could not find auction with UUID {lot_auction_uuid} in first 10 cached auctions")
conn.close()

View File

@@ -1,36 +0,0 @@
#!/usr/bin/env python3
"""Check viewing time data"""
import sqlite3
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
# Check if viewing_time has data
cursor = conn.execute("""
SELECT viewing_time, pickup_date
FROM lots
WHERE viewing_time IS NOT NULL AND viewing_time != ''
LIMIT 5
""")
rows = cursor.fetchall()
print("Existing viewing_time data:")
for r in rows:
print(f" Viewing: {r[0]}")
print(f" Pickup: {r[1]}")
print()
# Check overall completeness
cursor = conn.execute("""
SELECT
COUNT(*) as total,
SUM(CASE WHEN viewing_time IS NOT NULL AND viewing_time != '' THEN 1 ELSE 0 END) as has_viewing,
SUM(CASE WHEN pickup_date IS NOT NULL AND pickup_date != '' THEN 1 ELSE 0 END) as has_pickup
FROM lots
""")
row = cursor.fetchone()
print(f"Completeness:")
print(f" Total lots: {row[0]}")
print(f" Has viewing_time: {row[1]} ({100*row[1]/row[0]:.1f}%)")
print(f" Has pickup_date: {row[2]} ({100*row[2]/row[0]:.1f}%)")
conn.close()

View File

@@ -1,35 +0,0 @@
#!/usr/bin/env python3
"""Check if viewing time is in the GraphQL response"""
import asyncio
import json
from playwright.async_api import async_playwright
async def main():
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
page = await browser.new_page()
responses = []
async def capture_response(response):
if 'graphql' in response.url and 'LotBiddingData' in await response.text():
try:
body = await response.json()
responses.append(body)
except:
pass
page.on('response', capture_response)
await page.goto("https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5", wait_until='networkidle')
await asyncio.sleep(2)
if responses:
print("Full LotBiddingData Response:")
print("="*60)
print(json.dumps(responses[0], indent=2))
await browser.close()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,69 +0,0 @@
#!/usr/bin/env python3
"""Debug lot data structure from cached page"""
import sqlite3
import zlib
import json
import re
import sys
sys.path.insert(0, 'src')
from parse import DataParser
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
# Get a recent lot page
cursor = conn.execute("""
SELECT url, content
FROM cache
WHERE url LIKE '%/l/%'
ORDER BY timestamp DESC
LIMIT 1
""")
row = cursor.fetchone()
if not row:
print("No lot pages found")
exit(1)
url, content_blob = row
content = zlib.decompress(content_blob).decode('utf-8')
parser = DataParser()
result = parser.parse_page(content, url)
if result:
print(f"URL: {url}")
print(f"\nParsed Data:")
print(f" type: {result.get('type')}")
print(f" lot_id: {result.get('lot_id')}")
print(f" title: {result.get('title', '')[:50]}...")
print(f" current_bid: {result.get('current_bid')}")
print(f" bid_count: {result.get('bid_count')}")
print(f" closing_time: {result.get('closing_time')}")
print(f" location: {result.get('location')}")
# Also dump the raw JSON
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
if match:
data = json.loads(match.group(1))
page_props = data.get('props', {}).get('pageProps', {})
if 'lot' in page_props:
lot = page_props['lot']
print(f"\nRAW __NEXT_DATA__.lot keys: {list(lot.keys())}")
print(f"\nSearching for bid/timing fields...")
# Deep search for these fields
def deep_search(obj, prefix=""):
if isinstance(obj, dict):
for k, v in obj.items():
if any(term in k.lower() for term in ['bid', 'end', 'close', 'date', 'time']):
print(f" {prefix}{k}: {v}")
if isinstance(v, (dict, list)):
deep_search(v, prefix + k + ".")
elif isinstance(obj, list) and len(obj) > 0:
deep_search(obj[0], prefix + "[0].")
deep_search(lot)
conn.close()

View File

@@ -1,65 +0,0 @@
#!/usr/bin/env python3
"""Deep inspect lot JSON for viewing/pickup data"""
import sqlite3
import zlib
import json
import re
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
cursor = conn.execute("""
SELECT url, content
FROM cache
WHERE url LIKE '%/l/%'
ORDER BY timestamp DESC
LIMIT 1
""")
row = cursor.fetchone()
url, content_blob = row
content = zlib.decompress(content_blob).decode('utf-8')
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
data = json.loads(match.group(1))
lot = data.get('props', {}).get('pageProps', {}).get('lot', {})
print(f"Inspecting: {url}\n")
# Check onboarding
if 'onboarding' in lot:
print("ONBOARDING:")
print(json.dumps(lot['onboarding'], indent=2))
print()
# Check attributes
if 'attributes' in lot:
print("ATTRIBUTES:")
attrs = lot['attributes']
print(json.dumps(attrs[:3] if isinstance(attrs, list) else attrs, indent=2))
print()
# Check condition
if 'condition' in lot:
print("CONDITION:")
print(json.dumps(lot['condition'], indent=2))
print()
# Check appearance
if 'appearance' in lot:
print("APPEARANCE:")
print(json.dumps(lot['appearance'], indent=2))
print()
# Check location
if 'location' in lot:
print("LOCATION:")
print(json.dumps(lot['location'], indent=2))
print()
# Check for any field with "view", "pick", "collect", "date", "time"
print("\nFIELDS WITH VIEWING/PICKUP/TIME:")
for key in lot.keys():
if any(term in key.lower() for term in ['view', 'pick', 'collect', 'date', 'time', 'day']):
print(f" {key}: {lot[key]}")
conn.close()

View File

@@ -1,120 +0,0 @@
"""
Enrich existing lots with new intelligence fields:
- followers_count
- estimated_min_price / estimated_max_price
- lot_condition
- appearance
Reads from cached lot pages __NEXT_DATA__ JSON
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
import asyncio
from cache import CacheManager
import sqlite3
import zlib
import json
import re
from graphql_client import fetch_lot_bidding_data, format_bid_data
async def enrich_existing_lots():
"""Enrich existing lots with new fields from GraphQL API"""
cache = CacheManager()
conn = sqlite3.connect(cache.db_path)
cursor = conn.cursor()
# Get all lot IDs
cursor.execute("SELECT lot_id FROM lots")
lot_ids = [r[0] for r in cursor.fetchall()]
print(f"Found {len(lot_ids)} lots to enrich")
print("Fetching enrichment data from GraphQL API...")
print("This will take ~{:.1f} minutes (0.5s rate limit)".format(len(lot_ids) * 0.5 / 60))
enriched = 0
failed = 0
no_data = 0
for i, lot_id in enumerate(lot_ids):
if (i + 1) % 10 == 0:
print(f"Progress: {i+1}/{len(lot_ids)} ({enriched} enriched, {no_data} no data, {failed} failed)", end='\r')
try:
# Fetch from GraphQL API
bidding_data = await fetch_lot_bidding_data(lot_id)
if bidding_data:
formatted_data = format_bid_data(bidding_data)
# Update lot with new fields
cursor.execute("""
UPDATE lots
SET followers_count = ?,
estimated_min_price = ?,
estimated_max_price = ?,
lot_condition = ?,
appearance = ?
WHERE lot_id = ?
""", (
formatted_data.get('followers_count', 0),
formatted_data.get('estimated_min_price'),
formatted_data.get('estimated_max_price'),
formatted_data.get('lot_condition', ''),
formatted_data.get('appearance', ''),
lot_id
))
enriched += 1
# Commit every 50 lots
if enriched % 50 == 0:
conn.commit()
else:
no_data += 1
# Rate limit
await asyncio.sleep(0.5)
except Exception as e:
failed += 1
continue
conn.commit()
print(f"\n\nComplete!")
print(f"Total lots: {len(lot_ids)}")
print(f"Enriched: {enriched}")
print(f"No data: {no_data}")
print(f"Failed: {failed}")
# Show statistics
cursor.execute("SELECT COUNT(*) FROM lots WHERE followers_count > 0")
with_followers = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM lots WHERE estimated_min_price IS NOT NULL")
with_estimates = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM lots WHERE lot_condition IS NOT NULL AND lot_condition != ''")
with_condition = cursor.fetchone()[0]
print(f"\nEnrichment statistics:")
print(f" Lots with followers_count: {with_followers} ({with_followers/len(lot_ids)*100:.1f}%)")
print(f" Lots with estimated prices: {with_estimates} ({with_estimates/len(lot_ids)*100:.1f}%)")
print(f" Lots with condition: {with_condition} ({with_condition/len(lot_ids)*100:.1f}%)")
conn.close()
if __name__ == "__main__":
print("WARNING: This will make ~16,800 API calls at 0.5s intervals (~2.3 hours)")
print("Press Ctrl+C to cancel, or wait 5 seconds to continue...")
import time
try:
time.sleep(5)
except KeyboardInterrupt:
print("\nCancelled")
sys.exit(0)
asyncio.run(enrich_existing_lots())

View File

@@ -1,370 +0,0 @@
"""
Explore API responses to identify additional fields available for intelligence.
Tests GraphQL and REST API responses for field coverage.
"""
import asyncio
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
import json
import aiohttp
from graphql_client import fetch_lot_bidding_data, GRAPHQL_ENDPOINT
from bid_history_client import fetch_bid_history, BID_HISTORY_ENDPOINT
async def explore_graphql_schema():
"""Query GraphQL schema to see all available fields"""
print("=" * 80)
print("GRAPHQL SCHEMA EXPLORATION")
print("=" * 80)
# Introspection query for LotDetails type
introspection_query = """
query IntrospectionQuery {
__type(name: "LotDetails") {
name
fields {
name
type {
name
kind
ofType {
name
kind
}
}
}
}
}
"""
async with aiohttp.ClientSession() as session:
try:
async with session.post(
GRAPHQL_ENDPOINT,
json={
"query": introspection_query,
"variables": {}
},
headers={"Content-Type": "application/json"}
) as response:
if response.status == 200:
data = await response.json()
lot_type = data.get('data', {}).get('__type')
if lot_type:
print("\nLotDetails available fields:")
for field in lot_type.get('fields', []):
field_name = field['name']
field_type = field['type'].get('name') or field['type'].get('ofType', {}).get('name', 'Complex')
print(f" - {field_name}: {field_type}")
print()
else:
print(f"Failed with status {response.status}")
except Exception as e:
print(f"Error: {e}")
# Also try Lot type
introspection_query_lot = """
query IntrospectionQuery {
__type(name: "Lot") {
name
fields {
name
type {
name
kind
ofType {
name
kind
}
}
}
}
}
"""
async with aiohttp.ClientSession() as session:
try:
async with session.post(
GRAPHQL_ENDPOINT,
json={
"query": introspection_query_lot,
"variables": {}
},
headers={"Content-Type": "application/json"}
) as response:
if response.status == 200:
data = await response.json()
lot_type = data.get('data', {}).get('__type')
if lot_type:
print("\nLot type available fields:")
for field in lot_type.get('fields', []):
field_name = field['name']
field_type = field['type'].get('name') or field['type'].get('ofType', {}).get('name', 'Complex')
print(f" - {field_name}: {field_type}")
print()
except Exception as e:
print(f"Error: {e}")
async def test_graphql_full_query():
"""Test a comprehensive GraphQL query to see all returned data"""
print("=" * 80)
print("GRAPHQL FULL QUERY TEST")
print("=" * 80)
# Test with a real lot ID
lot_id = "A1-34731-107" # Example from database
comprehensive_query = """
query ComprehensiveLotQuery($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
lot {
id
displayId
title
description
currentBidAmount { cents currency }
initialAmount { cents currency }
nextMinimalBid { cents currency }
bidsCount
startDate
endDate
minimumBidAmountMet
lotNumber
auctionId
lotState
location {
city
countryCode
}
viewingDays {
city
countryCode
addressLine1
addressLine2
endDate
startDate
}
collectionDays {
city
countryCode
addressLine1
addressLine2
endDate
startDate
}
images {
url
thumbnailUrl
}
attributes {
name
value
}
}
}
}
"""
async with aiohttp.ClientSession() as session:
try:
async with session.post(
GRAPHQL_ENDPOINT,
json={
"query": comprehensive_query,
"variables": {
"lotDisplayId": lot_id,
"locale": "nl_NL",
"platform": "WEB"
}
},
headers={"Content-Type": "application/json"}
) as response:
if response.status == 200:
data = await response.json()
print(f"\nFull GraphQL response for {lot_id}:")
print(json.dumps(data, indent=2))
print()
else:
print(f"Failed with status {response.status}")
print(await response.text())
except Exception as e:
print(f"Error: {e}")
async def test_bid_history_response():
"""Test bid history API to see all returned fields"""
print("=" * 80)
print("BID HISTORY API TEST")
print("=" * 80)
# Get a lot with bids from database
import sqlite3
from cache import CacheManager
cache = CacheManager()
conn = sqlite3.connect(cache.db_path)
cursor = conn.cursor()
# Find a lot with bids
cursor.execute("""
SELECT lot_id, url FROM lots
WHERE bid_count > 0
ORDER BY bid_count DESC
LIMIT 1
""")
result = cursor.fetchone()
if result:
lot_id, url = result
# Extract UUID from URL
import re
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>', url)
# We need to get UUID from cached page
cursor.execute("SELECT content FROM cache WHERE url = ?", (url,))
page_result = cursor.fetchone()
if page_result:
import zlib
content = zlib.decompress(page_result[0]).decode('utf-8')
match = re.search(r'"lot":\s*\{[^}]*"id":\s*"([^"]+)"', content)
if match:
lot_uuid = match.group(1)
print(f"\nTesting with lot {lot_id} (UUID: {lot_uuid})")
# Fetch bid history
bid_history = await fetch_bid_history(lot_uuid)
if bid_history:
print(f"\nBid history sample (first 3 records):")
for i, bid in enumerate(bid_history[:3]):
print(f"\nBid {i+1}:")
print(json.dumps(bid, indent=2))
print(f"\n\nAll available fields in bid records:")
if bid_history:
all_keys = set()
for bid in bid_history:
all_keys.update(bid.keys())
for key in sorted(all_keys):
print(f" - {key}")
else:
print("No bid history found")
conn.close()
async def check_auction_api():
"""Check if there's an auction details API"""
print("=" * 80)
print("AUCTION API EXPLORATION")
print("=" * 80)
auction_query = """
query AuctionDetails($auctionId: String!, $locale: String!, $platform: Platform!) {
auctionDetails(auctionId: $auctionId, locale: $locale, platform: $platform) {
auction {
id
title
description
startDate
endDate
firstLotEndDate
location {
city
countryCode
}
viewingDays {
city
countryCode
startDate
endDate
addressLine1
addressLine2
}
collectionDays {
city
countryCode
startDate
endDate
addressLine1
addressLine2
}
}
}
}
"""
# Get an auction ID from database
import sqlite3
from cache import CacheManager
cache = CacheManager()
conn = sqlite3.connect(cache.db_path)
cursor = conn.cursor()
# Get auction ID from a lot
cursor.execute("SELECT DISTINCT auction_id FROM lots WHERE auction_id IS NOT NULL LIMIT 1")
result = cursor.fetchone()
if result:
auction_id = result[0]
print(f"\nTesting with auction {auction_id}")
async with aiohttp.ClientSession() as session:
try:
async with session.post(
GRAPHQL_ENDPOINT,
json={
"query": auction_query,
"variables": {
"auctionId": auction_id,
"locale": "nl_NL",
"platform": "WEB"
}
},
headers={"Content-Type": "application/json"}
) as response:
if response.status == 200:
data = await response.json()
print("\nAuction API response:")
print(json.dumps(data, indent=2))
else:
print(f"Failed with status {response.status}")
print(await response.text())
except Exception as e:
print(f"Error: {e}")
conn.close()
async def main():
"""Run all API explorations"""
await explore_graphql_schema()
await test_graphql_full_query()
await test_bid_history_response()
await check_auction_api()
print("\n" + "=" * 80)
print("SUMMARY: AVAILABLE DATA FIELDS")
print("=" * 80)
print("""
CURRENTLY CAPTURED:
- Lot bidding data: current_bid, starting_bid, minimum_bid, bid_count, closing_time
- Lot attributes: brand, model, manufacturer, year, condition, serial_number
- Bid history: bid_amount, bid_time, bidder_id, is_autobid
- Bid intelligence: first_bid_time, last_bid_time, bid_velocity, bid_increment
- Images: URLs and local paths
POTENTIALLY AVAILABLE (TO CHECK):
- Viewing/collection times with full address and date ranges
- Lot location details (city, country)
- Lot state/status
- Image thumbnails
- More detailed attributes
NOT AVAILABLE:
- Watch count (not exposed in API)
- Reserve price (not exposed in API)
- Estimated min/max value (not exposed in API)
- Bidder identities (anonymized)
""")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,93 +0,0 @@
#!/usr/bin/env python3
"""Explore the actual auction schema"""
import asyncio
import aiohttp
import json
GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"
# Try different field structures
QUERIES = {
"viewingDays_simple": """
query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
auction(id: $auctionId, locale: $locale, platform: $platform) {
viewingDays {
city
countryCode
}
}
}
""",
"viewingDays_with_times": """
query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
auction(id: $auctionId, locale: $locale, platform: $platform) {
viewingDays {
from
to
city
}
}
}
""",
"full_auction": """
query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
auction(id: $auctionId, locale: $locale, platform: $platform) {
id
displayId
biddingStatus
buyersPremium
viewingDays {
city
countryCode
from
to
}
collectionDays {
city
countryCode
from
to
}
}
}
"""
}
async def test_query(name, query, auction_id):
variables = {
"auctionId": auction_id,
"locale": "nl",
"platform": "TWK"
}
payload = {
"query": query,
"variables": variables
}
async with aiohttp.ClientSession() as session:
async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
data = await response.json()
print(f"\n{'='*60}")
print(f"QUERY: {name}")
print(f"{'='*60}")
if 'errors' in data:
print("ERRORS:")
for error in data['errors']:
print(f" {error}")
else:
print("SUCCESS:")
print(json.dumps(data, indent=2))
async def main():
# Test with the auction we know exists
auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa"
for name, query in QUERIES.items():
await test_query(name, query, auction_id)
await asyncio.sleep(0.5)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,141 +0,0 @@
#!/usr/bin/env python3
"""
Export only NEW auctions/lots that haven't been sent to server yet
Prevents UNIQUE constraint errors on server import
"""
import sqlite3
import json
import csv
from datetime import datetime
from pathlib import Path
DB_PATH = "C:/mnt/okcomputer/output/cache.db"
OUTPUT_DIR = Path("C:/mnt/okcomputer/output")
SYNC_STATE_FILE = OUTPUT_DIR / ".server_sync_state"
def get_last_export_timestamp():
"""Get timestamp of last successful export to server"""
if SYNC_STATE_FILE.exists():
return int(SYNC_STATE_FILE.read_text().strip())
return 0
def save_export_timestamp(timestamp: int):
"""Save timestamp of successful export"""
SYNC_STATE_FILE.write_text(str(timestamp))
def export_new_data():
"""Export only records that are NEW since last server import"""
conn = sqlite3.connect(DB_PATH)
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
last_export = get_last_export_timestamp()
current_time = int(datetime.now().timestamp())
print("="*60)
print("INCREMENTAL EXPORT FOR SERVER")
print("="*60)
print(f"Last export: {datetime.fromtimestamp(last_export).strftime('%Y-%m-%d %H:%M:%S') if last_export else 'Never (will export ALL)'}")
print(f"Current time: {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")
print()
# Get new auctions (discovered_at > last_export)
if last_export == 0:
# First run: export all
cursor.execute("SELECT * FROM auctions ORDER BY auction_id")
else:
# Subsequent runs: only new ones
cursor.execute("""
SELECT * FROM auctions
WHERE discovered_at > ?
ORDER BY auction_id
""", (last_export,))
new_auctions = [dict(row) for row in cursor.fetchall()]
# Get new lots (scraped_at_timestamp > last_export)
if last_export == 0:
cursor.execute("SELECT * FROM lots ORDER BY lot_id")
else:
cursor.execute("""
SELECT * FROM lots
WHERE scraped_at_timestamp > ?
ORDER BY lot_id
""", (last_export,))
new_lots = [dict(row) for row in cursor.fetchall()]
conn.close()
# Export to server-ready files
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
files_created = []
# Export auctions
if new_auctions:
auctions_csv = OUTPUT_DIR / f'auctions_{timestamp}.csv'
auctions_json = OUTPUT_DIR / f'auctions_{timestamp}.json'
with open(auctions_csv, 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=new_auctions[0].keys())
writer.writeheader()
writer.writerows(new_auctions)
with open(auctions_json, 'w', encoding='utf-8') as f:
json.dump(new_auctions, f, indent=2, ensure_ascii=False)
files_created.extend([auctions_csv, auctions_json])
print(f"✓ Exported {len(new_auctions)} auctions")
print(f" CSV: {auctions_csv}")
print(f" JSON: {auctions_json}")
else:
print("✓ No new auctions to export")
# Export lots
if new_lots:
lots_csv = OUTPUT_DIR / f'lots_{timestamp}.csv'
lots_json = OUTPUT_DIR / f'lots_{timestamp}.json'
with open(lots_csv, 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=new_lots[0].keys())
writer.writeheader()
writer.writerows(new_lots)
with open(lots_json, 'w', encoding='utf-8') as f:
json.dump(new_lots, f, indent=2, ensure_ascii=False)
files_created.extend([lots_csv, lots_json])
print(f"✓ Exported {len(new_lots)} lots")
print(f" CSV: {lots_csv}")
print(f" JSON: {lots_json}")
else:
print("✓ No new lots to export")
# Save sync state
if new_auctions or new_lots:
save_export_timestamp(current_time)
print()
print("="*60)
print("EXPORT COMPLETE")
print("="*60)
print(f"New auctions: {len(new_auctions)}")
print(f"New lots: {len(new_lots)}")
print()
print("Next export will only include records newer than:")
print(f" {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")
else:
print()
print("="*60)
print("NOTHING TO EXPORT")
print("="*60)
print("All data already exported to server")
return {
'auctions': len(new_auctions),
'lots': len(new_lots),
'files': [str(f) for f in files_created]
}
if __name__ == "__main__":
export_new_data()

View File

@@ -1,53 +0,0 @@
#!/usr/bin/env python3
"""Extract the GraphQL query being used"""
import asyncio
import json
from playwright.async_api import async_playwright
async def main():
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
page = await browser.new_page()
graphql_requests = []
async def capture_request(request):
if 'graphql' in request.url:
graphql_requests.append({
'url': request.url,
'method': request.method,
'post_data': request.post_data,
'headers': dict(request.headers)
})
page.on('request', capture_request)
await page.goto("https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5", wait_until='networkidle')
await asyncio.sleep(2)
print(f"Captured {len(graphql_requests)} GraphQL requests\n")
for i, req in enumerate(graphql_requests):
print(f"{'='*60}")
print(f"REQUEST #{i+1}")
print(f"{'='*60}")
print(f"URL: {req['url']}")
print(f"Method: {req['method']}")
if req['post_data']:
try:
data = json.loads(req['post_data'])
print(f"\nQuery Name: {data.get('operationName', 'N/A')}")
print(f"\nVariables:")
print(json.dumps(data.get('variables', {}), indent=2))
print(f"\nQuery:")
print(data.get('query', '')[:1000])
except:
print(f"\nPOST Data: {req['post_data'][:500]}")
print()
await browser.close()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,45 +0,0 @@
#!/usr/bin/env python3
"""Find viewing/pickup in actual HTML"""
import asyncio
from playwright.async_api import async_playwright
import re
async def main():
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
page = await browser.new_page()
# Try a lot that should have viewing times
await page.goto("https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102", wait_until='networkidle')
# Get text content
text_content = await page.evaluate("document.body.innerText")
print("Searching for viewing/pickup patterns...\n")
# Look for "Bezichtigingen" section
lines = text_content.split('\n')
for i, line in enumerate(lines):
if 'bezichtig' in line.lower() or 'viewing' in line.lower():
# Print surrounding context
context = lines[max(0, i-1):min(len(lines), i+5)]
print("FOUND Bezichtigingen:")
for c in context:
print(f" {c}")
print()
break
# Look for "Ophalen" section
for i, line in enumerate(lines):
if 'ophalen' in line.lower() or 'collection' in line.lower() or 'pickup' in line.lower():
context = lines[max(0, i-1):min(len(lines), i+5)]
print("FOUND Ophalen:")
for c in context:
print(f" {c}")
print()
break
await browser.close()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,166 +0,0 @@
"""
Fetch bid history for existing lots that have bids but no bid history records.
Reads cached lot pages to get lot UUIDs, then calls bid history API.
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
import asyncio
from cache import CacheManager
import sqlite3
import zlib
import json
import re
from bid_history_client import fetch_bid_history, parse_bid_history
async def fetch_missing_bid_history():
"""Fetch bid history for lots that have bids but no history records"""
cache = CacheManager()
conn = sqlite3.connect(cache.db_path)
cursor = conn.cursor()
# Get lots with bids but no bid history
cursor.execute("""
SELECT l.lot_id, l.bid_count
FROM lots l
WHERE l.bid_count > 0
AND l.lot_id NOT IN (SELECT DISTINCT lot_id FROM bid_history)
ORDER BY l.bid_count DESC
""")
lots_to_fetch = cursor.fetchall()
print(f"Found {len(lots_to_fetch)} lots with bids but no bid history")
if not lots_to_fetch:
print("No lots to process!")
conn.close()
return
# Build mapping from lot_id to lot UUID from cached pages
print("Building lot_id -> UUID mapping from cache...")
cursor.execute("""
SELECT url, content
FROM cache
WHERE url LIKE '%/l/%'
""")
lot_id_to_uuid = {}
total_cached = 0
for url, content_blob in cursor:
total_cached += 1
if total_cached % 100 == 0:
print(f"Processed {total_cached} cached pages...", end='\r')
try:
content = zlib.decompress(content_blob).decode('utf-8')
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
if not match:
continue
data = json.loads(match.group(1))
lot = data.get('props', {}).get('pageProps', {}).get('lot', {})
if not lot:
continue
lot_display_id = lot.get('displayId')
lot_uuid = lot.get('id')
if lot_display_id and lot_uuid:
lot_id_to_uuid[lot_display_id] = lot_uuid
except:
continue
print(f"\n\nBuilt UUID mapping for {len(lot_id_to_uuid)} lots")
# Fetch bid history for each lot
print("\nFetching bid history from API...")
fetched = 0
failed = 0
no_uuid = 0
for lot_id, bid_count in lots_to_fetch:
lot_uuid = lot_id_to_uuid.get(lot_id)
if not lot_uuid:
no_uuid += 1
continue
try:
print(f"\nFetching bid history for {lot_id} ({bid_count} bids)...")
bid_history = await fetch_bid_history(lot_uuid)
if bid_history:
bid_data = parse_bid_history(bid_history, lot_id)
# Update lots table with bid intelligence
cursor.execute("""
UPDATE lots
SET first_bid_time = ?,
last_bid_time = ?,
bid_velocity = ?
WHERE lot_id = ?
""", (
bid_data['first_bid_time'],
bid_data['last_bid_time'],
bid_data['bid_velocity'],
lot_id
))
# Save bid history records
cache.save_bid_history(lot_id, bid_data['bid_records'])
fetched += 1
print(f" Saved {len(bid_data['bid_records'])} bid records")
print(f" Bid velocity: {bid_data['bid_velocity']:.2f} bids/hour")
# Commit every 10 lots
if fetched % 10 == 0:
conn.commit()
print(f"\nProgress: {fetched}/{len(lots_to_fetch)} lots processed...")
# Rate limit to be respectful
await asyncio.sleep(0.5)
else:
failed += 1
except Exception as e:
print(f" Error fetching bid history for {lot_id}: {e}")
failed += 1
continue
conn.commit()
print(f"\n\nComplete!")
print(f"Total lots to process: {len(lots_to_fetch)}")
print(f"Successfully fetched: {fetched}")
print(f"Failed: {failed}")
print(f"No UUID found: {no_uuid}")
# Verify fix
cursor.execute("""
SELECT COUNT(DISTINCT lot_id) FROM bid_history
""")
lots_with_history = cursor.fetchone()[0]
cursor.execute("""
SELECT COUNT(*) FROM lots WHERE bid_count > 0
""")
lots_with_bids = cursor.fetchone()[0]
print(f"\nLots with bids: {lots_with_bids}")
print(f"Lots with bid history: {lots_with_history}")
print(f"Coverage: {lots_with_history/lots_with_bids*100:.1f}%")
conn.close()
if __name__ == "__main__":
asyncio.run(fetch_missing_bid_history())

View File

@@ -1,64 +0,0 @@
#!/usr/bin/env python3
"""Find the API endpoint by monitoring network requests"""
import asyncio
import json
from playwright.async_api import async_playwright
async def main():
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
page = await browser.new_page()
requests = []
responses = []
async def log_request(request):
if any(term in request.url for term in ['api', 'graphql', 'lot', 'auction', 'bid']):
requests.append({
'url': request.url,
'method': request.method,
'headers': dict(request.headers),
'post_data': request.post_data
})
async def log_response(response):
if any(term in response.url for term in ['api', 'graphql', 'lot', 'auction', 'bid']):
try:
body = await response.text()
responses.append({
'url': response.url,
'status': response.status,
'body': body[:1000]
})
except:
pass
page.on('request', log_request)
page.on('response', log_response)
print("Loading lot page...")
await page.goto("https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102", wait_until='networkidle')
# Wait for dynamic content
await asyncio.sleep(3)
print(f"\nFound {len(requests)} relevant requests")
print(f"Found {len(responses)} relevant responses\n")
for req in requests[:10]:
print(f"REQUEST: {req['method']} {req['url']}")
if req['post_data']:
print(f" POST DATA: {req['post_data'][:200]}")
print("\n" + "="*60 + "\n")
for resp in responses[:10]:
print(f"RESPONSE: {resp['url']}")
print(f" Status: {resp['status']}")
print(f" Body: {resp['body'][:300]}")
print()
await browser.close()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,70 +0,0 @@
#!/usr/bin/env python3
"""Find API endpoint using a valid lot from database"""
import asyncio
import sqlite3
from playwright.async_api import async_playwright
# Get a valid lot URL
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
cursor = conn.execute("SELECT url FROM lots WHERE url LIKE '%/l/%' LIMIT 5")
lot_urls = [row[0] for row in cursor.fetchall()]
conn.close()
async def main():
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
page = await browser.new_page()
api_calls = []
async def log_response(response):
url = response.url
# Look for API calls
if ('api' in url.lower() or 'graphql' in url.lower() or
'/v2/' in url or '/v3/' in url or '/v4/' in url or
'query' in url.lower() or 'mutation' in url.lower()):
try:
body = await response.text()
api_calls.append({
'url': url,
'status': response.status,
'body': body
})
print(f"\nAPI: {url}")
except:
pass
page.on('response', log_response)
for lot_url in lot_urls[:2]:
print(f"\n{'='*60}")
print(f"Loading: {lot_url}")
print(f"{'='*60}")
try:
await page.goto(lot_url, wait_until='networkidle', timeout=30000)
await asyncio.sleep(2)
# Check if page has bid info
content = await page.content()
if 'currentBid' in content or 'Current bid' in content or 'Huidig bod' in content:
print("[+] Page contains bid information")
break
except Exception as e:
print(f"[!] Error: {e}")
continue
print(f"\n\n{'='*60}")
print(f"CAPTURED {len(api_calls)} API CALLS")
print(f"{'='*60}")
for call in api_calls:
print(f"\n{call['url']}")
print(f"Status: {call['status']}")
if 'json' in call['body'][:100].lower() or call['body'].startswith('{'):
print(f"Body (first 500 chars): {call['body'][:500]}")
await browser.close()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,48 +0,0 @@
#!/usr/bin/env python3
"""Find an auction page with lots data"""
import sqlite3
import zlib
import json
import re
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
cursor = conn.execute("""
SELECT url, content
FROM cache
WHERE url LIKE '%/a/%'
""")
for row in cursor:
url, content_blob = row
content = zlib.decompress(content_blob).decode('utf-8')
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
if not match:
continue
data = json.loads(match.group(1))
page_props = data.get('props', {}).get('pageProps', {})
if 'auction' in page_props:
auction = page_props['auction']
lots = auction.get('lots', [])
if lots and len(lots) > 0:
print(f"Found auction with {len(lots)} lots: {url}\n")
lot = lots[0]
print(f"SAMPLE LOT FROM AUCTION.LOTS[]:")
print(f" displayId: {lot.get('displayId')}")
print(f" title: {lot.get('title', '')[:50]}...")
print(f" urlSlug: {lot.get('urlSlug')}")
print(f"\nBIDDING FIELDS:")
for key in ['currentBid', 'highestBid', 'startingBid', 'minimumBidAmount', 'bidCount', 'numberOfBids']:
print(f" {key}: {lot.get(key)}")
print(f"\nTIMING FIELDS:")
for key in ['endDate', 'startDate', 'closingTime']:
print(f" {key}: {lot.get(key)}")
print(f"\nALL KEYS: {list(lot.keys())[:30]}...")
break
conn.close()

View File

@@ -1,155 +0,0 @@
"""
Fix auctions table by replacing with correct data from cached auction pages.
The auctions table currently has wrong auction_ids (numeric instead of displayId).
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
from cache import CacheManager
import sqlite3
import zlib
import json
import re
from datetime import datetime
def fix_auctions_table():
"""Rebuild auctions table from cached auction pages"""
cache = CacheManager()
conn = sqlite3.connect(cache.db_path)
cursor = conn.cursor()
# Clear existing auctions table
print("Clearing auctions table...")
cursor.execute("DELETE FROM auctions")
conn.commit()
# Get all auction pages from cache
cursor.execute("""
SELECT url, content
FROM cache
WHERE url LIKE '%/a/%'
""")
auction_pages = cursor.fetchall()
print(f"Found {len(auction_pages)} auction pages in cache")
total = 0
inserted = 0
errors = 0
print("Extracting auction data from cached pages...")
for url, content_blob in auction_pages:
total += 1
if total % 10 == 0:
print(f"Processed {total}/{len(auction_pages)}...", end='\r')
try:
# Decompress and parse __NEXT_DATA__
content = zlib.decompress(content_blob).decode('utf-8')
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
if not match:
errors += 1
continue
data = json.loads(match.group(1))
page_props = data.get('props', {}).get('pageProps', {})
auction = page_props.get('auction', {})
if not auction:
errors += 1
continue
# Extract auction data
auction_id = auction.get('displayId')
if not auction_id:
errors += 1
continue
title = auction.get('name', '')
# Get location
location = ''
viewing_days = auction.get('viewingDays', [])
if viewing_days and isinstance(viewing_days, list) and len(viewing_days) > 0:
loc = viewing_days[0]
city = loc.get('city', '')
country = loc.get('countryCode', '').upper()
location = f"{city}, {country}" if city and country else (city or country)
lots_count = auction.get('lotCount', 0)
# Get first lot closing time
first_lot_closing = ''
min_end_date = auction.get('minEndDate', '')
if min_end_date:
# Format timestamp
try:
dt = datetime.fromisoformat(min_end_date.replace('Z', '+00:00'))
first_lot_closing = dt.strftime('%Y-%m-%d %H:%M:%S')
except:
first_lot_closing = min_end_date
scraped_at = datetime.now().isoformat()
# Insert into auctions table
cursor.execute("""
INSERT OR REPLACE INTO auctions
(auction_id, url, title, location, lots_count, first_lot_closing_time, scraped_at)
VALUES (?, ?, ?, ?, ?, ?, ?)
""", (auction_id, url, title, location, lots_count, first_lot_closing, scraped_at))
inserted += 1
except Exception as e:
errors += 1
continue
conn.commit()
print(f"\n\nComplete!")
print(f"Total auction pages processed: {total}")
print(f"Auctions inserted: {inserted}")
print(f"Errors: {errors}")
# Verify fix
cursor.execute("SELECT COUNT(*) FROM auctions")
total_auctions = cursor.fetchone()[0]
print(f"\nTotal auctions in table: {total_auctions}")
cursor.execute("""
SELECT COUNT(*) FROM lots
WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
AND auction_id != ''
""")
orphaned = cursor.fetchone()[0]
print(f"Orphaned lots remaining: {orphaned}")
if orphaned == 0:
print("\nSUCCESS! All lots now have matching auctions!")
else:
# Show sample of remaining orphans
cursor.execute("""
SELECT lot_id, auction_id FROM lots
WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
AND auction_id != ''
LIMIT 5
""")
print("\nSample remaining orphaned lots:")
for lot_id, auction_id in cursor.fetchall():
print(f" {lot_id} -> auction_id: {auction_id}")
# Show what auction_ids we do have
cursor.execute("SELECT auction_id FROM auctions LIMIT 10")
print("\nSample auction_ids in auctions table:")
for row in cursor.fetchall():
print(f" {row[0]}")
conn.close()
if __name__ == "__main__":
fix_auctions_table()

View File

@@ -1,136 +0,0 @@
"""
Fix orphaned lots by updating auction_id from UUID to displayId.
This migration reads cached lot pages and extracts the correct auction displayId.
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
from cache import CacheManager
import sqlite3
import zlib
import json
import re
def fix_orphaned_lots():
"""Update lot auction_id from UUID to auction displayId"""
cache = CacheManager()
conn = sqlite3.connect(cache.db_path)
cursor = conn.cursor()
# Get all lots that need fixing (have UUID auction_id)
cursor.execute("""
SELECT l.lot_id, l.auction_id
FROM lots l
WHERE length(l.auction_id) > 20 -- UUID is longer than displayId like "A1-12345"
""")
lots_to_fix = {lot_id: auction_uuid for lot_id, auction_uuid in cursor.fetchall()}
print(f"Found {len(lots_to_fix)} lots with UUID auction_id that need fixing")
if not lots_to_fix:
print("No lots to fix!")
conn.close()
return
# Build mapping from lot displayId to auction displayId from cached pages
print("Building lot displayId -> auction displayId mapping from cache...")
cursor.execute("""
SELECT url, content
FROM cache
WHERE url LIKE '%/l/%'
""")
lot_to_auction_map = {}
total = 0
errors = 0
for url, content_blob in cursor:
total += 1
if total % 100 == 0:
print(f"Processing cached pages... {total}", end='\r')
try:
# Decompress and parse __NEXT_DATA__
content = zlib.decompress(content_blob).decode('utf-8')
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
if not match:
continue
data = json.loads(match.group(1))
page_props = data.get('props', {}).get('pageProps', {})
lot = page_props.get('lot', {})
auction = page_props.get('auction', {})
if not lot or not auction:
continue
lot_display_id = lot.get('displayId')
auction_display_id = auction.get('displayId')
if lot_display_id and auction_display_id:
lot_to_auction_map[lot_display_id] = auction_display_id
except Exception as e:
errors += 1
continue
print(f"\n\nBuilt mapping for {len(lot_to_auction_map)} lots")
print(f"Errors while parsing: {errors}")
# Now update the lots table
print("\nUpdating lots table...")
updated = 0
not_found = 0
for lot_id, old_auction_uuid in lots_to_fix.items():
if lot_id in lot_to_auction_map:
new_auction_id = lot_to_auction_map[lot_id]
cursor.execute("""
UPDATE lots
SET auction_id = ?
WHERE lot_id = ?
""", (new_auction_id, lot_id))
updated += 1
else:
not_found += 1
if (updated + not_found) % 100 == 0:
print(f"Updated: {updated}, not found: {not_found}", end='\r')
conn.commit()
print(f"\n\nComplete!")
print(f"Total cached pages processed: {total}")
print(f"Lots updated with auction displayId: {updated}")
print(f"Lots not found in cache: {not_found}")
print(f"Parse errors: {errors}")
# Verify fix
cursor.execute("""
SELECT COUNT(*) FROM lots
WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
""")
orphaned = cursor.fetchone()[0]
print(f"\nOrphaned lots remaining: {orphaned}")
if orphaned > 0:
# Show sample of remaining orphans
cursor.execute("""
SELECT lot_id, auction_id FROM lots
WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
LIMIT 5
""")
print("\nSample remaining orphaned lots:")
for lot_id, auction_id in cursor.fetchall():
print(f" {lot_id} -> auction_id: {auction_id}")
conn.close()
if __name__ == "__main__":
fix_orphaned_lots()

View File

@@ -1,69 +0,0 @@
#!/usr/bin/env python3
"""Extract and inspect __NEXT_DATA__ from a cached lot page"""
import sqlite3
import zlib
import json
import re
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
# Get a cached auction page
cursor = conn.execute("""
SELECT url, content
FROM cache
WHERE url LIKE '%/a/%'
LIMIT 1
""")
row = cursor.fetchone()
if not row:
print("No cached lot pages found")
exit(1)
url, content_blob = row
print(f"Inspecting: {url}\n")
# Decompress
content = zlib.decompress(content_blob).decode('utf-8')
# Extract __NEXT_DATA__
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
if not match:
print("No __NEXT_DATA__ found")
exit(1)
data = json.loads(match.group(1))
page_props = data.get('props', {}).get('pageProps', {})
if 'auction' in page_props:
auction = page_props['auction']
print("AUCTION DATA STRUCTURE:")
print("=" * 60)
print(f"displayId: {auction.get('displayId')}")
print(f"name: {auction.get('name', '')[:50]}...")
print(f"lots count: {len(auction.get('lots', []))}")
if auction.get('lots'):
lot = auction['lots'][0]
print(f"\nFIRST LOT STRUCTURE:")
print(f" displayId: {lot.get('displayId')}")
print(f" title: {lot.get('title', '')[:50]}...")
print(f"\n BIDDING:")
print(f" currentBid: {lot.get('currentBid')}")
print(f" highestBid: {lot.get('highestBid')}")
print(f" startingBid: {lot.get('startingBid')}")
print(f" minimumBidAmount: {lot.get('minimumBidAmount')}")
print(f" bidCount: {lot.get('bidCount')}")
print(f" numberOfBids: {lot.get('numberOfBids')}")
print(f" TIMING:")
print(f" endDate: {lot.get('endDate')}")
print(f" startDate: {lot.get('startDate')}")
print(f" closingTime: {lot.get('closingTime')}")
print(f" ALL KEYS: {list(lot.keys())}")
print(f"\nAUCTION TIMING:")
print(f" minEndDate: {auction.get('minEndDate')}")
print(f" maxEndDate: {auction.get('maxEndDate')}")
print(f" ALL KEYS: {list(auction.keys())}")
conn.close()

View File

@@ -1,49 +0,0 @@
#!/usr/bin/env python3
"""Inspect a lot page HTML to find viewing_time and pickup_date"""
import asyncio
from playwright.async_api import async_playwright
async def main():
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
page = await browser.new_page()
# Use the known lot
await page.goto("https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102", wait_until='networkidle')
content = await page.content()
print("Searching for patterns...")
print("="*60)
# Search for viewing time patterns
import re
patterns = {
'Bezichtigingen': r'Bezichtigingen.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
'viewing': r'(?i)viewing.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
'Ophalen': r'Ophalen.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
'pickup': r'(?i)pickup.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
'Status': r'Status\s+([^<]+)',
}
for name, pattern in patterns.items():
matches = re.findall(pattern, content, re.DOTALL | re.MULTILINE)
if matches:
print(f"\n{name}:")
for match in matches[:3]:
print(f" {match[:200]}")
# Also look for structured data
print("\n\nSearching for 'Bezichtigingen' section:")
bez_match = re.search(r'Bezichtigingen.*?<.*?>(.*?)</.*?>', content, re.DOTALL)
if bez_match:
print(bez_match.group(0)[:500])
print("\n\nSearching for 'Ophalen' section:")
oph_match = re.search(r'Ophalen.*?<.*?>(.*?)</.*?>', content, re.DOTALL)
if oph_match:
print(oph_match.group(0)[:500])
await browser.close()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,45 +0,0 @@
#!/usr/bin/env python3
"""Intercept API calls to find where lot data comes from"""
import asyncio
import json
from playwright.async_api import async_playwright
async def main():
async with async_playwright() as p:
browser = await p.chromium.launch(headless=False)
page = await browser.new_page()
# Track API calls
api_calls = []
async def handle_response(response):
if 'api' in response.url.lower() or 'graphql' in response.url.lower():
try:
body = await response.json()
api_calls.append({
'url': response.url,
'status': response.status,
'body': body
})
print(f"\nAPI CALL: {response.url}")
print(f"Status: {response.status}")
if 'lot' in response.url.lower() or 'auction' in response.url.lower():
print(f"Body preview: {json.dumps(body, indent=2)[:500]}")
except:
pass
page.on('response', handle_response)
# Visit auction page
print("Loading auction page...")
await page.goto("https://www.troostwijkauctions.com/a/woonunits-generatoren-reinigingsmachines-en-zakelijke-goederen-A1-37889", wait_until='networkidle')
# Wait a bit for lazy loading
await asyncio.sleep(5)
print(f"\n\nCaptured {len(api_calls)} API calls")
await browser.close()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,148 +0,0 @@
#!/usr/bin/env python3
"""
Migrate existing lot data to extract missing enriched fields
"""
import sqlite3
import json
import re
from datetime import datetime
import sys
sys.path.insert(0, 'src')
from graphql_client import extract_enriched_attributes, extract_attributes_from_lot_json
DB_PATH = "/mnt/okcomputer/output/cache.db"
def migrate_lot_attributes():
"""Extract attributes from cached lot pages"""
print("="*60)
print("MIGRATING EXISTING LOT DATA")
print("="*60)
conn = sqlite3.connect(DB_PATH)
# Get cached lot pages
cursor = conn.execute("""
SELECT url, content, timestamp
FROM cache
WHERE url LIKE '%/l/%'
ORDER BY timestamp DESC
""")
import zlib
updated_count = 0
for url, content_blob, timestamp in cursor:
try:
# Get lot_id from URL
lot_id_match = re.search(r'/l/.*?([A-Z]\d+-\d+-\d+)', url)
if not lot_id_match:
lot_id_match = re.search(r'([A-Z]\d+-\d+-\d+)', url)
if not lot_id_match:
continue
lot_id = lot_id_match.group(1)
# Check if lot exists in database
lot_cursor = conn.execute("SELECT lot_id, title, description FROM lots WHERE lot_id = ?", (lot_id,))
lot_row = lot_cursor.fetchone()
if not lot_row:
continue
_, title, description = lot_row
# Decompress and parse __NEXT_DATA__
content = zlib.decompress(content_blob).decode('utf-8')
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
if not match:
continue
data = json.loads(match.group(1))
lot_json = data.get('props', {}).get('pageProps', {}).get('lot', {})
if not lot_json:
continue
# Extract basic attributes
attrs = extract_attributes_from_lot_json(lot_json)
# Extract enriched attributes
page_data = {'title': title, 'description': description, 'brand': attrs.get('brand', '')}
enriched = extract_enriched_attributes(lot_json, page_data)
# Merge
all_attrs = {**attrs, **enriched}
# Update database
conn.execute("""
UPDATE lots
SET brand = ?,
model = ?,
attributes_json = ?,
year_manufactured = ?,
condition_score = ?,
condition_description = ?,
serial_number = ?,
manufacturer = ?,
damage_description = ?
WHERE lot_id = ?
""", (
all_attrs.get('brand', ''),
all_attrs.get('model', ''),
all_attrs.get('attributes_json', ''),
all_attrs.get('year_manufactured'),
all_attrs.get('condition_score'),
all_attrs.get('condition_description', ''),
all_attrs.get('serial_number', ''),
all_attrs.get('manufacturer', ''),
all_attrs.get('damage_description', ''),
lot_id
))
updated_count += 1
if updated_count % 100 == 0:
print(f" Processed {updated_count} lots...")
conn.commit()
except Exception as e:
print(f" Error processing {url}: {e}")
continue
conn.commit()
print(f"\n✓ Updated {updated_count} lots with enriched attributes")
# Show stats
cursor = conn.execute("""
SELECT
COUNT(*) as total,
SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year,
SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition,
SUM(CASE WHEN manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer,
SUM(CASE WHEN brand != '' THEN 1 ELSE 0 END) as has_brand,
SUM(CASE WHEN model != '' THEN 1 ELSE 0 END) as has_model
FROM lots
""")
stats = cursor.fetchone()
print(f"\nENRICHMENT STATISTICS:")
print(f" Total lots: {stats[0]:,}")
print(f" Has year: {stats[1]:,} ({100*stats[1]/stats[0]:.1f}%)")
print(f" Has condition: {stats[2]:,} ({100*stats[2]/stats[0]:.1f}%)")
print(f" Has manufacturer: {stats[3]:,} ({100*stats[3]/stats[0]:.1f}%)")
print(f" Has brand: {stats[4]:,} ({100*stats[4]/stats[0]:.1f}%)")
print(f" Has model: {stats[5]:,} ({100*stats[5]/stats[0]:.1f}%)")
conn.close()
def main():
print("\nStarting migration of existing data...")
print(f"Database: {DB_PATH}\n")
migrate_lot_attributes()
print(f"\n{'='*60}")
print("MIGRATION COMPLETE")
print(f"{'='*60}\n")
if __name__ == "__main__":
main()

View File

@@ -1,51 +0,0 @@
#!/usr/bin/env python3
"""Scrape a fresh auction page to see the lots array structure"""
import asyncio
import json
import re
from playwright.async_api import async_playwright
async def main():
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
page = await browser.new_page()
# Get first auction
await page.goto("https://www.troostwijkauctions.com/auctions", wait_until='networkidle')
content = await page.content()
# Find first auction link
match = re.search(r'href="(/a/[^"]+)"', content)
if not match:
print("No auction found")
return
auction_url = f"https://www.troostwijkauctions.com{match.group(1)}"
print(f"Scraping: {auction_url}\n")
await page.goto(auction_url, wait_until='networkidle')
content = await page.content()
# Extract __NEXT_DATA__
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
if not match:
print("No __NEXT_DATA__ found")
return
data = json.loads(match.group(1))
page_props = data.get('props', {}).get('pageProps', {})
if 'auction' in page_props:
auction = page_props['auction']
print(f"Auction: {auction.get('name', '')[:50]}...")
print(f"Lots in array: {len(auction.get('lots', []))}")
if auction.get('lots'):
lot = auction['lots'][0]
print(f"\nFIRST LOT:")
print(json.dumps(lot, indent=2)[:1500])
await browser.close()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,47 +0,0 @@
#!/usr/bin/env python3
"""Search cached pages for viewing/pickup text"""
import sqlite3
import zlib
import re
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
cursor = conn.execute("""
SELECT url, content
FROM cache
WHERE url LIKE '%/l/%'
ORDER BY timestamp DESC
LIMIT 20
""")
for url, content_blob in cursor:
try:
content = zlib.decompress(content_blob).decode('utf-8')
# Look for viewing/pickup patterns
if 'bezichtig' in content.lower() or 'ophalen' in content.lower():
print(f"\n{'='*60}")
print(f"URL: {url}")
print(f"{'='*60}")
# Extract sections with context
patterns = [
(r'(Bezichtigingen?.*?(?:\n.*?){0,5})', 'VIEWING'),
(r'(Ophalen.*?(?:\n.*?){0,5})', 'PICKUP'),
]
for pattern, label in patterns:
matches = re.findall(pattern, content, re.IGNORECASE | re.DOTALL)
if matches:
print(f"\n{label}:")
for match in matches[:1]: # First match
# Clean up HTML
clean = re.sub(r'<[^>]+>', ' ', match)
clean = re.sub(r'\s+', ' ', clean).strip()
print(f" {clean[:200]}")
break # Found one, that's enough
except Exception:  # skip cache entries that fail to decompress or decode
continue
conn.close()

View File

@@ -1,47 +0,0 @@
# PowerShell script to create Windows Task Scheduler job for Scaev Monitor
# Run as Administrator
$TaskName = "ScaevAuctionMonitor"
$ScriptPath = "C:\vibe\scaev\src\monitor.py"
$PythonPath = "python3" # Adjust if needed
$WorkingDir = "C:\vibe\scaev"
# Create the action (run Python script)
$Action = New-ScheduledTaskAction -Execute $PythonPath `
-Argument "$ScriptPath 30" `
-WorkingDirectory $WorkingDir
# Trigger: On system startup
$TriggerStartup = New-ScheduledTaskTrigger -AtStartup
# Settings
$Settings = New-ScheduledTaskSettingsSet `
-AllowStartIfOnBatteries `
-DontStopIfGoingOnBatteries `
-StartWhenAvailable `
-RestartCount 3 `
-RestartInterval (New-TimeSpan -Minutes 5)
# Principal: Run with highest privileges
$Principal = New-ScheduledTaskPrincipal -UserId "SYSTEM" -LogonType ServiceAccount -RunLevel Highest
# Register the task
Register-ScheduledTask `
-TaskName $TaskName `
-Action $Action `
-Trigger $TriggerStartup `
-Settings $Settings `
-Principal $Principal `
-Description "Scaev auction monitor - polls for new auctions every 30 minutes" `
-Force
Write-Host "`nTask '$TaskName' created successfully!" -ForegroundColor Green
Write-Host "`nTo manage the task:"
Write-Host " 1. Open Task Scheduler (taskschd.msc)"
Write-Host " 2. Find 'ScaevAuctionMonitor' in Task Scheduler Library"
Write-Host " 3. Right-click to Run, Stop, or Disable"
Write-Host "`nOr use PowerShell commands:"
Write-Host " Start-ScheduledTask -TaskName '$TaskName'"
Write-Host " Stop-ScheduledTask -TaskName '$TaskName'"
Write-Host " Disable-ScheduledTask -TaskName '$TaskName'"
Write-Host " Get-ScheduledTask -TaskName '$TaskName' | Get-ScheduledTaskInfo"

View File

@@ -1,49 +0,0 @@
#!/usr/bin/env python3
"""Show migration statistics"""
import sqlite3
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
cursor = conn.execute("""
SELECT
COUNT(*) as total,
SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year,
SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition,
SUM(CASE WHEN manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer,
SUM(CASE WHEN brand != '' THEN 1 ELSE 0 END) as has_brand,
SUM(CASE WHEN model != '' THEN 1 ELSE 0 END) as has_model
FROM lots
""")
stats = cursor.fetchone()
print("="*60)
print("MIGRATION RESULTS")
print("="*60)
print(f"\nTotal lots: {stats[0]:,}")
print(f"Has year: {stats[1]:,} ({100*stats[1]/stats[0]:.1f}%)")
print(f"Has condition: {stats[2]:,} ({100*stats[2]/stats[0]:.1f}%)")
print(f"Has manufacturer: {stats[3]:,} ({100*stats[3]/stats[0]:.1f}%)")
print(f"Has brand: {stats[4]:,} ({100*stats[4]/stats[0]:.1f}%)")
print(f"Has model: {stats[5]:,} ({100*stats[5]/stats[0]:.1f}%)")
# Show sample enriched data
print(f"\n{'='*60}")
print("SAMPLE ENRICHED LOTS")
print(f"{'='*60}")
cursor = conn.execute("""
SELECT lot_id, year_manufactured, manufacturer, model, condition_score
FROM lots
WHERE year_manufactured IS NOT NULL OR manufacturer != ''
LIMIT 5
""")
for row in cursor:
print(f"\n{row[0]}:")
print(f" Year: {row[1]}")
print(f" Manufacturer: {row[2]}")
print(f" Model: {row[3]}")
print(f" Condition: {row[4]}")
conn.close()

View File

@@ -19,8 +19,9 @@ class CacheManager:
self._init_db()
def _init_db(self):
"""Initialize cache and data storage database"""
"""Initialize cache and data storage database with consolidated schema"""
with sqlite3.connect(self.db_path) as conn:
# Cache table
conn.execute("""
CREATE TABLE IF NOT EXISTS cache (
url TEXT PRIMARY KEY,
@@ -32,6 +33,8 @@ class CacheManager:
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_timestamp ON cache(timestamp)
""")
# Auctions table - consolidated schema
conn.execute("""
CREATE TABLE IF NOT EXISTS auctions (
auction_id TEXT PRIMARY KEY,
@@ -40,9 +43,18 @@ class CacheManager:
location TEXT,
lots_count INTEGER,
first_lot_closing_time TEXT,
scraped_at TEXT
scraped_at TEXT,
city TEXT,
country TEXT,
type TEXT,
lot_count INTEGER DEFAULT 0,
closing_time TEXT,
discovered_at INTEGER
)
""")
conn.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country)")
# Lots table - consolidated schema with all fields from working database
conn.execute("""
CREATE TABLE IF NOT EXISTS lots (
lot_id TEXT PRIMARY KEY,
@@ -50,8 +62,6 @@ class CacheManager:
url TEXT UNIQUE,
title TEXT,
current_bid TEXT,
starting_bid TEXT,
minimum_bid TEXT,
bid_count INTEGER,
closing_time TEXT,
viewing_time TEXT,
@@ -60,9 +70,54 @@ class CacheManager:
description TEXT,
category TEXT,
scraped_at TEXT,
sale_id INTEGER,
manufacturer TEXT,
type TEXT,
year INTEGER,
currency TEXT DEFAULT 'EUR',
closing_notified INTEGER DEFAULT 0,
starting_bid TEXT,
minimum_bid TEXT,
status TEXT,
brand TEXT,
model TEXT,
attributes_json TEXT,
first_bid_time TEXT,
last_bid_time TEXT,
bid_velocity REAL,
bid_increment REAL,
year_manufactured INTEGER,
condition_score REAL,
condition_description TEXT,
serial_number TEXT,
damage_description TEXT,
followers_count INTEGER DEFAULT 0,
estimated_min_price REAL,
estimated_max_price REAL,
lot_condition TEXT,
appearance TEXT,
estimated_min REAL,
estimated_max REAL,
next_bid_step_cents INTEGER,
condition TEXT,
category_path TEXT,
city_location TEXT,
country_code TEXT,
bidding_status TEXT,
packaging TEXT,
quantity INTEGER,
vat REAL,
buyer_premium_percentage REAL,
remarks TEXT,
reserve_price REAL,
reserve_met INTEGER,
view_count INTEGER,
FOREIGN KEY (auction_id) REFERENCES auctions(auction_id)
)
""")
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_sale_id ON lots(sale_id)")
# Images table
conn.execute("""
CREATE TABLE IF NOT EXISTS images (
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -70,86 +125,28 @@ class CacheManager:
url TEXT,
local_path TEXT,
downloaded INTEGER DEFAULT 0,
labels TEXT,
processed_at INTEGER,
FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
)
""")
conn.execute("CREATE INDEX IF NOT EXISTS idx_images_lot_id ON images(lot_id)")
# Add new columns to auctions table if they don't exist
cursor = conn.execute("PRAGMA table_info(auctions)")
auction_columns = {row[1] for row in cursor.fetchall()}
# Remove duplicates before creating unique index
conn.execute("""
DELETE FROM images
WHERE id NOT IN (
SELECT MIN(id)
FROM images
GROUP BY lot_id, url
)
""")
conn.execute("""
CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url
ON images(lot_id, url)
""")
if 'city' not in auction_columns:
conn.execute("ALTER TABLE auctions ADD COLUMN city TEXT")
if 'country' not in auction_columns:
conn.execute("ALTER TABLE auctions ADD COLUMN country TEXT")
if 'type' not in auction_columns:
conn.execute("ALTER TABLE auctions ADD COLUMN type TEXT")
if 'lot_count' not in auction_columns:
conn.execute("ALTER TABLE auctions ADD COLUMN lot_count INTEGER DEFAULT 0")
if 'closing_time' not in auction_columns:
conn.execute("ALTER TABLE auctions ADD COLUMN closing_time TEXT")
if 'discovered_at' not in auction_columns:
conn.execute("ALTER TABLE auctions ADD COLUMN discovered_at INTEGER")
# Add index for country filtering
conn.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country)")
# Add new columns to lots table if they don't exist
cursor = conn.execute("PRAGMA table_info(lots)")
columns = {row[1] for row in cursor.fetchall()}
if 'starting_bid' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN starting_bid TEXT")
if 'minimum_bid' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN minimum_bid TEXT")
if 'status' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN status TEXT")
if 'brand' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN brand TEXT")
if 'model' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN model TEXT")
if 'attributes_json' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN attributes_json TEXT")
# Bidding intelligence fields
if 'first_bid_time' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN first_bid_time TEXT")
if 'last_bid_time' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN last_bid_time TEXT")
if 'bid_velocity' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN bid_velocity REAL")
if 'bid_increment' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN bid_increment REAL")
# Valuation intelligence fields
if 'year_manufactured' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN year_manufactured INTEGER")
if 'condition_score' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN condition_score REAL")
if 'condition_description' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN condition_description TEXT")
if 'serial_number' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN serial_number TEXT")
if 'manufacturer' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN manufacturer TEXT")
if 'damage_description' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN damage_description TEXT")
# NEW: High-value API fields
if 'followers_count' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN followers_count INTEGER DEFAULT 0")
if 'estimated_min_price' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN estimated_min_price REAL")
if 'estimated_max_price' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN estimated_max_price REAL")
if 'lot_condition' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN lot_condition TEXT")
if 'appearance' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN appearance TEXT")
if 'scraped_at_timestamp' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN scraped_at_timestamp INTEGER")
# Create bid_history table
# Bid history table
conn.execute("""
CREATE TABLE IF NOT EXISTS bid_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -163,33 +160,15 @@ class CacheManager:
FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
)
""")
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_bid_history_lot_time
ON bid_history(lot_id, bid_time)
""")
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_bid_history_bidder
ON bid_history(bidder_id)
""")
# Remove duplicates before creating unique index
# Keep the row with the smallest id (first occurrence) for each (lot_id, url) pair
conn.execute("""
DELETE FROM images
WHERE id NOT IN (
SELECT MIN(id)
FROM images
GROUP BY lot_id, url
)
""")
# Now create the unique index
conn.execute("""
CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url
ON images(lot_id, url)
""")
conn.commit()
def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
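Worth noting: CREATE TABLE IF NOT EXISTS leaves an existing lots table untouched, so a database created before this commit will not pick up columns that exist only in the consolidated schema (sale_id, next_bid_step_cents, reserve_price, and so on). A minimal backfill sketch, assuming sqlite3 and a hypothetical helper name ensure_columns:

import sqlite3

def ensure_columns(db_path: str, table: str, columns: dict) -> None:
    """Add any missing columns to an existing table (no-op for columns already present)."""
    with sqlite3.connect(db_path) as conn:
        existing = {row[1] for row in conn.execute(f"PRAGMA table_info({table})")}
        for name, decl in columns.items():
            if name not in existing:
                conn.execute(f"ALTER TABLE {table} ADD COLUMN {name} {decl}")
        conn.commit()

# Example: backfill a few of the lot fields introduced by the consolidated schema
ensure_columns("/mnt/okcomputer/output/cache.db", "lots", {
    "sale_id": "INTEGER",
    "next_bid_step_cents": "INTEGER",
    "reserve_price": "REAL",
})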

View File

@@ -1,256 +0,0 @@
#!/usr/bin/env python3
"""
Sync local database updates to server-compatible format
Creates incremental exports with only NEW or UPDATED records
"""
import sqlite3
import json
import csv
from datetime import datetime
from pathlib import Path
DB_PATH = "C:/mnt/okcomputer/output/cache.db"
OUTPUT_DIR = Path("C:/mnt/okcomputer/output")
def fill_missing_auction_fields():
"""Fill in missing fields in auctions table from scraped data"""
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
print("Filling missing auction fields...")
# Update closing_time from first_lot_closing_time
cursor.execute("""
UPDATE auctions
SET closing_time = first_lot_closing_time
WHERE closing_time IS NULL AND first_lot_closing_time IS NOT NULL
""")
updated = cursor.rowcount
print(f" ✓ Updated {updated} closing_time fields")
# Parse location to extract city and country
cursor.execute("""
SELECT auction_id, location
FROM auctions
WHERE location IS NOT NULL AND (city IS NULL OR country IS NULL)
""")
locations = cursor.fetchall()
city_updates = 0
for auction_id, location in locations:
if not location:
continue
# Parse "City, COUNTRY" or "City, Region, COUNTRY"
parts = [p.strip() for p in location.split(',')]
if len(parts) >= 2:
city = parts[0]
country = parts[-1]
cursor.execute("""
UPDATE auctions
SET city = ?, country = ?
WHERE auction_id = ?
""", (city, country, auction_id))
city_updates += 1
print(f" ✓ Updated {city_updates} city/country fields")
# Set type to 'online' for all (Troostwijk is an online platform)
cursor.execute("""
UPDATE auctions
SET type = 'online'
WHERE type IS NULL
""")
type_updates = cursor.rowcount
print(f" ✓ Updated {type_updates} type fields")
conn.commit()
conn.close()
print(f"✓ Auction fields updated\n")
def get_last_sync_timestamp():
"""Get timestamp of last successful sync"""
sync_file = OUTPUT_DIR / ".last_sync"
if sync_file.exists():
return int(sync_file.read_text().strip())
return 0
def save_sync_timestamp(timestamp: int):
"""Save timestamp of successful sync"""
sync_file = OUTPUT_DIR / ".last_sync"
sync_file.write_text(str(timestamp))
def export_incremental():
"""Export only records that are new or updated since last sync"""
conn = sqlite3.connect(DB_PATH)
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
last_sync = get_last_sync_timestamp()
current_time = int(datetime.now().timestamp())
print(f"Last sync: {datetime.fromtimestamp(last_sync).strftime('%Y-%m-%d %H:%M:%S') if last_sync else 'Never'}")
print(f"Current time: {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")
# Get new/updated auctions
cursor.execute("""
SELECT * FROM auctions
WHERE discovered_at IS NULL OR discovered_at > ?
ORDER BY auction_id
""", (last_sync,))
new_auctions = [dict(row) for row in cursor.fetchall()]
# Get new/updated lots
cursor.execute("""
SELECT * FROM lots
WHERE scraped_at_timestamp IS NULL OR scraped_at_timestamp > ?
ORDER BY lot_id
""", (last_sync,))
new_lots = [dict(row) for row in cursor.fetchall()]
conn.close()
# Export to timestamped files
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
results = {
'auctions': 0,
'lots': 0,
'files': {}
}
# Export auctions if any new
if new_auctions:
auctions_csv = OUTPUT_DIR / f'auctions_update_{timestamp}.csv'
auctions_json = OUTPUT_DIR / f'auctions_update_{timestamp}.json'
with open(auctions_csv, 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=new_auctions[0].keys())
writer.writeheader()
writer.writerows(new_auctions)
with open(auctions_json, 'w', encoding='utf-8') as f:
json.dump(new_auctions, f, indent=2, ensure_ascii=False)
results['auctions'] = len(new_auctions)
results['files']['auctions_csv'] = str(auctions_csv)
results['files']['auctions_json'] = str(auctions_json)
print(f"\n✓ Exported {len(new_auctions)} new/updated auctions")
print(f" CSV: {auctions_csv}")
print(f" JSON: {auctions_json}")
# Export lots if any new
if new_lots:
lots_csv = OUTPUT_DIR / f'lots_update_{timestamp}.csv'
lots_json = OUTPUT_DIR / f'lots_update_{timestamp}.json'
with open(lots_csv, 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=new_lots[0].keys())
writer.writeheader()
writer.writerows(new_lots)
with open(lots_json, 'w', encoding='utf-8') as f:
json.dump(new_lots, f, indent=2, ensure_ascii=False)
results['lots'] = len(new_lots)
results['files']['lots_csv'] = str(lots_csv)
results['files']['lots_json'] = str(lots_json)
print(f"\n✓ Exported {len(new_lots)} new/updated lots")
print(f" CSV: {lots_csv}")
print(f" JSON: {lots_json}")
if not new_auctions and not new_lots:
print("\n✓ No new updates since last sync")
return results
def create_upsert_export():
"""Create SQL script for server to UPSERT (update or insert) data"""
conn = sqlite3.connect(DB_PATH)
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
last_sync = get_last_sync_timestamp()
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# Get new/updated auctions
cursor.execute("""
SELECT * FROM auctions
WHERE discovered_at IS NULL OR discovered_at > ?
""", (last_sync,))
new_auctions = [dict(row) for row in cursor.fetchall()]
if new_auctions:
sql_file = OUTPUT_DIR / f'upsert_auctions_{timestamp}.sql'
with open(sql_file, 'w', encoding='utf-8') as f:
f.write("-- UPSERT script for auctions (updates existing, inserts new)\n\n")
for auction in new_auctions:
# Create INSERT OR REPLACE statement
columns = list(auction.keys())
placeholders = []
for col, val in auction.items():
if val is None:
placeholders.append("NULL")
elif isinstance(val, (int, float)):
placeholders.append(str(val))
else:
# Escape single quotes
escaped = str(val).replace("'", "''")
placeholders.append(f"'{escaped}'")
f.write(f"INSERT OR REPLACE INTO auctions ({', '.join(columns)})\n")
f.write(f"VALUES ({', '.join(placeholders)});\n\n")
print(f"\n✓ Created UPSERT SQL script: {sql_file}")
print(f" Server can execute this to avoid constraint errors")
conn.close()
def main():
"""Main sync process"""
print("="*60)
print("DATABASE SYNC UTILITY")
print("="*60)
print(f"Database: {DB_PATH}")
print(f"Output: {OUTPUT_DIR}")
print("="*60)
# Step 1: Fill missing fields
fill_missing_auction_fields()
# Step 2: Export incremental updates
print("Exporting incremental updates...")
results = export_incremental()
# Step 3: Create UPSERT SQL (prevents constraint errors on server)
if results['auctions'] > 0:
create_upsert_export()
# Step 4: Save sync timestamp
current_time = int(datetime.now().timestamp())
save_sync_timestamp(current_time)
print("\n" + "="*60)
print("SYNC COMPLETE")
print("="*60)
print(f"New auctions: {results['auctions']}")
print(f"New lots: {results['lots']}")
if results['files']:
print("\nFiles ready for server import:")
for key, path in results['files'].items():
print(f" {key}: {path}")
print("\nNext sync will only export records newer than:")
print(f" {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")
if __name__ == "__main__":
main()
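create_upsert_export writes literal INSERT OR REPLACE statements and escapes quotes by hand; if the import side also runs Python with sqlite3, parameterized statements sidestep the escaping entirely. A hedged sketch of a loader for the JSON export (the function name and file handling are assumptions, not part of this script):

import json
import sqlite3

def upsert_auctions_from_json(db_path: str, json_path: str) -> None:
    """UPSERT an auctions_update_*.json export using parameterized SQL."""
    with open(json_path, encoding="utf-8") as f:
        auctions = json.load(f)
    if not auctions:
        return
    columns = list(auctions[0].keys())
    sql = (f"INSERT OR REPLACE INTO auctions ({', '.join(columns)}) "
           f"VALUES ({', '.join('?' for _ in columns)})")
    with sqlite3.connect(db_path) as conn:
        conn.executemany(sql, [tuple(a.get(c) for c in columns) for a in auctions])
        conn.commit()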

View File

@@ -1,28 +0,0 @@
#!/usr/bin/env python3
"""Test auction data fetch"""
import asyncio
import json
import sys
sys.path.insert(0, 'src')
from graphql_client import fetch_auction_data, format_auction_data
async def main():
auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa"
print(f"Fetching auction: {auction_id}\n")
auction_data = await fetch_auction_data(auction_id)
if auction_data:
print("Raw Auction Data:")
print(json.dumps(auction_data, indent=2))
print("\n\nFormatted:")
formatted = format_auction_data(auction_data)
print(f"Viewing: {formatted['viewing_time']}")
print(f"Pickup: {formatted['pickup_date']}")
else:
print("No auction data returned")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,59 +0,0 @@
#!/usr/bin/env python3
"""Test if the auction query works at all"""
import asyncio
import aiohttp
import json
GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"
# Try a simpler query first
SIMPLE_QUERY = """
query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
auction(id: $auctionId, locale: $locale, platform: $platform) {
id
displayId
viewingDays {
startDate
endDate
city
countryCode
}
collectionDays {
startDate
endDate
city
countryCode
}
}
}
"""
async def main():
auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa"
variables = {
"auctionId": auction_id,
"locale": "nl",
"platform": "TWK"
}
payload = {
"query": SIMPLE_QUERY,
"variables": variables
}
async with aiohttp.ClientSession() as session:
async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
print(f"Status: {response.status}")
text = await response.text()
print(f"Response: {text}")
try:
data = await response.json()
print(f"\nParsed:")
print(json.dumps(data, indent=2))
except Exception:  # response body was not valid JSON
pass
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,95 +0,0 @@
#!/usr/bin/env python3
"""Test comprehensive data enrichment"""
import asyncio
import sys
sys.path.insert(0, 'src')
from scraper import TroostwijkScraper
async def main():
scraper = TroostwijkScraper()
from playwright.async_api import async_playwright
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
page = await browser.new_page(
viewport={'width': 1920, 'height': 1080},
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
)
# Test with lot that has bids
lot_url = "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5"
print(f"Testing comprehensive extraction\n")
result = await scraper.crawl_page(page, lot_url)
if result:
print(f"\n{'='*60}")
print("COMPREHENSIVE DATA EXTRACTION:")
print(f"{'='*60}")
print(f"Lot ID: {result.get('lot_id')}")
print(f"Title: {result.get('title', '')[:50]}...")
print(f"\n[Bidding Intelligence]")
print(f" Status: {result.get('status')}")
print(f" Current Bid: {result.get('current_bid')}")
print(f" Starting Bid: {result.get('starting_bid')}")
print(f" Bid Increment: EUR {result.get('bid_increment', 0):.2f}")
print(f" Bid Count: {result.get('bid_count')}")
print(f" First Bid: {result.get('first_bid_time', 'N/A')}")
print(f" Last Bid: {result.get('last_bid_time', 'N/A')}")
print(f" Bid Velocity: {result.get('bid_velocity', 0)} bids/hour")
print(f"\n[Valuation Intelligence]")
print(f" Brand: {result.get('brand', 'N/A')}")
print(f" Model: {result.get('model', 'N/A')}")
print(f" Year: {result.get('year_manufactured', 'N/A')}")
print(f" Manufacturer: {result.get('manufacturer', 'N/A')}")
print(f" Condition Score: {result.get('condition_score', 'N/A')}")
print(f" Condition: {result.get('condition_description', 'N/A')}")
print(f" Serial#: {result.get('serial_number', 'N/A')}")
print(f" Damage: {result.get('damage_description', 'N/A')[:50] if result.get('damage_description') else 'N/A'}...")
await browser.close()
# Verify database
import sqlite3
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
# Check lot data
cursor = conn.execute("""
SELECT bid_velocity, first_bid_time, year_manufactured, condition_score
FROM lots
WHERE lot_id = ?
""", (result.get('lot_id'),))
row = cursor.fetchone()
if row:
print(f"\n{'='*60}")
print("DATABASE VERIFICATION (lots table):")
print(f"{'='*60}")
print(f" Bid Velocity: {row[0]}")
print(f" First Bid Time: {row[1]}")
print(f" Year: {row[2]}")
print(f" Condition Score: {row[3]}")
# Check bid history
cursor = conn.execute("""
SELECT COUNT(*), MIN(bid_time), MAX(bid_time), SUM(is_autobid)
FROM bid_history
WHERE lot_id = ?
""", (result.get('lot_id'),))
row = cursor.fetchone()
if row and row[0] > 0:
print(f"\n{'='*60}")
print("DATABASE VERIFICATION (bid_history table):")
print(f"{'='*60}")
print(f" Total Bids Stored: {row[0]}")
print(f" First Bid: {row[1]}")
print(f" Last Bid: {row[2]}")
print(f" Autobids: {row[3]}")
conn.close()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,49 +0,0 @@
#!/usr/bin/env python3
"""Test concurrent image downloads"""
import asyncio
import time
import sys
sys.path.insert(0, 'src')
from scraper import TroostwijkScraper
async def main():
scraper = TroostwijkScraper()
from playwright.async_api import async_playwright
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
page = await browser.new_page(
viewport={'width': 1920, 'height': 1080},
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
)
# Test with a lot that has multiple images
lot_url = "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5"
print(f"Testing concurrent image downloads\n")
print(f"Lot: {lot_url}\n")
start_time = time.time()
result = await scraper.crawl_page(page, lot_url)
elapsed = time.time() - start_time
print(f"\n{'='*60}")
print(f"TIMING RESULTS:")
print(f"{'='*60}")
print(f"Total time: {elapsed:.2f}s")
image_count = len(result.get('images', []))
print(f"Images: {image_count}")
if image_count > 1:
print(f"Time per image: {elapsed/image_count:.2f}s (if sequential)")
print(f"Actual time: {elapsed:.2f}s (concurrent!)")
speedup = (image_count * 0.5) / elapsed if elapsed > 0 else 1
print(f"Speedup factor: {speedup:.1f}x")
await browser.close()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,66 +0,0 @@
#!/usr/bin/env python3
"""Test the full scraper with one lot"""
import asyncio
import sys
sys.path.insert(0, 'src')
from scraper import TroostwijkScraper
async def main():
scraper = TroostwijkScraper()
from playwright.async_api import async_playwright
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
page = await browser.new_page(
viewport={'width': 1920, 'height': 1080},
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
)
# Test with a known lot
lot_url = "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5"
print(f"Testing with: {lot_url}\n")
result = await scraper.crawl_page(page, lot_url)
if result:
print(f"\n{'='*60}")
print("FINAL RESULT:")
print(f"{'='*60}")
print(f"Lot ID: {result.get('lot_id')}")
print(f"Title: {result.get('title', '')[:50]}...")
print(f"Current Bid: {result.get('current_bid')}")
print(f"Starting Bid: {result.get('starting_bid')}")
print(f"Minimum Bid: {result.get('minimum_bid')}")
print(f"Bid Count: {result.get('bid_count')}")
print(f"Closing Time: {result.get('closing_time')}")
print(f"Viewing Time: {result.get('viewing_time', 'N/A')}")
print(f"Pickup Date: {result.get('pickup_date', 'N/A')}")
print(f"Location: {result.get('location')}")
await browser.close()
# Verify database
import sqlite3
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
cursor = conn.execute("""
SELECT current_bid, starting_bid, minimum_bid, bid_count, closing_time
FROM lots
WHERE lot_id = 'A1-28505-5'
""")
row = cursor.fetchone()
conn.close()
if row:
print(f"\n{'='*60}")
print("DATABASE VERIFICATION:")
print(f"{'='*60}")
print(f"Current Bid: {row[0]}")
print(f"Starting Bid: {row[1]}")
print(f"Minimum Bid: {row[2]}")
print(f"Bid Count: {row[3]}")
print(f"Closing Time: {row[4]}")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,32 +0,0 @@
#!/usr/bin/env python3
"""Test the updated scraper with GraphQL integration"""
import asyncio
import sys
sys.path.insert(0, 'src')
from graphql_client import fetch_lot_bidding_data, format_bid_data
async def main():
# Test with known lot ID
lot_id = "A1-28505-5"
print(f"Testing GraphQL API with lot: {lot_id}\n")
bidding_data = await fetch_lot_bidding_data(lot_id)
if bidding_data:
print("Raw GraphQL Response:")
print("="*60)
import json
print(json.dumps(bidding_data, indent=2))
print("\n\nFormatted Data:")
print("="*60)
formatted = format_bid_data(bidding_data)
for key, value in formatted.items():
print(f" {key}: {value}")
else:
print("Failed to fetch bidding data")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,43 +0,0 @@
#!/usr/bin/env python3
"""Test scraping a single live lot page"""
import asyncio
import sys
sys.path.insert(0, 'src')
from scraper import TroostwijkScraper
async def main():
scraper = TroostwijkScraper()
from playwright.async_api import async_playwright
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
page = await browser.new_page()
# Get a lot URL from the database
import sqlite3
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
cursor = conn.execute("SELECT url FROM lots LIMIT 1")
row = cursor.fetchone()
conn.close()
if not row:
print("No lots in database")
return
lot_url = row[0]
print(f"Fetching: {lot_url}\n")
result = await scraper.crawl_page(page, lot_url)
if result:
print(f"\nExtracted Data:")
print(f" current_bid: {result.get('current_bid')}")
print(f" bid_count: {result.get('bid_count')}")
print(f" closing_time: {result.get('closing_time')}")
await browser.close()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,64 +0,0 @@
#!/usr/bin/env python3
"""Test the new fields extraction"""
import asyncio
import sys
sys.path.insert(0, 'src')
from scraper import TroostwijkScraper
async def main():
scraper = TroostwijkScraper()
from playwright.async_api import async_playwright
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
page = await browser.new_page(
viewport={'width': 1920, 'height': 1080},
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
)
# Test with lot that has attributes
lot_url = "https://www.troostwijkauctions.com/l/47-5kg-hexagon-dumbbell-%25282x%2529-A1-40668-34"
print(f"Testing new fields with: {lot_url}\n")
result = await scraper.crawl_page(page, lot_url)
if result:
print(f"\n{'='*60}")
print("EXTRACTED FIELDS:")
print(f"{'='*60}")
print(f"Lot ID: {result.get('lot_id')}")
print(f"Title: {result.get('title', '')[:50]}...")
print(f"Status: {result.get('status')}")
print(f"Brand: {result.get('brand')}")
print(f"Model: {result.get('model')}")
print(f"Viewing Time: {result.get('viewing_time', 'N/A')}")
print(f"Pickup Date: {result.get('pickup_date', 'N/A')}")
print(f"Attributes: {result.get('attributes_json', '')[:100]}...")
await browser.close()
# Verify database
import sqlite3
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
cursor = conn.execute("""
SELECT status, brand, model, viewing_time, pickup_date
FROM lots
WHERE lot_id = ?
""", (result.get('lot_id'),))
row = cursor.fetchone()
conn.close()
if row:
print(f"\n{'='*60}")
print("DATABASE VERIFICATION:")
print(f"{'='*60}")
print(f"Status: {row[0]}")
print(f"Brand: {row[1]}")
print(f"Model: {row[2]}")
print(f"Viewing: {row[3][:100] if row[3] else 'N/A'}...")
print(f"Pickup: {row[4][:100] if row[4] else 'N/A'}...")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,306 +0,0 @@
"""
Validate data quality and completeness in the database.
Checks if scraped data matches expectations and API capabilities.
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
import sqlite3
from datetime import datetime
from typing import Dict, List, Tuple
from cache import CacheManager
cache = CacheManager()
DB_PATH = cache.db_path
def get_db_stats() -> Dict:
"""Get comprehensive database statistics"""
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
stats = {}
# Total counts
stats['total_auctions'] = cursor.execute("SELECT COUNT(*) FROM auctions").fetchone()[0]
stats['total_lots'] = cursor.execute("SELECT COUNT(*) FROM lots").fetchone()[0]
stats['total_images'] = cursor.execute("SELECT COUNT(*) FROM images").fetchone()[0]
stats['total_bid_history'] = cursor.execute("SELECT COUNT(*) FROM bid_history").fetchone()[0]
# Auctions completeness
cursor.execute("""
SELECT
COUNT(*) as total,
SUM(CASE WHEN title IS NOT NULL AND title != '' THEN 1 ELSE 0 END) as has_title,
SUM(CASE WHEN lots_count IS NOT NULL THEN 1 ELSE 0 END) as has_lots_count,
SUM(CASE WHEN closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_closing_time,
SUM(CASE WHEN first_lot_closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_first_lot_closing
FROM auctions
""")
row = cursor.fetchone()
stats['auctions'] = {
'total': row[0],
'has_title': row[1],
'has_lots_count': row[2],
'has_closing_time': row[3],
'has_first_lot_closing': row[4]
}
# Lots completeness - Core fields
cursor.execute("""
SELECT
COUNT(*) as total,
SUM(CASE WHEN title IS NOT NULL AND title != '' THEN 1 ELSE 0 END) as has_title,
SUM(CASE WHEN current_bid IS NOT NULL THEN 1 ELSE 0 END) as has_current_bid,
SUM(CASE WHEN starting_bid IS NOT NULL THEN 1 ELSE 0 END) as has_starting_bid,
SUM(CASE WHEN minimum_bid IS NOT NULL THEN 1 ELSE 0 END) as has_minimum_bid,
SUM(CASE WHEN bid_count IS NOT NULL AND bid_count > 0 THEN 1 ELSE 0 END) as has_bids,
SUM(CASE WHEN closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_closing_time,
SUM(CASE WHEN status IS NOT NULL AND status != '' THEN 1 ELSE 0 END) as has_status
FROM lots
""")
row = cursor.fetchone()
stats['lots_core'] = {
'total': row[0],
'has_title': row[1],
'has_current_bid': row[2],
'has_starting_bid': row[3],
'has_minimum_bid': row[4],
'has_bids': row[5],
'has_closing_time': row[6],
'has_status': row[7]
}
# Lots completeness - Enriched fields
cursor.execute("""
SELECT
COUNT(*) as total,
SUM(CASE WHEN brand IS NOT NULL AND brand != '' THEN 1 ELSE 0 END) as has_brand,
SUM(CASE WHEN model IS NOT NULL AND model != '' THEN 1 ELSE 0 END) as has_model,
SUM(CASE WHEN manufacturer IS NOT NULL AND manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer,
SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year,
SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition_score,
SUM(CASE WHEN condition_description IS NOT NULL AND condition_description != '' THEN 1 ELSE 0 END) as has_condition_desc,
SUM(CASE WHEN serial_number IS NOT NULL AND serial_number != '' THEN 1 ELSE 0 END) as has_serial,
SUM(CASE WHEN damage_description IS NOT NULL AND damage_description != '' THEN 1 ELSE 0 END) as has_damage
FROM lots
""")
row = cursor.fetchone()
stats['lots_enriched'] = {
'total': row[0],
'has_brand': row[1],
'has_model': row[2],
'has_manufacturer': row[3],
'has_year': row[4],
'has_condition_score': row[5],
'has_condition_desc': row[6],
'has_serial': row[7],
'has_damage': row[8]
}
# Lots completeness - Bid intelligence
cursor.execute("""
SELECT
COUNT(*) as total,
SUM(CASE WHEN first_bid_time IS NOT NULL THEN 1 ELSE 0 END) as has_first_bid_time,
SUM(CASE WHEN last_bid_time IS NOT NULL THEN 1 ELSE 0 END) as has_last_bid_time,
SUM(CASE WHEN bid_velocity IS NOT NULL THEN 1 ELSE 0 END) as has_bid_velocity,
SUM(CASE WHEN bid_increment IS NOT NULL THEN 1 ELSE 0 END) as has_bid_increment
FROM lots
""")
row = cursor.fetchone()
stats['lots_bid_intelligence'] = {
'total': row[0],
'has_first_bid_time': row[1],
'has_last_bid_time': row[2],
'has_bid_velocity': row[3],
'has_bid_increment': row[4]
}
# Bid history stats
cursor.execute("""
SELECT
COUNT(DISTINCT lot_id) as lots_with_history,
COUNT(*) as total_bids,
SUM(CASE WHEN is_autobid = 1 THEN 1 ELSE 0 END) as autobids,
SUM(CASE WHEN bidder_id IS NOT NULL THEN 1 ELSE 0 END) as has_bidder_id
FROM bid_history
""")
row = cursor.fetchone()
stats['bid_history'] = {
'lots_with_history': row[0],
'total_bids': row[1],
'autobids': row[2],
'has_bidder_id': row[3]
}
# Image stats
cursor.execute("""
SELECT
COUNT(DISTINCT lot_id) as lots_with_images,
COUNT(*) as total_images,
SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded_images,
SUM(CASE WHEN local_path IS NOT NULL THEN 1 ELSE 0 END) as has_local_path
FROM images
""")
row = cursor.fetchone()
stats['images'] = {
'lots_with_images': row[0],
'total_images': row[1],
'downloaded_images': row[2],
'has_local_path': row[3]
}
conn.close()
return stats
def check_data_quality() -> List[Tuple[str, str, str]]:
"""Check for data quality issues"""
issues = []
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Check for lots without auction
cursor.execute("""
SELECT COUNT(*) FROM lots
WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
""")
orphaned_lots = cursor.fetchone()[0]
if orphaned_lots > 0:
issues.append(("ERROR", "Orphaned Lots", f"{orphaned_lots} lots without matching auction"))
# Check for lots with bids but no bid history
cursor.execute("""
SELECT COUNT(*) FROM lots
WHERE bid_count > 0
AND lot_id NOT IN (SELECT DISTINCT lot_id FROM bid_history)
""")
missing_history = cursor.fetchone()[0]
if missing_history > 0:
issues.append(("WARNING", "Missing Bid History", f"{missing_history} lots have bids but no bid history records"))
# Check for lots with closing time in past but still active
cursor.execute("""
SELECT COUNT(*) FROM lots
WHERE closing_time IS NOT NULL
AND closing_time < datetime('now')
AND status NOT LIKE '%gesloten%'
""")
past_closing = cursor.fetchone()[0]
if past_closing > 0:
issues.append(("INFO", "Past Closing Time", f"{past_closing} lots have closing time in past"))
# Check for duplicate lot_ids
cursor.execute("""
SELECT lot_id, COUNT(*) FROM lots
GROUP BY lot_id
HAVING COUNT(*) > 1
""")
duplicates = cursor.fetchall()
if duplicates:
issues.append(("ERROR", "Duplicate Lot IDs", f"{len(duplicates)} duplicate lot_id values found"))
# Check for lots without images
cursor.execute("""
SELECT COUNT(*) FROM lots
WHERE lot_id NOT IN (SELECT DISTINCT lot_id FROM images)
""")
no_images = cursor.fetchone()[0]
if no_images > 0:
issues.append(("WARNING", "No Images", f"{no_images} lots have no images"))
conn.close()
return issues
def print_validation_report():
"""Print comprehensive validation report"""
print("=" * 80)
print("DATABASE VALIDATION REPORT")
print("=" * 80)
print()
stats = get_db_stats()
# Overall counts
print("OVERALL COUNTS:")
print(f" Auctions: {stats['total_auctions']:,}")
print(f" Lots: {stats['total_lots']:,}")
print(f" Images: {stats['total_images']:,}")
print(f" Bid History Records: {stats['total_bid_history']:,}")
print()
# Auctions completeness
print("AUCTIONS COMPLETENESS:")
a = stats['auctions']
print(f" Title: {a['has_title']:,} / {a['total']:,} ({a['has_title']/a['total']*100:.1f}%)")
print(f" Lots Count: {a['has_lots_count']:,} / {a['total']:,} ({a['has_lots_count']/a['total']*100:.1f}%)")
print(f" Closing Time: {a['has_closing_time']:,} / {a['total']:,} ({a['has_closing_time']/a['total']*100:.1f}%)")
print(f" First Lot Closing: {a['has_first_lot_closing']:,} / {a['total']:,} ({a['has_first_lot_closing']/a['total']*100:.1f}%)")
print()
# Lots core completeness
print("LOTS CORE FIELDS:")
l = stats['lots_core']
print(f" Title: {l['has_title']:,} / {l['total']:,} ({l['has_title']/l['total']*100:.1f}%)")
print(f" Current Bid: {l['has_current_bid']:,} / {l['total']:,} ({l['has_current_bid']/l['total']*100:.1f}%)")
print(f" Starting Bid: {l['has_starting_bid']:,} / {l['total']:,} ({l['has_starting_bid']/l['total']*100:.1f}%)")
print(f" Minimum Bid: {l['has_minimum_bid']:,} / {l['total']:,} ({l['has_minimum_bid']/l['total']*100:.1f}%)")
print(f" Has Bids (>0): {l['has_bids']:,} / {l['total']:,} ({l['has_bids']/l['total']*100:.1f}%)")
print(f" Closing Time: {l['has_closing_time']:,} / {l['total']:,} ({l['has_closing_time']/l['total']*100:.1f}%)")
print(f" Status: {l['has_status']:,} / {l['total']:,} ({l['has_status']/l['total']*100:.1f}%)")
print()
# Lots enriched fields
print("LOTS ENRICHED FIELDS:")
e = stats['lots_enriched']
print(f" Brand: {e['has_brand']:,} / {e['total']:,} ({e['has_brand']/e['total']*100:.1f}%)")
print(f" Model: {e['has_model']:,} / {e['total']:,} ({e['has_model']/e['total']*100:.1f}%)")
print(f" Manufacturer: {e['has_manufacturer']:,} / {e['total']:,} ({e['has_manufacturer']/e['total']*100:.1f}%)")
print(f" Year: {e['has_year']:,} / {e['total']:,} ({e['has_year']/e['total']*100:.1f}%)")
print(f" Condition Score: {e['has_condition_score']:,} / {e['total']:,} ({e['has_condition_score']/e['total']*100:.1f}%)")
print(f" Condition Desc: {e['has_condition_desc']:,} / {e['total']:,} ({e['has_condition_desc']/e['total']*100:.1f}%)")
print(f" Serial Number: {e['has_serial']:,} / {e['total']:,} ({e['has_serial']/e['total']*100:.1f}%)")
print(f" Damage Desc: {e['has_damage']:,} / {e['total']:,} ({e['has_damage']/e['total']*100:.1f}%)")
print()
# Bid intelligence
print("LOTS BID INTELLIGENCE:")
b = stats['lots_bid_intelligence']
print(f" First Bid Time: {b['has_first_bid_time']:,} / {b['total']:,} ({b['has_first_bid_time']/b['total']*100:.1f}%)")
print(f" Last Bid Time: {b['has_last_bid_time']:,} / {b['total']:,} ({b['has_last_bid_time']/b['total']*100:.1f}%)")
print(f" Bid Velocity: {b['has_bid_velocity']:,} / {b['total']:,} ({b['has_bid_velocity']/b['total']*100:.1f}%)")
print(f" Bid Increment: {b['has_bid_increment']:,} / {b['total']:,} ({b['has_bid_increment']/b['total']*100:.1f}%)")
print()
# Bid history
print("BID HISTORY:")
h = stats['bid_history']
print(f" Lots with History: {h['lots_with_history']:,}")
print(f" Total Bid Records: {h['total_bids']:,}")
print(f" Autobids: {h['autobids']:,} ({h['autobids']/max(h['total_bids'],1)*100:.1f}%)")
print(f" Has Bidder ID: {h['has_bidder_id']:,} ({h['has_bidder_id']/max(h['total_bids'],1)*100:.1f}%)")
print()
# Images
print("IMAGES:")
i = stats['images']
print(f" Lots with Images: {i['lots_with_images']:,}")
print(f" Total Images: {i['total_images']:,}")
print(f" Downloaded: {i['downloaded_images']:,} ({i['downloaded_images']/max(i['total_images'],1)*100:.1f}%)")
print(f" Has Local Path: {i['has_local_path']:,} ({i['has_local_path']/max(i['total_images'],1)*100:.1f}%)")
print()
# Data quality issues
print("=" * 80)
print("DATA QUALITY ISSUES:")
print("=" * 80)
issues = check_data_quality()
if issues:
for severity, category, message in issues:
print(f" [{severity}] {category}: {message}")
else:
print(" No issues found!")
print()
if __name__ == "__main__":
print_validation_report()
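The auctions and lots sections above divide by the raw totals while the bid-history and image sections guard with max(..., 1); a small helper would make the percentage lines uniform and safe on an empty database (a sketch; the name pct is illustrative):

def pct(part: int, whole: int) -> str:
    """Format 'part / whole (xx.x%)' without dividing by zero on an empty table."""
    return f"{part:,} / {whole:,} ({100.0 * part / max(whole, 1):.1f}%)"

# e.g. print(f"  Title: {pct(l['has_title'], l['total'])}")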

View File

@@ -1,92 +0,0 @@
#!/usr/bin/env python3
"""
Verification script to check image download status and duplicates
Run this after deployment to verify the scraper is working correctly
"""
import sqlite3
import sys
from pathlib import Path
DB_PATH = "/mnt/okcomputer/output/cache.db"
def verify_database():
"""Run verification queries on the database"""
if not Path(DB_PATH).exists():
print(f"❌ Database not found: {DB_PATH}")
sys.exit(1)
conn = sqlite3.connect(DB_PATH)
print("=" * 60)
print("IMAGE DOWNLOAD VERIFICATION")
print("=" * 60)
# Check download success rate
print("\n[*] Download Success Rate:")
cursor = conn.execute("""
SELECT
COUNT(*) as total_images,
SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded,
SUM(CASE WHEN downloaded = 0 THEN 1 ELSE 0 END) as failed,
ROUND(100.0 * SUM(downloaded) / COUNT(*), 2) as success_rate
FROM images
""")
row = cursor.fetchone()
print(f" Total images: {row[0]:,}")
print(f" Downloaded: {row[1]:,}")
print(f" Not downloaded: {row[2]:,}")
print(f" Success rate: {row[3]}%")
# Check for duplicates
print("\n[*] Duplicate Check:")
cursor = conn.execute("""
SELECT lot_id, url, COUNT(*) as dup_count
FROM images
GROUP BY lot_id, url
HAVING COUNT(*) > 1
LIMIT 5
""")
duplicates = cursor.fetchall()
if duplicates:
print(f" [!] Found {len(duplicates)} duplicate entries!")
for lot_id, url, count in duplicates:
print(f" {lot_id}: {url[:50]}... (x{count})")
else:
print(" [+] No duplicates found!")
# Verify file system
print("\n[*] File System Verification:")
cursor = conn.execute("""
SELECT COUNT(*)
FROM images
WHERE downloaded = 1
AND local_path IS NOT NULL
AND local_path != ''
""")
files_with_path = cursor.fetchone()[0]
print(f" Images with local_path: {files_with_path:,}")
# Sample some downloaded images
print("\n[*] Sample Downloaded Images:")
cursor = conn.execute("""
SELECT lot_id, local_path
FROM images
WHERE downloaded = 1
AND local_path IS NOT NULL
LIMIT 5
""")
samples = cursor.fetchall()
for lot_id, path in samples:
exists = "[+]" if Path(path).exists() else "[!]"
print(f" {exists} {lot_id}: {path}")
conn.close()
print("\n" + "=" * 60)
print("VERIFICATION COMPLETE")
print("=" * 60)
if __name__ == "__main__":
verify_database()