enrich data
@@ -1,54 +0,0 @@
#!/usr/bin/env python3
"""Check for Apollo state or other embedded data"""
import asyncio
import json
import re
from playwright.async_api import async_playwright


async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        await page.goto("https://www.troostwijkauctions.com/a/woonunits-generatoren-reinigingsmachines-en-zakelijke-goederen-A1-37889", wait_until='networkidle')
        content = await page.content()

        # Look for embedded data structures
        patterns = [
            (r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', "NEXT_DATA"),
            (r'window\.__APOLLO_STATE__\s*=\s*({.+?});', "APOLLO_STATE"),
            (r'"lots"\s*:\s*\[(.+?)\]', "LOTS_ARRAY"),
        ]

        for pattern, name in patterns:
            match = re.search(pattern, content, re.DOTALL)
            if match:
                print(f"\n{'='*60}")
                print(f"FOUND: {name}")
                print(f"{'='*60}")
                try:
                    if name == "LOTS_ARRAY":
                        print(f"Preview: {match.group(1)[:500]}")
                    else:
                        data = json.loads(match.group(1))
                        print(json.dumps(data, indent=2)[:2000])
                except:
                    print(f"Preview: {match.group(1)[:1000]}")

        # Also check for any script tags with "lot" and "bid" and "end"
        print(f"\n{'='*60}")
        print("SEARCHING FOR LOT DATA IN ALL SCRIPTS")
        print(f"{'='*60}")

        scripts = re.findall(r'<script[^>]*>(.+?)</script>', content, re.DOTALL)
        for i, script in enumerate(scripts):
            if all(term in script.lower() for term in ['lot', 'bid', 'end']):
                print(f"\nScript #{i} (first 500 chars):")
                print(script[:500])
            if i > 3:  # Limit output
                break

        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
@@ -1,54 +0,0 @@
#!/usr/bin/env python3
"""Check current data quality in cache.db"""
import sqlite3

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

print("=" * 60)
print("CURRENT DATA QUALITY CHECK")
print("=" * 60)

# Check lots table
print("\n[*] Sample Lot Data:")
cursor = conn.execute("""
    SELECT lot_id, current_bid, bid_count, closing_time
    FROM lots
    LIMIT 10
""")
for row in cursor:
    print(f" Lot: {row[0]}")
    print(f" Current Bid: {row[1]}")
    print(f" Bid Count: {row[2]}")
    print(f" Closing Time: {row[3]}")

# Check auctions table
print("\n[*] Sample Auction Data:")
cursor = conn.execute("""
    SELECT auction_id, title, closing_time, first_lot_closing_time
    FROM auctions
    LIMIT 5
""")
for row in cursor:
    print(f" Auction: {row[0]}")
    print(f" Title: {row[1][:50]}...")
    print(f" Closing Time: {row[2] if len(row) > 2 else 'N/A'}")
    print(f" First Lot Closing: {row[3]}")

# Data completeness stats
print("\n[*] Data Completeness:")
cursor = conn.execute("""
    SELECT
        COUNT(*) as total,
        SUM(CASE WHEN current_bid IS NULL OR current_bid = '' THEN 1 ELSE 0 END) as missing_current_bid,
        SUM(CASE WHEN closing_time IS NULL OR closing_time = '' THEN 1 ELSE 0 END) as missing_closing_time,
        SUM(CASE WHEN bid_count IS NULL OR bid_count = 0 THEN 1 ELSE 0 END) as zero_bid_count
    FROM lots
""")
row = cursor.fetchone()
print(f" Total lots: {row[0]:,}")
print(f" Missing current_bid: {row[1]:,} ({100*row[1]/row[0]:.1f}%)")
print(f" Missing closing_time: {row[2]:,} ({100*row[2]/row[0]:.1f}%)")
print(f" Zero bid_count: {row[3]:,} ({100*row[3]/row[0]:.1f}%)")

conn.close()
print("\n" + "=" * 60)
@@ -1,67 +0,0 @@
#!/usr/bin/env python3
"""Check if GraphQL has viewing/pickup data"""
import asyncio
import json
import sys
sys.path.insert(0, 'src')

from graphql_client import GRAPHQL_ENDPOINT
import aiohttp

# Expanded query to check for all available fields
EXTENDED_QUERY = """
query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
  lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
    lot {
      id
      displayId
      auctionId
      currentBidAmount { cents currency }
      initialAmount { cents currency }
      nextMinimalBid { cents currency }
      bidsCount
      startDate
      endDate

      # Try to find viewing/pickup fields
      viewingDays { startDate endDate city countryCode }
      collectionDays { startDate endDate city countryCode }
      pickupDays { startDate endDate city countryCode }
    }
    auction {
      id
      displayId
      viewingDays { startDate endDate city countryCode }
      collectionDays { startDate endDate city countryCode }
    }
  }
}
"""


async def main():
    variables = {
        "lotDisplayId": "A1-28505-5",
        "locale": "nl",
        "platform": "TWK"
    }

    payload = {
        "query": EXTENDED_QUERY,
        "variables": variables
    }

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
                if response.status == 200:
                    data = await response.json()
                    print("Full GraphQL Response:")
                    print(json.dumps(data, indent=2))
                else:
                    print(f"Error: {response.status}")
                    print(await response.text())
    except Exception as e:
        print(f"Exception: {e}")


if __name__ == "__main__":
    asyncio.run(main())
@@ -1,72 +0,0 @@
"""Check how lots link to auctions"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from cache import CacheManager
import sqlite3
import zlib
import json
import re

cache = CacheManager()
conn = sqlite3.connect(cache.db_path)
cursor = conn.cursor()

# Get a lot page from cache
cursor.execute("SELECT url, content FROM cache WHERE url LIKE '%/l/%' LIMIT 1")
url, content_blob = cursor.fetchone()
content = zlib.decompress(content_blob).decode('utf-8')

# Extract __NEXT_DATA__
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
data = json.loads(match.group(1))

props = data.get('props', {}).get('pageProps', {})
print("PageProps keys:", list(props.keys()))

lot = props.get('lot', {})
print("\nLot data:")
print(f" displayId: {lot.get('displayId')}")
print(f" auctionId (UUID): {lot.get('auctionId')}")

# Check if auction data is also included
auction = props.get('auction')
if auction:
    print("\nAuction data IS included in lot page!")
    print(f" Auction displayId: {auction.get('displayId')}")
    print(f" Auction id (UUID): {auction.get('id')}")
    print(f" Auction name: {auction.get('name', '')[:60]}")
else:
    print("\nAuction data NOT included in lot page")
    print("Need to look up auction by UUID")

# Check if we can find the auction by UUID
lot_auction_uuid = lot.get('auctionId')
if lot_auction_uuid:
    # Try to find auction page with this UUID
    cursor.execute("""
        SELECT url, content FROM cache
        WHERE url LIKE '%/a/%'
        LIMIT 10
    """)

    found_match = False
    for auction_url, auction_content_blob in cursor.fetchall():
        auction_content = zlib.decompress(auction_content_blob).decode('utf-8')
        match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', auction_content, re.DOTALL)
        if match:
            auction_data = json.loads(match.group(1))
            auction_obj = auction_data.get('props', {}).get('pageProps', {}).get('auction', {})
            if auction_obj.get('id') == lot_auction_uuid:
                print(f"\n✓ Found matching auction!")
                print(f" Auction displayId: {auction_obj.get('displayId')}")
                print(f" Auction UUID: {auction_obj.get('id')}")
                print(f" Auction URL: {auction_url}")
                found_match = True
                break

    if not found_match:
        print(f"\n✗ Could not find auction with UUID {lot_auction_uuid} in first 10 cached auctions")

conn.close()
@@ -1,36 +0,0 @@
#!/usr/bin/env python3
"""Check viewing time data"""
import sqlite3

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

# Check if viewing_time has data
cursor = conn.execute("""
    SELECT viewing_time, pickup_date
    FROM lots
    WHERE viewing_time IS NOT NULL AND viewing_time != ''
    LIMIT 5
""")

rows = cursor.fetchall()
print("Existing viewing_time data:")
for r in rows:
    print(f" Viewing: {r[0]}")
    print(f" Pickup: {r[1]}")
    print()

# Check overall completeness
cursor = conn.execute("""
    SELECT
        COUNT(*) as total,
        SUM(CASE WHEN viewing_time IS NOT NULL AND viewing_time != '' THEN 1 ELSE 0 END) as has_viewing,
        SUM(CASE WHEN pickup_date IS NOT NULL AND pickup_date != '' THEN 1 ELSE 0 END) as has_pickup
    FROM lots
""")
row = cursor.fetchone()
print(f"Completeness:")
print(f" Total lots: {row[0]}")
print(f" Has viewing_time: {row[1]} ({100*row[1]/row[0]:.1f}%)")
print(f" Has pickup_date: {row[2]} ({100*row[2]/row[0]:.1f}%)")

conn.close()
@@ -1,35 +0,0 @@
#!/usr/bin/env python3
"""Check if viewing time is in the GraphQL response"""
import asyncio
import json
from playwright.async_api import async_playwright


async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        responses = []

        async def capture_response(response):
            if 'graphql' in response.url and 'LotBiddingData' in await response.text():
                try:
                    body = await response.json()
                    responses.append(body)
                except:
                    pass

        page.on('response', capture_response)

        await page.goto("https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5", wait_until='networkidle')
        await asyncio.sleep(2)

        if responses:
            print("Full LotBiddingData Response:")
            print("="*60)
            print(json.dumps(responses[0], indent=2))

        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
@@ -1,69 +0,0 @@
#!/usr/bin/env python3
"""Debug lot data structure from cached page"""
import sqlite3
import zlib
import json
import re
import sys
sys.path.insert(0, 'src')

from parse import DataParser

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

# Get a recent lot page
cursor = conn.execute("""
    SELECT url, content
    FROM cache
    WHERE url LIKE '%/l/%'
    ORDER BY timestamp DESC
    LIMIT 1
""")

row = cursor.fetchone()
if not row:
    print("No lot pages found")
    exit(1)

url, content_blob = row
content = zlib.decompress(content_blob).decode('utf-8')

parser = DataParser()
result = parser.parse_page(content, url)

if result:
    print(f"URL: {url}")
    print(f"\nParsed Data:")
    print(f" type: {result.get('type')}")
    print(f" lot_id: {result.get('lot_id')}")
    print(f" title: {result.get('title', '')[:50]}...")
    print(f" current_bid: {result.get('current_bid')}")
    print(f" bid_count: {result.get('bid_count')}")
    print(f" closing_time: {result.get('closing_time')}")
    print(f" location: {result.get('location')}")

# Also dump the raw JSON
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
if match:
    data = json.loads(match.group(1))
    page_props = data.get('props', {}).get('pageProps', {})

    if 'lot' in page_props:
        lot = page_props['lot']
        print(f"\nRAW __NEXT_DATA__.lot keys: {list(lot.keys())}")
        print(f"\nSearching for bid/timing fields...")

        # Deep search for these fields
        def deep_search(obj, prefix=""):
            if isinstance(obj, dict):
                for k, v in obj.items():
                    if any(term in k.lower() for term in ['bid', 'end', 'close', 'date', 'time']):
                        print(f" {prefix}{k}: {v}")
                    if isinstance(v, (dict, list)):
                        deep_search(v, prefix + k + ".")
            elif isinstance(obj, list) and len(obj) > 0:
                deep_search(obj[0], prefix + "[0].")

        deep_search(lot)

conn.close()
@@ -1,65 +0,0 @@
#!/usr/bin/env python3
"""Deep inspect lot JSON for viewing/pickup data"""
import sqlite3
import zlib
import json
import re

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

cursor = conn.execute("""
    SELECT url, content
    FROM cache
    WHERE url LIKE '%/l/%'
    ORDER BY timestamp DESC
    LIMIT 1
""")

row = cursor.fetchone()
url, content_blob = row
content = zlib.decompress(content_blob).decode('utf-8')

match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
data = json.loads(match.group(1))
lot = data.get('props', {}).get('pageProps', {}).get('lot', {})

print(f"Inspecting: {url}\n")

# Check onboarding
if 'onboarding' in lot:
    print("ONBOARDING:")
    print(json.dumps(lot['onboarding'], indent=2))
    print()

# Check attributes
if 'attributes' in lot:
    print("ATTRIBUTES:")
    attrs = lot['attributes']
    print(json.dumps(attrs[:3] if isinstance(attrs, list) else attrs, indent=2))
    print()

# Check condition
if 'condition' in lot:
    print("CONDITION:")
    print(json.dumps(lot['condition'], indent=2))
    print()

# Check appearance
if 'appearance' in lot:
    print("APPEARANCE:")
    print(json.dumps(lot['appearance'], indent=2))
    print()

# Check location
if 'location' in lot:
    print("LOCATION:")
    print(json.dumps(lot['location'], indent=2))
    print()

# Check for any field with "view", "pick", "collect", "date", "time"
print("\nFIELDS WITH VIEWING/PICKUP/TIME:")
for key in lot.keys():
    if any(term in key.lower() for term in ['view', 'pick', 'collect', 'date', 'time', 'day']):
        print(f" {key}: {lot[key]}")

conn.close()
@@ -1,120 +0,0 @@
"""
Enrich existing lots with new intelligence fields:
- followers_count
- estimated_min_price / estimated_max_price
- lot_condition
- appearance

Reads from cached lot pages __NEXT_DATA__ JSON
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

import asyncio
from cache import CacheManager
import sqlite3
import zlib
import json
import re
from graphql_client import fetch_lot_bidding_data, format_bid_data


async def enrich_existing_lots():
    """Enrich existing lots with new fields from GraphQL API"""
    cache = CacheManager()
    conn = sqlite3.connect(cache.db_path)
    cursor = conn.cursor()

    # Get all lot IDs
    cursor.execute("SELECT lot_id FROM lots")
    lot_ids = [r[0] for r in cursor.fetchall()]

    print(f"Found {len(lot_ids)} lots to enrich")
    print("Fetching enrichment data from GraphQL API...")
    print("This will take ~{:.1f} minutes (0.5s rate limit)".format(len(lot_ids) * 0.5 / 60))

    enriched = 0
    failed = 0
    no_data = 0

    for i, lot_id in enumerate(lot_ids):
        if (i + 1) % 10 == 0:
            print(f"Progress: {i+1}/{len(lot_ids)} ({enriched} enriched, {no_data} no data, {failed} failed)", end='\r')

        try:
            # Fetch from GraphQL API
            bidding_data = await fetch_lot_bidding_data(lot_id)

            if bidding_data:
                formatted_data = format_bid_data(bidding_data)

                # Update lot with new fields
                cursor.execute("""
                    UPDATE lots
                    SET followers_count = ?,
                        estimated_min_price = ?,
                        estimated_max_price = ?,
                        lot_condition = ?,
                        appearance = ?
                    WHERE lot_id = ?
                """, (
                    formatted_data.get('followers_count', 0),
                    formatted_data.get('estimated_min_price'),
                    formatted_data.get('estimated_max_price'),
                    formatted_data.get('lot_condition', ''),
                    formatted_data.get('appearance', ''),
                    lot_id
                ))

                enriched += 1

                # Commit every 50 lots
                if enriched % 50 == 0:
                    conn.commit()

            else:
                no_data += 1

            # Rate limit
            await asyncio.sleep(0.5)

        except Exception as e:
            failed += 1
            continue

    conn.commit()

    print(f"\n\nComplete!")
    print(f"Total lots: {len(lot_ids)}")
    print(f"Enriched: {enriched}")
    print(f"No data: {no_data}")
    print(f"Failed: {failed}")

    # Show statistics
    cursor.execute("SELECT COUNT(*) FROM lots WHERE followers_count > 0")
    with_followers = cursor.fetchone()[0]

    cursor.execute("SELECT COUNT(*) FROM lots WHERE estimated_min_price IS NOT NULL")
    with_estimates = cursor.fetchone()[0]

    cursor.execute("SELECT COUNT(*) FROM lots WHERE lot_condition IS NOT NULL AND lot_condition != ''")
    with_condition = cursor.fetchone()[0]

    print(f"\nEnrichment statistics:")
    print(f" Lots with followers_count: {with_followers} ({with_followers/len(lot_ids)*100:.1f}%)")
    print(f" Lots with estimated prices: {with_estimates} ({with_estimates/len(lot_ids)*100:.1f}%)")
    print(f" Lots with condition: {with_condition} ({with_condition/len(lot_ids)*100:.1f}%)")

    conn.close()


if __name__ == "__main__":
    print("WARNING: This will make ~16,800 API calls at 0.5s intervals (~2.3 hours)")
    print("Press Ctrl+C to cancel, or wait 5 seconds to continue...")
    import time
    try:
        time.sleep(5)
    except KeyboardInterrupt:
        print("\nCancelled")
        sys.exit(0)

    asyncio.run(enrich_existing_lots())
@@ -1,370 +0,0 @@
"""
Explore API responses to identify additional fields available for intelligence.
Tests GraphQL and REST API responses for field coverage.
"""
import asyncio
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

import json
import aiohttp
from graphql_client import fetch_lot_bidding_data, GRAPHQL_ENDPOINT
from bid_history_client import fetch_bid_history, BID_HISTORY_ENDPOINT


async def explore_graphql_schema():
    """Query GraphQL schema to see all available fields"""
    print("=" * 80)
    print("GRAPHQL SCHEMA EXPLORATION")
    print("=" * 80)

    # Introspection query for LotDetails type
    introspection_query = """
    query IntrospectionQuery {
      __type(name: "LotDetails") {
        name
        fields {
          name
          type {
            name
            kind
            ofType {
              name
              kind
            }
          }
        }
      }
    }
    """

    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(
                GRAPHQL_ENDPOINT,
                json={
                    "query": introspection_query,
                    "variables": {}
                },
                headers={"Content-Type": "application/json"}
            ) as response:
                if response.status == 200:
                    data = await response.json()
                    lot_type = data.get('data', {}).get('__type')
                    if lot_type:
                        print("\nLotDetails available fields:")
                        for field in lot_type.get('fields', []):
                            field_name = field['name']
                            field_type = field['type'].get('name') or field['type'].get('ofType', {}).get('name', 'Complex')
                            print(f" - {field_name}: {field_type}")
                        print()
                else:
                    print(f"Failed with status {response.status}")
        except Exception as e:
            print(f"Error: {e}")

    # Also try Lot type
    introspection_query_lot = """
    query IntrospectionQuery {
      __type(name: "Lot") {
        name
        fields {
          name
          type {
            name
            kind
            ofType {
              name
              kind
            }
          }
        }
      }
    }
    """

    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(
                GRAPHQL_ENDPOINT,
                json={
                    "query": introspection_query_lot,
                    "variables": {}
                },
                headers={"Content-Type": "application/json"}
            ) as response:
                if response.status == 200:
                    data = await response.json()
                    lot_type = data.get('data', {}).get('__type')
                    if lot_type:
                        print("\nLot type available fields:")
                        for field in lot_type.get('fields', []):
                            field_name = field['name']
                            field_type = field['type'].get('name') or field['type'].get('ofType', {}).get('name', 'Complex')
                            print(f" - {field_name}: {field_type}")
                        print()
        except Exception as e:
            print(f"Error: {e}")


async def test_graphql_full_query():
    """Test a comprehensive GraphQL query to see all returned data"""
    print("=" * 80)
    print("GRAPHQL FULL QUERY TEST")
    print("=" * 80)

    # Test with a real lot ID
    lot_id = "A1-34731-107"  # Example from database

    comprehensive_query = """
    query ComprehensiveLotQuery($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
      lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
        lot {
          id
          displayId
          title
          description
          currentBidAmount { cents currency }
          initialAmount { cents currency }
          nextMinimalBid { cents currency }
          bidsCount
          startDate
          endDate
          minimumBidAmountMet
          lotNumber
          auctionId
          lotState
          location {
            city
            countryCode
          }
          viewingDays {
            city
            countryCode
            addressLine1
            addressLine2
            endDate
            startDate
          }
          collectionDays {
            city
            countryCode
            addressLine1
            addressLine2
            endDate
            startDate
          }
          images {
            url
            thumbnailUrl
          }
          attributes {
            name
            value
          }
        }
      }
    }
    """

    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(
                GRAPHQL_ENDPOINT,
                json={
                    "query": comprehensive_query,
                    "variables": {
                        "lotDisplayId": lot_id,
                        "locale": "nl_NL",
                        "platform": "WEB"
                    }
                },
                headers={"Content-Type": "application/json"}
            ) as response:
                if response.status == 200:
                    data = await response.json()
                    print(f"\nFull GraphQL response for {lot_id}:")
                    print(json.dumps(data, indent=2))
                    print()
                else:
                    print(f"Failed with status {response.status}")
                    print(await response.text())
        except Exception as e:
            print(f"Error: {e}")


async def test_bid_history_response():
    """Test bid history API to see all returned fields"""
    print("=" * 80)
    print("BID HISTORY API TEST")
    print("=" * 80)

    # Get a lot with bids from database
    import sqlite3
    from cache import CacheManager

    cache = CacheManager()
    conn = sqlite3.connect(cache.db_path)
    cursor = conn.cursor()

    # Find a lot with bids
    cursor.execute("""
        SELECT lot_id, url FROM lots
        WHERE bid_count > 0
        ORDER BY bid_count DESC
        LIMIT 1
    """)
    result = cursor.fetchone()

    if result:
        lot_id, url = result
        # Extract UUID from URL
        import re
        match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>', url)
        # We need to get UUID from cached page
        cursor.execute("SELECT content FROM cache WHERE url = ?", (url,))
        page_result = cursor.fetchone()

        if page_result:
            import zlib
            content = zlib.decompress(page_result[0]).decode('utf-8')
            match = re.search(r'"lot":\s*\{[^}]*"id":\s*"([^"]+)"', content)
            if match:
                lot_uuid = match.group(1)
                print(f"\nTesting with lot {lot_id} (UUID: {lot_uuid})")

                # Fetch bid history
                bid_history = await fetch_bid_history(lot_uuid)
                if bid_history:
                    print(f"\nBid history sample (first 3 records):")
                    for i, bid in enumerate(bid_history[:3]):
                        print(f"\nBid {i+1}:")
                        print(json.dumps(bid, indent=2))

                    print(f"\n\nAll available fields in bid records:")
                    if bid_history:
                        all_keys = set()
                        for bid in bid_history:
                            all_keys.update(bid.keys())
                        for key in sorted(all_keys):
                            print(f" - {key}")
                else:
                    print("No bid history found")

    conn.close()


async def check_auction_api():
    """Check if there's an auction details API"""
    print("=" * 80)
    print("AUCTION API EXPLORATION")
    print("=" * 80)

    auction_query = """
    query AuctionDetails($auctionId: String!, $locale: String!, $platform: Platform!) {
      auctionDetails(auctionId: $auctionId, locale: $locale, platform: $platform) {
        auction {
          id
          title
          description
          startDate
          endDate
          firstLotEndDate
          location {
            city
            countryCode
          }
          viewingDays {
            city
            countryCode
            startDate
            endDate
            addressLine1
            addressLine2
          }
          collectionDays {
            city
            countryCode
            startDate
            endDate
            addressLine1
            addressLine2
          }
        }
      }
    }
    """

    # Get an auction ID from database
    import sqlite3
    from cache import CacheManager

    cache = CacheManager()
    conn = sqlite3.connect(cache.db_path)
    cursor = conn.cursor()

    # Get auction ID from a lot
    cursor.execute("SELECT DISTINCT auction_id FROM lots WHERE auction_id IS NOT NULL LIMIT 1")
    result = cursor.fetchone()

    if result:
        auction_id = result[0]
        print(f"\nTesting with auction {auction_id}")

        async with aiohttp.ClientSession() as session:
            try:
                async with session.post(
                    GRAPHQL_ENDPOINT,
                    json={
                        "query": auction_query,
                        "variables": {
                            "auctionId": auction_id,
                            "locale": "nl_NL",
                            "platform": "WEB"
                        }
                    },
                    headers={"Content-Type": "application/json"}
                ) as response:
                    if response.status == 200:
                        data = await response.json()
                        print("\nAuction API response:")
                        print(json.dumps(data, indent=2))
                    else:
                        print(f"Failed with status {response.status}")
                        print(await response.text())
            except Exception as e:
                print(f"Error: {e}")

    conn.close()


async def main():
    """Run all API explorations"""
    await explore_graphql_schema()
    await test_graphql_full_query()
    await test_bid_history_response()
    await check_auction_api()

    print("\n" + "=" * 80)
    print("SUMMARY: AVAILABLE DATA FIELDS")
    print("=" * 80)
    print("""
CURRENTLY CAPTURED:
- Lot bidding data: current_bid, starting_bid, minimum_bid, bid_count, closing_time
- Lot attributes: brand, model, manufacturer, year, condition, serial_number
- Bid history: bid_amount, bid_time, bidder_id, is_autobid
- Bid intelligence: first_bid_time, last_bid_time, bid_velocity, bid_increment
- Images: URLs and local paths

POTENTIALLY AVAILABLE (TO CHECK):
- Viewing/collection times with full address and date ranges
- Lot location details (city, country)
- Lot state/status
- Image thumbnails
- More detailed attributes

NOT AVAILABLE:
- Watch count (not exposed in API)
- Reserve price (not exposed in API)
- Estimated min/max value (not exposed in API)
- Bidder identities (anonymized)
""")


if __name__ == "__main__":
    asyncio.run(main())
@@ -1,93 +0,0 @@
#!/usr/bin/env python3
"""Explore the actual auction schema"""
import asyncio
import aiohttp
import json

GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"

# Try different field structures
QUERIES = {
    "viewingDays_simple": """
    query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
      auction(id: $auctionId, locale: $locale, platform: $platform) {
        viewingDays {
          city
          countryCode
        }
      }
    }
    """,
    "viewingDays_with_times": """
    query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
      auction(id: $auctionId, locale: $locale, platform: $platform) {
        viewingDays {
          from
          to
          city
        }
      }
    }
    """,
    "full_auction": """
    query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
      auction(id: $auctionId, locale: $locale, platform: $platform) {
        id
        displayId
        biddingStatus
        buyersPremium
        viewingDays {
          city
          countryCode
          from
          to
        }
        collectionDays {
          city
          countryCode
          from
          to
        }
      }
    }
    """
}


async def test_query(name, query, auction_id):
    variables = {
        "auctionId": auction_id,
        "locale": "nl",
        "platform": "TWK"
    }

    payload = {
        "query": query,
        "variables": variables
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
            data = await response.json()

            print(f"\n{'='*60}")
            print(f"QUERY: {name}")
            print(f"{'='*60}")

            if 'errors' in data:
                print("ERRORS:")
                for error in data['errors']:
                    print(f" {error}")
            else:
                print("SUCCESS:")
                print(json.dumps(data, indent=2))


async def main():
    # Test with the auction we know exists
    auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa"

    for name, query in QUERIES.items():
        await test_query(name, query, auction_id)
        await asyncio.sleep(0.5)


if __name__ == "__main__":
    asyncio.run(main())
@@ -1,141 +0,0 @@
#!/usr/bin/env python3
"""
Export only NEW auctions/lots that haven't been sent to server yet
Prevents UNIQUE constraint errors on server import
"""

import sqlite3
import json
import csv
from datetime import datetime
from pathlib import Path

DB_PATH = "C:/mnt/okcomputer/output/cache.db"
OUTPUT_DIR = Path("C:/mnt/okcomputer/output")
SYNC_STATE_FILE = OUTPUT_DIR / ".server_sync_state"


def get_last_export_timestamp():
    """Get timestamp of last successful export to server"""
    if SYNC_STATE_FILE.exists():
        return int(SYNC_STATE_FILE.read_text().strip())
    return 0


def save_export_timestamp(timestamp: int):
    """Save timestamp of successful export"""
    SYNC_STATE_FILE.write_text(str(timestamp))


def export_new_data():
    """Export only records that are NEW since last server import"""
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()

    last_export = get_last_export_timestamp()
    current_time = int(datetime.now().timestamp())

    print("="*60)
    print("INCREMENTAL EXPORT FOR SERVER")
    print("="*60)
    print(f"Last export: {datetime.fromtimestamp(last_export).strftime('%Y-%m-%d %H:%M:%S') if last_export else 'Never (will export ALL)'}")
    print(f"Current time: {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")
    print()

    # Get new auctions (discovered_at > last_export)
    if last_export == 0:
        # First run: export all
        cursor.execute("SELECT * FROM auctions ORDER BY auction_id")
    else:
        # Subsequent runs: only new ones
        cursor.execute("""
            SELECT * FROM auctions
            WHERE discovered_at > ?
            ORDER BY auction_id
        """, (last_export,))

    new_auctions = [dict(row) for row in cursor.fetchall()]

    # Get new lots (scraped_at_timestamp > last_export)
    if last_export == 0:
        cursor.execute("SELECT * FROM lots ORDER BY lot_id")
    else:
        cursor.execute("""
            SELECT * FROM lots
            WHERE scraped_at_timestamp > ?
            ORDER BY lot_id
        """, (last_export,))

    new_lots = [dict(row) for row in cursor.fetchall()]

    conn.close()

    # Export to server-ready files
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    files_created = []

    # Export auctions
    if new_auctions:
        auctions_csv = OUTPUT_DIR / f'auctions_{timestamp}.csv'
        auctions_json = OUTPUT_DIR / f'auctions_{timestamp}.json'

        with open(auctions_csv, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=new_auctions[0].keys())
            writer.writeheader()
            writer.writerows(new_auctions)

        with open(auctions_json, 'w', encoding='utf-8') as f:
            json.dump(new_auctions, f, indent=2, ensure_ascii=False)

        files_created.extend([auctions_csv, auctions_json])
        print(f"✓ Exported {len(new_auctions)} auctions")
        print(f" CSV: {auctions_csv}")
        print(f" JSON: {auctions_json}")
    else:
        print("✓ No new auctions to export")

    # Export lots
    if new_lots:
        lots_csv = OUTPUT_DIR / f'lots_{timestamp}.csv'
        lots_json = OUTPUT_DIR / f'lots_{timestamp}.json'

        with open(lots_csv, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=new_lots[0].keys())
            writer.writeheader()
            writer.writerows(new_lots)

        with open(lots_json, 'w', encoding='utf-8') as f:
            json.dump(new_lots, f, indent=2, ensure_ascii=False)

        files_created.extend([lots_csv, lots_json])
        print(f"✓ Exported {len(new_lots)} lots")
        print(f" CSV: {lots_csv}")
        print(f" JSON: {lots_json}")
    else:
        print("✓ No new lots to export")

    # Save sync state
    if new_auctions or new_lots:
        save_export_timestamp(current_time)
        print()
        print("="*60)
        print("EXPORT COMPLETE")
        print("="*60)
        print(f"New auctions: {len(new_auctions)}")
        print(f"New lots: {len(new_lots)}")
        print()
        print("Next export will only include records newer than:")
        print(f" {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")
    else:
        print()
        print("="*60)
        print("NOTHING TO EXPORT")
        print("="*60)
        print("All data already exported to server")

    return {
        'auctions': len(new_auctions),
        'lots': len(new_lots),
        'files': [str(f) for f in files_created]
    }


if __name__ == "__main__":
    export_new_data()
@@ -1,53 +0,0 @@
#!/usr/bin/env python3
"""Extract the GraphQL query being used"""
import asyncio
import json
from playwright.async_api import async_playwright


async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        graphql_requests = []

        async def capture_request(request):
            if 'graphql' in request.url:
                graphql_requests.append({
                    'url': request.url,
                    'method': request.method,
                    'post_data': request.post_data,
                    'headers': dict(request.headers)
                })

        page.on('request', capture_request)

        await page.goto("https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5", wait_until='networkidle')
        await asyncio.sleep(2)

        print(f"Captured {len(graphql_requests)} GraphQL requests\n")

        for i, req in enumerate(graphql_requests):
            print(f"{'='*60}")
            print(f"REQUEST #{i+1}")
            print(f"{'='*60}")
            print(f"URL: {req['url']}")
            print(f"Method: {req['method']}")

            if req['post_data']:
                try:
                    data = json.loads(req['post_data'])
                    print(f"\nQuery Name: {data.get('operationName', 'N/A')}")
                    print(f"\nVariables:")
                    print(json.dumps(data.get('variables', {}), indent=2))
                    print(f"\nQuery:")
                    print(data.get('query', '')[:1000])
                except:
                    print(f"\nPOST Data: {req['post_data'][:500]}")

            print()

        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
@@ -1,45 +0,0 @@
#!/usr/bin/env python3
"""Find viewing/pickup in actual HTML"""
import asyncio
from playwright.async_api import async_playwright
import re


async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        # Try a lot that should have viewing times
        await page.goto("https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102", wait_until='networkidle')

        # Get text content
        text_content = await page.evaluate("document.body.innerText")

        print("Searching for viewing/pickup patterns...\n")

        # Look for "Bezichtigingen" section
        lines = text_content.split('\n')
        for i, line in enumerate(lines):
            if 'bezichtig' in line.lower() or 'viewing' in line.lower():
                # Print surrounding context
                context = lines[max(0, i-1):min(len(lines), i+5)]
                print("FOUND Bezichtigingen:")
                for c in context:
                    print(f" {c}")
                print()
                break

        # Look for "Ophalen" section
        for i, line in enumerate(lines):
            if 'ophalen' in line.lower() or 'collection' in line.lower() or 'pickup' in line.lower():
                context = lines[max(0, i-1):min(len(lines), i+5)]
                print("FOUND Ophalen:")
                for c in context:
                    print(f" {c}")
                print()
                break

        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
@@ -1,166 +0,0 @@
"""
Fetch bid history for existing lots that have bids but no bid history records.
Reads cached lot pages to get lot UUIDs, then calls bid history API.
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

import asyncio
from cache import CacheManager
import sqlite3
import zlib
import json
import re
from bid_history_client import fetch_bid_history, parse_bid_history


async def fetch_missing_bid_history():
    """Fetch bid history for lots that have bids but no history records"""
    cache = CacheManager()
    conn = sqlite3.connect(cache.db_path)
    cursor = conn.cursor()

    # Get lots with bids but no bid history
    cursor.execute("""
        SELECT l.lot_id, l.bid_count
        FROM lots l
        WHERE l.bid_count > 0
        AND l.lot_id NOT IN (SELECT DISTINCT lot_id FROM bid_history)
        ORDER BY l.bid_count DESC
    """)

    lots_to_fetch = cursor.fetchall()
    print(f"Found {len(lots_to_fetch)} lots with bids but no bid history")

    if not lots_to_fetch:
        print("No lots to process!")
        conn.close()
        return

    # Build mapping from lot_id to lot UUID from cached pages
    print("Building lot_id -> UUID mapping from cache...")

    cursor.execute("""
        SELECT url, content
        FROM cache
        WHERE url LIKE '%/l/%'
    """)

    lot_id_to_uuid = {}
    total_cached = 0

    for url, content_blob in cursor:
        total_cached += 1

        if total_cached % 100 == 0:
            print(f"Processed {total_cached} cached pages...", end='\r')

        try:
            content = zlib.decompress(content_blob).decode('utf-8')
            match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)

            if not match:
                continue

            data = json.loads(match.group(1))
            lot = data.get('props', {}).get('pageProps', {}).get('lot', {})

            if not lot:
                continue

            lot_display_id = lot.get('displayId')
            lot_uuid = lot.get('id')

            if lot_display_id and lot_uuid:
                lot_id_to_uuid[lot_display_id] = lot_uuid

        except:
            continue

    print(f"\n\nBuilt UUID mapping for {len(lot_id_to_uuid)} lots")

    # Fetch bid history for each lot
    print("\nFetching bid history from API...")

    fetched = 0
    failed = 0
    no_uuid = 0

    for lot_id, bid_count in lots_to_fetch:
        lot_uuid = lot_id_to_uuid.get(lot_id)

        if not lot_uuid:
            no_uuid += 1
            continue

        try:
            print(f"\nFetching bid history for {lot_id} ({bid_count} bids)...")
            bid_history = await fetch_bid_history(lot_uuid)

            if bid_history:
                bid_data = parse_bid_history(bid_history, lot_id)

                # Update lots table with bid intelligence
                cursor.execute("""
                    UPDATE lots
                    SET first_bid_time = ?,
                        last_bid_time = ?,
                        bid_velocity = ?
                    WHERE lot_id = ?
                """, (
                    bid_data['first_bid_time'],
                    bid_data['last_bid_time'],
                    bid_data['bid_velocity'],
                    lot_id
                ))

                # Save bid history records
                cache.save_bid_history(lot_id, bid_data['bid_records'])

                fetched += 1
                print(f" Saved {len(bid_data['bid_records'])} bid records")
                print(f" Bid velocity: {bid_data['bid_velocity']:.2f} bids/hour")

                # Commit every 10 lots
                if fetched % 10 == 0:
                    conn.commit()
                    print(f"\nProgress: {fetched}/{len(lots_to_fetch)} lots processed...")

                # Rate limit to be respectful
                await asyncio.sleep(0.5)

            else:
                failed += 1

        except Exception as e:
            print(f" Error fetching bid history for {lot_id}: {e}")
            failed += 1
            continue

    conn.commit()

    print(f"\n\nComplete!")
    print(f"Total lots to process: {len(lots_to_fetch)}")
    print(f"Successfully fetched: {fetched}")
    print(f"Failed: {failed}")
    print(f"No UUID found: {no_uuid}")

    # Verify fix
    cursor.execute("""
        SELECT COUNT(DISTINCT lot_id) FROM bid_history
    """)
    lots_with_history = cursor.fetchone()[0]

    cursor.execute("""
        SELECT COUNT(*) FROM lots WHERE bid_count > 0
    """)
    lots_with_bids = cursor.fetchone()[0]

    print(f"\nLots with bids: {lots_with_bids}")
    print(f"Lots with bid history: {lots_with_history}")
    print(f"Coverage: {lots_with_history/lots_with_bids*100:.1f}%")

    conn.close()


if __name__ == "__main__":
    asyncio.run(fetch_missing_bid_history())
@@ -1,64 +0,0 @@
#!/usr/bin/env python3
"""Find the API endpoint by monitoring network requests"""
import asyncio
import json
from playwright.async_api import async_playwright


async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        requests = []
        responses = []

        async def log_request(request):
            if any(term in request.url for term in ['api', 'graphql', 'lot', 'auction', 'bid']):
                requests.append({
                    'url': request.url,
                    'method': request.method,
                    'headers': dict(request.headers),
                    'post_data': request.post_data
                })

        async def log_response(response):
            if any(term in response.url for term in ['api', 'graphql', 'lot', 'auction', 'bid']):
                try:
                    body = await response.text()
                    responses.append({
                        'url': response.url,
                        'status': response.status,
                        'body': body[:1000]
                    })
                except:
                    pass

        page.on('request', log_request)
        page.on('response', log_response)

        print("Loading lot page...")
        await page.goto("https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102", wait_until='networkidle')

        # Wait for dynamic content
        await asyncio.sleep(3)

        print(f"\nFound {len(requests)} relevant requests")
        print(f"Found {len(responses)} relevant responses\n")

        for req in requests[:10]:
            print(f"REQUEST: {req['method']} {req['url']}")
            if req['post_data']:
                print(f" POST DATA: {req['post_data'][:200]}")

        print("\n" + "="*60 + "\n")

        for resp in responses[:10]:
            print(f"RESPONSE: {resp['url']}")
            print(f" Status: {resp['status']}")
            print(f" Body: {resp['body'][:300]}")
            print()

        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
@@ -1,70 +0,0 @@
#!/usr/bin/env python3
"""Find API endpoint using a valid lot from database"""
import asyncio
import sqlite3
from playwright.async_api import async_playwright

# Get a valid lot URL
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
cursor = conn.execute("SELECT url FROM lots WHERE url LIKE '%/l/%' LIMIT 5")
lot_urls = [row[0] for row in cursor.fetchall()]
conn.close()


async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        api_calls = []

        async def log_response(response):
            url = response.url
            # Look for API calls
            if ('api' in url.lower() or 'graphql' in url.lower() or
                    '/v2/' in url or '/v3/' in url or '/v4/' in url or
                    'query' in url.lower() or 'mutation' in url.lower()):
                try:
                    body = await response.text()
                    api_calls.append({
                        'url': url,
                        'status': response.status,
                        'body': body
                    })
                    print(f"\nAPI: {url}")
                except:
                    pass

        page.on('response', log_response)

        for lot_url in lot_urls[:2]:
            print(f"\n{'='*60}")
            print(f"Loading: {lot_url}")
            print(f"{'='*60}")

            try:
                await page.goto(lot_url, wait_until='networkidle', timeout=30000)
                await asyncio.sleep(2)

                # Check if page has bid info
                content = await page.content()
                if 'currentBid' in content or 'Current bid' in content or 'Huidig bod' in content:
                    print("[+] Page contains bid information")
                    break
            except Exception as e:
                print(f"[!] Error: {e}")
                continue

        print(f"\n\n{'='*60}")
        print(f"CAPTURED {len(api_calls)} API CALLS")
        print(f"{'='*60}")

        for call in api_calls:
            print(f"\n{call['url']}")
            print(f"Status: {call['status']}")
            if 'json' in call['body'][:100].lower() or call['body'].startswith('{'):
                print(f"Body (first 500 chars): {call['body'][:500]}")

        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
@@ -1,48 +0,0 @@
#!/usr/bin/env python3
"""Find an auction page with lots data"""
import sqlite3
import zlib
import json
import re

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

cursor = conn.execute("""
    SELECT url, content
    FROM cache
    WHERE url LIKE '%/a/%'
""")

for row in cursor:
    url, content_blob = row
    content = zlib.decompress(content_blob).decode('utf-8')

    match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
    if not match:
        continue

    data = json.loads(match.group(1))
    page_props = data.get('props', {}).get('pageProps', {})

    if 'auction' in page_props:
        auction = page_props['auction']
        lots = auction.get('lots', [])

        if lots and len(lots) > 0:
            print(f"Found auction with {len(lots)} lots: {url}\n")

            lot = lots[0]
            print(f"SAMPLE LOT FROM AUCTION.LOTS[]:")
            print(f" displayId: {lot.get('displayId')}")
            print(f" title: {lot.get('title', '')[:50]}...")
            print(f" urlSlug: {lot.get('urlSlug')}")
            print(f"\nBIDDING FIELDS:")
            for key in ['currentBid', 'highestBid', 'startingBid', 'minimumBidAmount', 'bidCount', 'numberOfBids']:
                print(f" {key}: {lot.get(key)}")
            print(f"\nTIMING FIELDS:")
            for key in ['endDate', 'startDate', 'closingTime']:
                print(f" {key}: {lot.get(key)}")
            print(f"\nALL KEYS: {list(lot.keys())[:30]}...")
            break

conn.close()
@@ -1,155 +0,0 @@
"""
Fix auctions table by replacing with correct data from cached auction pages.
The auctions table currently has wrong auction_ids (numeric instead of displayId).
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from cache import CacheManager
import sqlite3
import zlib
import json
import re
from datetime import datetime


def fix_auctions_table():
    """Rebuild auctions table from cached auction pages"""
    cache = CacheManager()
    conn = sqlite3.connect(cache.db_path)
    cursor = conn.cursor()

    # Clear existing auctions table
    print("Clearing auctions table...")
    cursor.execute("DELETE FROM auctions")
    conn.commit()

    # Get all auction pages from cache
    cursor.execute("""
        SELECT url, content
        FROM cache
        WHERE url LIKE '%/a/%'
    """)

    auction_pages = cursor.fetchall()
    print(f"Found {len(auction_pages)} auction pages in cache")

    total = 0
    inserted = 0
    errors = 0

    print("Extracting auction data from cached pages...")

    for url, content_blob in auction_pages:
        total += 1

        if total % 10 == 0:
            print(f"Processed {total}/{len(auction_pages)}...", end='\r')

        try:
            # Decompress and parse __NEXT_DATA__
            content = zlib.decompress(content_blob).decode('utf-8')
            match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)

            if not match:
                errors += 1
                continue

            data = json.loads(match.group(1))
            page_props = data.get('props', {}).get('pageProps', {})
            auction = page_props.get('auction', {})

            if not auction:
                errors += 1
                continue

            # Extract auction data
            auction_id = auction.get('displayId')
            if not auction_id:
                errors += 1
                continue

            title = auction.get('name', '')

            # Get location
            location = ''
            viewing_days = auction.get('viewingDays', [])
            if viewing_days and isinstance(viewing_days, list) and len(viewing_days) > 0:
                loc = viewing_days[0]
                city = loc.get('city', '')
                country = loc.get('countryCode', '').upper()
                location = f"{city}, {country}" if city and country else (city or country)

            lots_count = auction.get('lotCount', 0)

            # Get first lot closing time
            first_lot_closing = ''
            min_end_date = auction.get('minEndDate', '')
            if min_end_date:
                # Format timestamp
                try:
                    dt = datetime.fromisoformat(min_end_date.replace('Z', '+00:00'))
                    first_lot_closing = dt.strftime('%Y-%m-%d %H:%M:%S')
                except:
                    first_lot_closing = min_end_date

            scraped_at = datetime.now().isoformat()

            # Insert into auctions table
            cursor.execute("""
                INSERT OR REPLACE INTO auctions
                (auction_id, url, title, location, lots_count, first_lot_closing_time, scraped_at)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            """, (auction_id, url, title, location, lots_count, first_lot_closing, scraped_at))

            inserted += 1

        except Exception as e:
            errors += 1
            continue

    conn.commit()

    print(f"\n\nComplete!")
    print(f"Total auction pages processed: {total}")
    print(f"Auctions inserted: {inserted}")
    print(f"Errors: {errors}")

    # Verify fix
    cursor.execute("SELECT COUNT(*) FROM auctions")
    total_auctions = cursor.fetchone()[0]
    print(f"\nTotal auctions in table: {total_auctions}")

    cursor.execute("""
        SELECT COUNT(*) FROM lots
        WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
        AND auction_id != ''
    """)
    orphaned = cursor.fetchone()[0]

    print(f"Orphaned lots remaining: {orphaned}")

    if orphaned == 0:
        print("\nSUCCESS! All lots now have matching auctions!")
    else:
        # Show sample of remaining orphans
        cursor.execute("""
            SELECT lot_id, auction_id FROM lots
            WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
            AND auction_id != ''
            LIMIT 5
        """)
        print("\nSample remaining orphaned lots:")
        for lot_id, auction_id in cursor.fetchall():
            print(f" {lot_id} -> auction_id: {auction_id}")

        # Show what auction_ids we do have
        cursor.execute("SELECT auction_id FROM auctions LIMIT 10")
        print("\nSample auction_ids in auctions table:")
        for row in cursor.fetchall():
            print(f" {row[0]}")

    conn.close()


if __name__ == "__main__":
    fix_auctions_table()
@@ -1,136 +0,0 @@
|
||||
"""
|
||||
Fix orphaned lots by updating auction_id from UUID to displayId.
|
||||
This migration reads cached lot pages and extracts the correct auction displayId.
|
||||
"""
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
|
||||
|
||||
from cache import CacheManager
|
||||
import sqlite3
|
||||
import zlib
|
||||
import json
|
||||
import re
|
||||
|
||||
def fix_orphaned_lots():
|
||||
"""Update lot auction_id from UUID to auction displayId"""
|
||||
cache = CacheManager()
|
||||
conn = sqlite3.connect(cache.db_path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Get all lots that need fixing (have UUID auction_id)
|
||||
cursor.execute("""
|
||||
SELECT l.lot_id, l.auction_id
|
||||
FROM lots l
|
||||
WHERE length(l.auction_id) > 20 -- UUID is longer than displayId like "A1-12345"
|
||||
""")
|
||||
|
||||
lots_to_fix = {lot_id: auction_uuid for lot_id, auction_uuid in cursor.fetchall()}
|
||||
print(f"Found {len(lots_to_fix)} lots with UUID auction_id that need fixing")
|
||||
|
||||
if not lots_to_fix:
|
||||
print("No lots to fix!")
|
||||
conn.close()
|
||||
return
|
||||
|
||||
# Build mapping from lot displayId to auction displayId from cached pages
|
||||
print("Building lot displayId -> auction displayId mapping from cache...")
|
||||
|
||||
cursor.execute("""
|
||||
SELECT url, content
|
||||
FROM cache
|
||||
WHERE url LIKE '%/l/%'
|
||||
""")
|
||||
|
||||
lot_to_auction_map = {}
|
||||
total = 0
|
||||
errors = 0
|
||||
|
||||
for url, content_blob in cursor:
|
||||
total += 1
|
||||
|
||||
if total % 100 == 0:
|
||||
print(f"Processing cached pages... {total}", end='\r')
|
||||
|
||||
try:
|
||||
# Decompress and parse __NEXT_DATA__
|
||||
content = zlib.decompress(content_blob).decode('utf-8')
|
||||
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
|
||||
|
||||
if not match:
|
||||
continue
|
||||
|
||||
data = json.loads(match.group(1))
|
||||
page_props = data.get('props', {}).get('pageProps', {})
|
||||
|
||||
lot = page_props.get('lot', {})
|
||||
auction = page_props.get('auction', {})
|
||||
|
||||
if not lot or not auction:
|
||||
continue
|
||||
|
||||
lot_display_id = lot.get('displayId')
|
||||
auction_display_id = auction.get('displayId')
|
||||
|
||||
if lot_display_id and auction_display_id:
|
||||
lot_to_auction_map[lot_display_id] = auction_display_id
|
||||
|
||||
except Exception as e:
|
||||
errors += 1
|
||||
continue
|
||||
|
||||
print(f"\n\nBuilt mapping for {len(lot_to_auction_map)} lots")
|
||||
print(f"Errors while parsing: {errors}")
|
||||
|
||||
# Now update the lots table
|
||||
print("\nUpdating lots table...")
|
||||
updated = 0
|
||||
not_found = 0
|
||||
|
||||
for lot_id, old_auction_uuid in lots_to_fix.items():
|
||||
if lot_id in lot_to_auction_map:
|
||||
new_auction_id = lot_to_auction_map[lot_id]
|
||||
cursor.execute("""
|
||||
UPDATE lots
|
||||
SET auction_id = ?
|
||||
WHERE lot_id = ?
|
||||
""", (new_auction_id, lot_id))
|
||||
updated += 1
|
||||
else:
|
||||
not_found += 1
|
||||
|
||||
if (updated + not_found) % 100 == 0:
|
||||
print(f"Updated: {updated}, not found: {not_found}", end='\r')
|
||||
|
||||
conn.commit()
|
||||
|
||||
print(f"\n\nComplete!")
|
||||
print(f"Total cached pages processed: {total}")
|
||||
print(f"Lots updated with auction displayId: {updated}")
|
||||
print(f"Lots not found in cache: {not_found}")
|
||||
print(f"Parse errors: {errors}")
|
||||
|
||||
# Verify fix
|
||||
cursor.execute("""
|
||||
SELECT COUNT(*) FROM lots
|
||||
WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
|
||||
""")
|
||||
orphaned = cursor.fetchone()[0]
|
||||
|
||||
print(f"\nOrphaned lots remaining: {orphaned}")
|
||||
|
||||
if orphaned > 0:
|
||||
# Show sample of remaining orphans
|
||||
cursor.execute("""
|
||||
SELECT lot_id, auction_id FROM lots
|
||||
WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
|
||||
LIMIT 5
|
||||
""")
|
||||
print("\nSample remaining orphaned lots:")
|
||||
for lot_id, auction_id in cursor.fetchall():
|
||||
print(f" {lot_id} -> auction_id: {auction_id}")
|
||||
|
||||
conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
fix_orphaned_lots()
|
||||
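The `length(auction_id) > 20` filter above is only a heuristic. A stricter check (sketch only; the `A1-37889`-style displayId pattern is inferred from the sample ids in this commit) would classify the two id formats explicitly:

import re

DISPLAY_ID_RE = re.compile(r'^[A-Z]\d+-\d+$')                                   # e.g. "A1-37889"
UUID_RE = re.compile(r'^[0-9a-f]{8}(-[0-9a-f]{4}){3}-[0-9a-f]{12}$', re.I)      # e.g. "9d5d9d6b-..."

def needs_fixing(auction_id: str) -> bool:
    """True when the stored auction_id is a UUID rather than an auction displayId."""
    return bool(UUID_RE.match(auction_id)) and not DISPLAY_ID_RE.match(auction_id)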
@@ -1,69 +0,0 @@
#!/usr/bin/env python3
"""Extract and inspect __NEXT_DATA__ from a cached auction page"""
import sqlite3
import zlib
import json
import re

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

# Get a cached auction page
cursor = conn.execute("""
    SELECT url, content
    FROM cache
    WHERE url LIKE '%/a/%'
    LIMIT 1
""")

row = cursor.fetchone()
if not row:
    print("No cached auction pages found")
    exit(1)

url, content_blob = row
print(f"Inspecting: {url}\n")

# Decompress
content = zlib.decompress(content_blob).decode('utf-8')

# Extract __NEXT_DATA__
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
if not match:
    print("No __NEXT_DATA__ found")
    exit(1)

data = json.loads(match.group(1))
page_props = data.get('props', {}).get('pageProps', {})

if 'auction' in page_props:
    auction = page_props['auction']
    print("AUCTION DATA STRUCTURE:")
    print("=" * 60)
    print(f"displayId: {auction.get('displayId')}")
    print(f"name: {auction.get('name', '')[:50]}...")
    print(f"lots count: {len(auction.get('lots', []))}")

    if auction.get('lots'):
        lot = auction['lots'][0]
        print("\nFIRST LOT STRUCTURE:")
        print(f"  displayId: {lot.get('displayId')}")
        print(f"  title: {lot.get('title', '')[:50]}...")
        print("\n  BIDDING:")
        print(f"    currentBid: {lot.get('currentBid')}")
        print(f"    highestBid: {lot.get('highestBid')}")
        print(f"    startingBid: {lot.get('startingBid')}")
        print(f"    minimumBidAmount: {lot.get('minimumBidAmount')}")
        print(f"    bidCount: {lot.get('bidCount')}")
        print(f"    numberOfBids: {lot.get('numberOfBids')}")
        print("  TIMING:")
        print(f"    endDate: {lot.get('endDate')}")
        print(f"    startDate: {lot.get('startDate')}")
        print(f"    closingTime: {lot.get('closingTime')}")
        print(f"  ALL KEYS: {list(lot.keys())}")

    print("\nAUCTION TIMING:")
    print(f"  minEndDate: {auction.get('minEndDate')}")
    print(f"  maxEndDate: {auction.get('maxEndDate')}")
    print(f"  ALL KEYS: {list(auction.keys())}")

conn.close()
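Several scripts in this commit repeat the same decompress-then-regex dance on cached page blobs; a shared helper (hypothetical name, not something that exists under src/) would make them shorter and keep the regex in one place:

import json
import re
import zlib

NEXT_DATA_RE = re.compile(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', re.DOTALL)

def extract_next_data(content_blob: bytes) -> dict:
    """Decompress a cached page blob and return the parsed __NEXT_DATA__ payload ({} if absent)."""
    html = zlib.decompress(content_blob).decode('utf-8')
    match = NEXT_DATA_RE.search(html)
    if not match:
        return {}
    return json.loads(match.group(1))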
@@ -1,49 +0,0 @@
#!/usr/bin/env python3
"""Inspect a lot page HTML to find viewing_time and pickup_date"""
import asyncio
import re
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        # Use the known lot
        await page.goto("https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102", wait_until='networkidle')
        content = await page.content()

        print("Searching for patterns...")
        print("="*60)

        # Search for viewing time patterns
        patterns = {
            'Bezichtigingen': r'Bezichtigingen.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
            'viewing': r'(?i)viewing.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
            'Ophalen': r'Ophalen.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
            'pickup': r'(?i)pickup.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
            'Status': r'Status\s+([^<]+)',
        }

        for name, pattern in patterns.items():
            matches = re.findall(pattern, content, re.DOTALL | re.MULTILINE)
            if matches:
                print(f"\n{name}:")
                for match in matches[:3]:
                    print(f"  {match[:200]}")

        # Also look for structured data
        print("\n\nSearching for 'Bezichtigingen' (viewing) section:")
        bez_match = re.search(r'Bezichtigingen.*?<.*?>(.*?)</.*?>', content, re.DOTALL)
        if bez_match:
            print(bez_match.group(0)[:500])

        print("\n\nSearching for 'Ophalen' (pickup) section:")
        oph_match = re.search(r'Ophalen.*?<.*?>(.*?)</.*?>', content, re.DOTALL)
        if oph_match:
            print(oph_match.group(0)[:500])

        await browser.close()

if __name__ == "__main__":
    asyncio.run(main())
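For reference, the Dutch page labels being grepped for map onto the target fields roughly like this (an illustrative mapping with English glosses; the field names follow the lots schema used elsewhere in this commit):

# Dutch section label -> database field (illustrative)
DUTCH_LABELS = {
    'Bezichtigingen': 'viewing_time',   # "viewings"
    'Ophalen': 'pickup_date',           # "pick-up"
    'Status': 'status',
}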
@@ -1,45 +0,0 @@
#!/usr/bin/env python3
"""Intercept API calls to find where lot data comes from"""
import asyncio
import json
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        # headless=False so the traffic can be watched in a real browser window
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        # Track API calls
        api_calls = []

        async def handle_response(response):
            if 'api' in response.url.lower() or 'graphql' in response.url.lower():
                try:
                    body = await response.json()
                    api_calls.append({
                        'url': response.url,
                        'status': response.status,
                        'body': body
                    })
                    print(f"\nAPI CALL: {response.url}")
                    print(f"Status: {response.status}")
                    if 'lot' in response.url.lower() or 'auction' in response.url.lower():
                        print(f"Body preview: {json.dumps(body, indent=2)[:500]}")
                except Exception:
                    # Non-JSON responses (HTML, images, etc.) are ignored
                    pass

        page.on('response', handle_response)

        # Visit auction page
        print("Loading auction page...")
        await page.goto("https://www.troostwijkauctions.com/a/woonunits-generatoren-reinigingsmachines-en-zakelijke-goederen-A1-37889", wait_until='networkidle')

        # Wait a bit for lazy loading
        await asyncio.sleep(5)

        print(f"\n\nCaptured {len(api_calls)} API calls")

        await browser.close()

if __name__ == "__main__":
    asyncio.run(main())
@@ -1,148 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Migrate existing lot data to extract missing enriched fields
|
||||
"""
|
||||
import sqlite3
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime
|
||||
import sys
|
||||
sys.path.insert(0, 'src')
|
||||
|
||||
from graphql_client import extract_enriched_attributes, extract_attributes_from_lot_json
|
||||
|
||||
DB_PATH = "/mnt/okcomputer/output/cache.db"
|
||||
|
||||
def migrate_lot_attributes():
|
||||
"""Extract attributes from cached lot pages"""
|
||||
print("="*60)
|
||||
print("MIGRATING EXISTING LOT DATA")
|
||||
print("="*60)
|
||||
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
|
||||
# Get cached lot pages
|
||||
cursor = conn.execute("""
|
||||
SELECT url, content, timestamp
|
||||
FROM cache
|
||||
WHERE url LIKE '%/l/%'
|
||||
ORDER BY timestamp DESC
|
||||
""")
|
||||
|
||||
import zlib
|
||||
updated_count = 0
|
||||
|
||||
for url, content_blob, timestamp in cursor:
|
||||
try:
|
||||
# Get lot_id from URL
|
||||
lot_id_match = re.search(r'/l/.*?([A-Z]\d+-\d+-\d+)', url)
|
||||
if not lot_id_match:
|
||||
lot_id_match = re.search(r'([A-Z]\d+-\d+-\d+)', url)
|
||||
if not lot_id_match:
|
||||
continue
|
||||
|
||||
lot_id = lot_id_match.group(1)
|
||||
|
||||
# Check if lot exists in database
|
||||
lot_cursor = conn.execute("SELECT lot_id, title, description FROM lots WHERE lot_id = ?", (lot_id,))
|
||||
lot_row = lot_cursor.fetchone()
|
||||
if not lot_row:
|
||||
continue
|
||||
|
||||
_, title, description = lot_row
|
||||
|
||||
# Decompress and parse __NEXT_DATA__
|
||||
content = zlib.decompress(content_blob).decode('utf-8')
|
||||
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
|
||||
if not match:
|
||||
continue
|
||||
|
||||
data = json.loads(match.group(1))
|
||||
lot_json = data.get('props', {}).get('pageProps', {}).get('lot', {})
|
||||
if not lot_json:
|
||||
continue
|
||||
|
||||
# Extract basic attributes
|
||||
attrs = extract_attributes_from_lot_json(lot_json)
|
||||
|
||||
# Extract enriched attributes
|
||||
page_data = {'title': title, 'description': description, 'brand': attrs.get('brand', '')}
|
||||
enriched = extract_enriched_attributes(lot_json, page_data)
|
||||
|
||||
# Merge
|
||||
all_attrs = {**attrs, **enriched}
|
||||
|
||||
# Update database
|
||||
conn.execute("""
|
||||
UPDATE lots
|
||||
SET brand = ?,
|
||||
model = ?,
|
||||
attributes_json = ?,
|
||||
year_manufactured = ?,
|
||||
condition_score = ?,
|
||||
condition_description = ?,
|
||||
serial_number = ?,
|
||||
manufacturer = ?,
|
||||
damage_description = ?
|
||||
WHERE lot_id = ?
|
||||
""", (
|
||||
all_attrs.get('brand', ''),
|
||||
all_attrs.get('model', ''),
|
||||
all_attrs.get('attributes_json', ''),
|
||||
all_attrs.get('year_manufactured'),
|
||||
all_attrs.get('condition_score'),
|
||||
all_attrs.get('condition_description', ''),
|
||||
all_attrs.get('serial_number', ''),
|
||||
all_attrs.get('manufacturer', ''),
|
||||
all_attrs.get('damage_description', ''),
|
||||
lot_id
|
||||
))
|
||||
|
||||
updated_count += 1
|
||||
if updated_count % 100 == 0:
|
||||
print(f" Processed {updated_count} lots...")
|
||||
conn.commit()
|
||||
|
||||
except Exception as e:
|
||||
print(f" Error processing {url}: {e}")
|
||||
continue
|
||||
|
||||
conn.commit()
|
||||
print(f"\n✓ Updated {updated_count} lots with enriched attributes")
|
||||
|
||||
# Show stats
|
||||
cursor = conn.execute("""
|
||||
SELECT
|
||||
COUNT(*) as total,
|
||||
SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year,
|
||||
SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition,
|
||||
SUM(CASE WHEN manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer,
|
||||
SUM(CASE WHEN brand != '' THEN 1 ELSE 0 END) as has_brand,
|
||||
SUM(CASE WHEN model != '' THEN 1 ELSE 0 END) as has_model
|
||||
FROM lots
|
||||
""")
|
||||
stats = cursor.fetchone()
|
||||
|
||||
print(f"\nENRICHMENT STATISTICS:")
|
||||
print(f" Total lots: {stats[0]:,}")
|
||||
print(f" Has year: {stats[1]:,} ({100*stats[1]/stats[0]:.1f}%)")
|
||||
print(f" Has condition: {stats[2]:,} ({100*stats[2]/stats[0]:.1f}%)")
|
||||
print(f" Has manufacturer: {stats[3]:,} ({100*stats[3]/stats[0]:.1f}%)")
|
||||
print(f" Has brand: {stats[4]:,} ({100*stats[4]/stats[0]:.1f}%)")
|
||||
print(f" Has model: {stats[5]:,} ({100*stats[5]/stats[0]:.1f}%)")
|
||||
|
||||
conn.close()
|
||||
|
||||
|
||||
def main():
|
||||
print("\nStarting migration of existing data...")
|
||||
print(f"Database: {DB_PATH}\n")
|
||||
|
||||
migrate_lot_attributes()
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print("MIGRATION COMPLETE")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
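The migration above issues one UPDATE per lot and commits every 100 rows; an equivalent single-pass variant would collect parameter tuples while scanning the cache and let sqlite batch the writes (a sketch assuming the `all_attrs` dict and `conn` built above; the column order matches the UPDATE in the script):

updates = []  # collect (brand, model, ..., lot_id) tuples inside the loop instead of executing immediately

conn.executemany("""
    UPDATE lots
    SET brand = ?, model = ?, attributes_json = ?, year_manufactured = ?,
        condition_score = ?, condition_description = ?, serial_number = ?,
        manufacturer = ?, damage_description = ?
    WHERE lot_id = ?
""", updates)
conn.commit()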
@@ -1,51 +0,0 @@
#!/usr/bin/env python3
"""Scrape a fresh auction page to see the lots array structure"""
import asyncio
import json
import re
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        # Get first auction
        await page.goto("https://www.troostwijkauctions.com/auctions", wait_until='networkidle')
        content = await page.content()

        # Find first auction link
        match = re.search(r'href="(/a/[^"]+)"', content)
        if not match:
            print("No auction found")
            return

        auction_url = f"https://www.troostwijkauctions.com{match.group(1)}"
        print(f"Scraping: {auction_url}\n")

        await page.goto(auction_url, wait_until='networkidle')
        content = await page.content()

        # Extract __NEXT_DATA__
        match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
        if not match:
            print("No __NEXT_DATA__ found")
            return

        data = json.loads(match.group(1))
        page_props = data.get('props', {}).get('pageProps', {})

        if 'auction' in page_props:
            auction = page_props['auction']
            print(f"Auction: {auction.get('name', '')[:50]}...")
            print(f"Lots in array: {len(auction.get('lots', []))}")

            if auction.get('lots'):
                lot = auction['lots'][0]
                print("\nFIRST LOT:")
                print(json.dumps(lot, indent=2)[:1500])

        await browser.close()

if __name__ == "__main__":
    asyncio.run(main())
@@ -1,47 +0,0 @@
#!/usr/bin/env python3
"""Search cached pages for viewing/pickup text"""
import sqlite3
import zlib
import re

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

cursor = conn.execute("""
    SELECT url, content
    FROM cache
    WHERE url LIKE '%/l/%'
    ORDER BY timestamp DESC
    LIMIT 20
""")

for url, content_blob in cursor:
    try:
        content = zlib.decompress(content_blob).decode('utf-8')

        # Look for viewing/pickup patterns
        if 'bezichtig' in content.lower() or 'ophalen' in content.lower():
            print(f"\n{'='*60}")
            print(f"URL: {url}")
            print(f"{'='*60}")

            # Extract sections with context
            patterns = [
                (r'(Bezichtigingen?.*?(?:\n.*?){0,5})', 'VIEWING'),
                (r'(Ophalen.*?(?:\n.*?){0,5})', 'PICKUP'),
            ]

            for pattern, label in patterns:
                matches = re.findall(pattern, content, re.IGNORECASE | re.DOTALL)
                if matches:
                    print(f"\n{label}:")
                    for match in matches[:1]:  # First match
                        # Clean up HTML
                        clean = re.sub(r'<[^>]+>', ' ', match)
                        clean = re.sub(r'\s+', ' ', clean).strip()
                        print(f"  {clean[:200]}")

            break  # Found one, that's enough
    except (zlib.error, UnicodeDecodeError):
        continue

conn.close()
@@ -1,47 +0,0 @@
# PowerShell script to create Windows Task Scheduler job for Scaev Monitor
# Run as Administrator

$TaskName = "ScaevAuctionMonitor"
$ScriptPath = "C:\vibe\scaev\src\monitor.py"
$PythonPath = "python3"  # Adjust if needed
$WorkingDir = "C:\vibe\scaev"

# Create the action (run Python script)
$Action = New-ScheduledTaskAction -Execute $PythonPath `
    -Argument "$ScriptPath 30" `
    -WorkingDirectory $WorkingDir

# Trigger: On system startup
$TriggerStartup = New-ScheduledTaskTrigger -AtStartup

# Settings
$Settings = New-ScheduledTaskSettingsSet `
    -AllowStartIfOnBatteries `
    -DontStopIfGoingOnBatteries `
    -StartWhenAvailable `
    -RestartCount 3 `
    -RestartInterval (New-TimeSpan -Minutes 5)

# Principal: Run with highest privileges
$Principal = New-ScheduledTaskPrincipal -UserId "SYSTEM" -LogonType ServiceAccount -RunLevel Highest

# Register the task
Register-ScheduledTask `
    -TaskName $TaskName `
    -Action $Action `
    -Trigger $TriggerStartup `
    -Settings $Settings `
    -Principal $Principal `
    -Description "Scaev auction monitor - polls for new auctions every 30 minutes" `
    -Force

Write-Host "`nTask '$TaskName' created successfully!" -ForegroundColor Green
Write-Host "`nTo manage the task:"
Write-Host "  1. Open Task Scheduler (taskschd.msc)"
Write-Host "  2. Find 'ScaevAuctionMonitor' in Task Scheduler Library"
Write-Host "  3. Right-click to Run, Stop, or Disable"
Write-Host "`nOr use PowerShell commands:"
Write-Host "  Start-ScheduledTask -TaskName '$TaskName'"
Write-Host "  Stop-ScheduledTask -TaskName '$TaskName'"
Write-Host "  Disable-ScheduledTask -TaskName '$TaskName'"
Write-Host "  Get-ScheduledTask -TaskName '$TaskName' | Get-ScheduledTaskInfo"
@@ -1,49 +0,0 @@
#!/usr/bin/env python3
"""Show migration statistics"""
import sqlite3

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

cursor = conn.execute("""
    SELECT
        COUNT(*) as total,
        SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year,
        SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition,
        SUM(CASE WHEN manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer,
        SUM(CASE WHEN brand != '' THEN 1 ELSE 0 END) as has_brand,
        SUM(CASE WHEN model != '' THEN 1 ELSE 0 END) as has_model
    FROM lots
""")

stats = cursor.fetchone()

print("="*60)
print("MIGRATION RESULTS")
print("="*60)
print(f"\nTotal lots: {stats[0]:,}")
print(f"Has year: {stats[1]:,} ({100*stats[1]/stats[0]:.1f}%)")
print(f"Has condition: {stats[2]:,} ({100*stats[2]/stats[0]:.1f}%)")
print(f"Has manufacturer: {stats[3]:,} ({100*stats[3]/stats[0]:.1f}%)")
print(f"Has brand: {stats[4]:,} ({100*stats[4]/stats[0]:.1f}%)")
print(f"Has model: {stats[5]:,} ({100*stats[5]/stats[0]:.1f}%)")

# Show sample enriched data
print(f"\n{'='*60}")
print("SAMPLE ENRICHED LOTS")
print(f"{'='*60}")

cursor = conn.execute("""
    SELECT lot_id, year_manufactured, manufacturer, model, condition_score
    FROM lots
    WHERE year_manufactured IS NOT NULL OR manufacturer != ''
    LIMIT 5
""")

for row in cursor:
    print(f"\n{row[0]}:")
    print(f"  Year: {row[1]}")
    print(f"  Manufacturer: {row[2]}")
    print(f"  Model: {row[3]}")
    print(f"  Condition: {row[4]}")

conn.close()
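The percentage lines above (here and in the migration script's closing stats) divide by the total row count, which raises ZeroDivisionError on an empty lots table; a tiny guard like this (illustrative helper, not in the repo) keeps the report robust:

def pct(part: int, total: int) -> str:
    """Format part/total as a percentage, tolerating an empty table."""
    return f"{100 * part / total:.1f}%" if total else "n/a"

# e.g. print(f"Has year: {stats[1]:,} ({pct(stats[1], stats[0])})")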
173 src/cache.py
@@ -19,8 +19,9 @@ class CacheManager:
|
||||
self._init_db()
|
||||
|
||||
def _init_db(self):
|
||||
"""Initialize cache and data storage database"""
|
||||
"""Initialize cache and data storage database with consolidated schema"""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
# Cache table
|
||||
conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS cache (
|
||||
url TEXT PRIMARY KEY,
|
||||
@@ -32,6 +33,8 @@ class CacheManager:
|
||||
conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_timestamp ON cache(timestamp)
|
||||
""")
|
||||
|
||||
# Auctions table - consolidated schema
|
||||
conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS auctions (
|
||||
auction_id TEXT PRIMARY KEY,
|
||||
@@ -40,9 +43,18 @@ class CacheManager:
|
||||
location TEXT,
|
||||
lots_count INTEGER,
|
||||
first_lot_closing_time TEXT,
|
||||
scraped_at TEXT
|
||||
scraped_at TEXT,
|
||||
city TEXT,
|
||||
country TEXT,
|
||||
type TEXT,
|
||||
lot_count INTEGER DEFAULT 0,
|
||||
closing_time TEXT,
|
||||
discovered_at INTEGER
|
||||
)
|
||||
""")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country)")
|
||||
|
||||
# Lots table - consolidated schema with all fields from working database
|
||||
conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS lots (
|
||||
lot_id TEXT PRIMARY KEY,
|
||||
@@ -50,8 +62,6 @@ class CacheManager:
|
||||
url TEXT UNIQUE,
|
||||
title TEXT,
|
||||
current_bid TEXT,
|
||||
starting_bid TEXT,
|
||||
minimum_bid TEXT,
|
||||
bid_count INTEGER,
|
||||
closing_time TEXT,
|
||||
viewing_time TEXT,
|
||||
@@ -60,9 +70,54 @@ class CacheManager:
|
||||
description TEXT,
|
||||
category TEXT,
|
||||
scraped_at TEXT,
|
||||
sale_id INTEGER,
|
||||
manufacturer TEXT,
|
||||
type TEXT,
|
||||
year INTEGER,
|
||||
currency TEXT DEFAULT 'EUR',
|
||||
closing_notified INTEGER DEFAULT 0,
|
||||
starting_bid TEXT,
|
||||
minimum_bid TEXT,
|
||||
status TEXT,
|
||||
brand TEXT,
|
||||
model TEXT,
|
||||
attributes_json TEXT,
|
||||
first_bid_time TEXT,
|
||||
last_bid_time TEXT,
|
||||
bid_velocity REAL,
|
||||
bid_increment REAL,
|
||||
year_manufactured INTEGER,
|
||||
condition_score REAL,
|
||||
condition_description TEXT,
|
||||
serial_number TEXT,
|
||||
damage_description TEXT,
|
||||
followers_count INTEGER DEFAULT 0,
|
||||
estimated_min_price REAL,
|
||||
estimated_max_price REAL,
|
||||
lot_condition TEXT,
|
||||
appearance TEXT,
|
||||
estimated_min REAL,
|
||||
estimated_max REAL,
|
||||
next_bid_step_cents INTEGER,
|
||||
condition TEXT,
|
||||
category_path TEXT,
|
||||
city_location TEXT,
|
||||
country_code TEXT,
|
||||
bidding_status TEXT,
|
||||
packaging TEXT,
|
||||
quantity INTEGER,
|
||||
vat REAL,
|
||||
buyer_premium_percentage REAL,
|
||||
remarks TEXT,
|
||||
reserve_price REAL,
|
||||
reserve_met INTEGER,
|
||||
view_count INTEGER,
|
||||
FOREIGN KEY (auction_id) REFERENCES auctions(auction_id)
|
||||
)
|
||||
""")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_sale_id ON lots(sale_id)")
|
||||
|
||||
# Images table
|
||||
conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS images (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
@@ -70,86 +125,28 @@ class CacheManager:
|
||||
url TEXT,
|
||||
local_path TEXT,
|
||||
downloaded INTEGER DEFAULT 0,
|
||||
labels TEXT,
|
||||
processed_at INTEGER,
|
||||
FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
|
||||
)
|
||||
""")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_images_lot_id ON images(lot_id)")
|
||||
|
||||
# Add new columns to auctions table if they don't exist
|
||||
cursor = conn.execute("PRAGMA table_info(auctions)")
|
||||
auction_columns = {row[1] for row in cursor.fetchall()}
|
||||
# Remove duplicates before creating unique index
|
||||
conn.execute("""
|
||||
DELETE FROM images
|
||||
WHERE id NOT IN (
|
||||
SELECT MIN(id)
|
||||
FROM images
|
||||
GROUP BY lot_id, url
|
||||
)
|
||||
""")
|
||||
conn.execute("""
|
||||
CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url
|
||||
ON images(lot_id, url)
|
||||
""")
|
||||
|
||||
if 'city' not in auction_columns:
|
||||
conn.execute("ALTER TABLE auctions ADD COLUMN city TEXT")
|
||||
if 'country' not in auction_columns:
|
||||
conn.execute("ALTER TABLE auctions ADD COLUMN country TEXT")
|
||||
if 'type' not in auction_columns:
|
||||
conn.execute("ALTER TABLE auctions ADD COLUMN type TEXT")
|
||||
if 'lot_count' not in auction_columns:
|
||||
conn.execute("ALTER TABLE auctions ADD COLUMN lot_count INTEGER DEFAULT 0")
|
||||
if 'closing_time' not in auction_columns:
|
||||
conn.execute("ALTER TABLE auctions ADD COLUMN closing_time TEXT")
|
||||
if 'discovered_at' not in auction_columns:
|
||||
conn.execute("ALTER TABLE auctions ADD COLUMN discovered_at INTEGER")
|
||||
|
||||
# Add index for country filtering
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country)")
|
||||
|
||||
# Add new columns to lots table if they don't exist
|
||||
cursor = conn.execute("PRAGMA table_info(lots)")
|
||||
columns = {row[1] for row in cursor.fetchall()}
|
||||
|
||||
if 'starting_bid' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN starting_bid TEXT")
|
||||
if 'minimum_bid' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN minimum_bid TEXT")
|
||||
if 'status' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN status TEXT")
|
||||
if 'brand' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN brand TEXT")
|
||||
if 'model' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN model TEXT")
|
||||
if 'attributes_json' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN attributes_json TEXT")
|
||||
|
||||
# Bidding intelligence fields
|
||||
if 'first_bid_time' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN first_bid_time TEXT")
|
||||
if 'last_bid_time' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN last_bid_time TEXT")
|
||||
if 'bid_velocity' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN bid_velocity REAL")
|
||||
if 'bid_increment' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN bid_increment REAL")
|
||||
|
||||
# Valuation intelligence fields
|
||||
if 'year_manufactured' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN year_manufactured INTEGER")
|
||||
if 'condition_score' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN condition_score REAL")
|
||||
if 'condition_description' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN condition_description TEXT")
|
||||
if 'serial_number' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN serial_number TEXT")
|
||||
if 'manufacturer' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN manufacturer TEXT")
|
||||
if 'damage_description' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN damage_description TEXT")
|
||||
|
||||
# NEW: High-value API fields
|
||||
if 'followers_count' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN followers_count INTEGER DEFAULT 0")
|
||||
if 'estimated_min_price' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN estimated_min_price REAL")
|
||||
if 'estimated_max_price' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN estimated_max_price REAL")
|
||||
if 'lot_condition' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN lot_condition TEXT")
|
||||
if 'appearance' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN appearance TEXT")
|
||||
if 'scraped_at_timestamp' not in columns:
|
||||
conn.execute("ALTER TABLE lots ADD COLUMN scraped_at_timestamp INTEGER")
|
||||
|
||||
# Create bid_history table
|
||||
# Bid history table
|
||||
conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS bid_history (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
@@ -163,33 +160,15 @@ class CacheManager:
|
||||
FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
|
||||
)
|
||||
""")
|
||||
|
||||
conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_bid_history_lot_time
|
||||
ON bid_history(lot_id, bid_time)
|
||||
""")
|
||||
|
||||
conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_bid_history_bidder
|
||||
ON bid_history(bidder_id)
|
||||
""")
|
||||
|
||||
# Remove duplicates before creating unique index
|
||||
# Keep the row with the smallest id (first occurrence) for each (lot_id, url) pair
|
||||
conn.execute("""
|
||||
DELETE FROM images
|
||||
WHERE id NOT IN (
|
||||
SELECT MIN(id)
|
||||
FROM images
|
||||
GROUP BY lot_id, url
|
||||
)
|
||||
""")
|
||||
|
||||
# Now create the unique index
|
||||
conn.execute("""
|
||||
CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url
|
||||
ON images(lot_id, url)
|
||||
""")
|
||||
conn.commit()
|
||||
|
||||
def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
|
||||
|
||||
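The long run of `if 'x' not in columns: ALTER TABLE ...` blocks above all follow one pattern; a helper along these lines (purely illustrative, not part of src/cache.py) would let each migration be expressed as a single dict:

import sqlite3

def ensure_columns(conn: sqlite3.Connection, table: str, columns: dict) -> None:
    """Add any missing columns, e.g. ensure_columns(conn, 'lots', {'brand': 'TEXT', 'vat': 'REAL'})."""
    existing = {row[1] for row in conn.execute(f"PRAGMA table_info({table})")}
    for name, ddl_type in columns.items():
        if name not in existing:
            conn.execute(f"ALTER TABLE {table} ADD COLUMN {name} {ddl_type}")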
256 sync_updates.py
@@ -1,256 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Sync local database updates to server-compatible format
|
||||
Creates incremental exports with only NEW or UPDATED records
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import json
|
||||
import csv
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
DB_PATH = "C:/mnt/okcomputer/output/cache.db"
|
||||
OUTPUT_DIR = Path("C:/mnt/okcomputer/output")
|
||||
|
||||
def fill_missing_auction_fields():
|
||||
"""Fill in missing fields in auctions table from scraped data"""
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
print("Filling missing auction fields...")
|
||||
|
||||
# Update closing_time from first_lot_closing_time
|
||||
cursor.execute("""
|
||||
UPDATE auctions
|
||||
SET closing_time = first_lot_closing_time
|
||||
WHERE closing_time IS NULL AND first_lot_closing_time IS NOT NULL
|
||||
""")
|
||||
updated = cursor.rowcount
|
||||
print(f" ✓ Updated {updated} closing_time fields")
|
||||
|
||||
# Parse location to extract city and country
|
||||
cursor.execute("""
|
||||
SELECT auction_id, location
|
||||
FROM auctions
|
||||
WHERE location IS NOT NULL AND (city IS NULL OR country IS NULL)
|
||||
""")
|
||||
locations = cursor.fetchall()
|
||||
|
||||
city_updates = 0
|
||||
for auction_id, location in locations:
|
||||
if not location:
|
||||
continue
|
||||
|
||||
# Parse "City, COUNTRY" or "City, Region, COUNTRY"
|
||||
parts = [p.strip() for p in location.split(',')]
|
||||
if len(parts) >= 2:
|
||||
city = parts[0]
|
||||
country = parts[-1]
|
||||
|
||||
cursor.execute("""
|
||||
UPDATE auctions
|
||||
SET city = ?, country = ?
|
||||
WHERE auction_id = ?
|
||||
""", (city, country, auction_id))
|
||||
city_updates += 1
|
||||
|
||||
print(f" ✓ Updated {city_updates} city/country fields")
|
||||
|
||||
# Set type to 'online' for all (Troostwijk is online platform)
|
||||
cursor.execute("""
|
||||
UPDATE auctions
|
||||
SET type = 'online'
|
||||
WHERE type IS NULL
|
||||
""")
|
||||
type_updates = cursor.rowcount
|
||||
print(f" ✓ Updated {type_updates} type fields")
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
print(f"✓ Auction fields updated\n")
|
||||
|
||||
def get_last_sync_timestamp():
|
||||
"""Get timestamp of last successful sync"""
|
||||
sync_file = OUTPUT_DIR / ".last_sync"
|
||||
if sync_file.exists():
|
||||
return int(sync_file.read_text().strip())
|
||||
return 0
|
||||
|
||||
def save_sync_timestamp(timestamp: int):
|
||||
"""Save timestamp of successful sync"""
|
||||
sync_file = OUTPUT_DIR / ".last_sync"
|
||||
sync_file.write_text(str(timestamp))
|
||||
|
||||
def export_incremental():
|
||||
"""Export only records that are new or updated since last sync"""
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
conn.row_factory = sqlite3.Row
|
||||
cursor = conn.cursor()
|
||||
|
||||
last_sync = get_last_sync_timestamp()
|
||||
current_time = int(datetime.now().timestamp())
|
||||
|
||||
print(f"Last sync: {datetime.fromtimestamp(last_sync).strftime('%Y-%m-%d %H:%M:%S') if last_sync else 'Never'}")
|
||||
print(f"Current time: {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
|
||||
# Get new/updated auctions
|
||||
cursor.execute("""
|
||||
SELECT * FROM auctions
|
||||
WHERE discovered_at IS NULL OR discovered_at > ?
|
||||
ORDER BY auction_id
|
||||
""", (last_sync,))
|
||||
new_auctions = [dict(row) for row in cursor.fetchall()]
|
||||
|
||||
# Get new/updated lots
|
||||
cursor.execute("""
|
||||
SELECT * FROM lots
|
||||
WHERE scraped_at_timestamp IS NULL OR scraped_at_timestamp > ?
|
||||
ORDER BY lot_id
|
||||
""", (last_sync,))
|
||||
new_lots = [dict(row) for row in cursor.fetchall()]
|
||||
|
||||
conn.close()
|
||||
|
||||
# Export to timestamped files
|
||||
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
||||
|
||||
results = {
|
||||
'auctions': 0,
|
||||
'lots': 0,
|
||||
'files': {}
|
||||
}
|
||||
|
||||
# Export auctions if any new
|
||||
if new_auctions:
|
||||
auctions_csv = OUTPUT_DIR / f'auctions_update_{timestamp}.csv'
|
||||
auctions_json = OUTPUT_DIR / f'auctions_update_{timestamp}.json'
|
||||
|
||||
with open(auctions_csv, 'w', newline='', encoding='utf-8') as f:
|
||||
writer = csv.DictWriter(f, fieldnames=new_auctions[0].keys())
|
||||
writer.writeheader()
|
||||
writer.writerows(new_auctions)
|
||||
|
||||
with open(auctions_json, 'w', encoding='utf-8') as f:
|
||||
json.dump(new_auctions, f, indent=2, ensure_ascii=False)
|
||||
|
||||
results['auctions'] = len(new_auctions)
|
||||
results['files']['auctions_csv'] = str(auctions_csv)
|
||||
results['files']['auctions_json'] = str(auctions_json)
|
||||
|
||||
print(f"\n✓ Exported {len(new_auctions)} new/updated auctions")
|
||||
print(f" CSV: {auctions_csv}")
|
||||
print(f" JSON: {auctions_json}")
|
||||
|
||||
# Export lots if any new
|
||||
if new_lots:
|
||||
lots_csv = OUTPUT_DIR / f'lots_update_{timestamp}.csv'
|
||||
lots_json = OUTPUT_DIR / f'lots_update_{timestamp}.json'
|
||||
|
||||
with open(lots_csv, 'w', newline='', encoding='utf-8') as f:
|
||||
writer = csv.DictWriter(f, fieldnames=new_lots[0].keys())
|
||||
writer.writeheader()
|
||||
writer.writerows(new_lots)
|
||||
|
||||
with open(lots_json, 'w', encoding='utf-8') as f:
|
||||
json.dump(new_lots, f, indent=2, ensure_ascii=False)
|
||||
|
||||
results['lots'] = len(new_lots)
|
||||
results['files']['lots_csv'] = str(lots_csv)
|
||||
results['files']['lots_json'] = str(lots_json)
|
||||
|
||||
print(f"\n✓ Exported {len(new_lots)} new/updated lots")
|
||||
print(f" CSV: {lots_csv}")
|
||||
print(f" JSON: {lots_json}")
|
||||
|
||||
if not new_auctions and not new_lots:
|
||||
print("\n✓ No new updates since last sync")
|
||||
|
||||
return results
|
||||
|
||||
def create_upsert_export():
|
||||
"""Create SQL script for server to UPSERT (update or insert) data"""
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
conn.row_factory = sqlite3.Row
|
||||
cursor = conn.cursor()
|
||||
|
||||
last_sync = get_last_sync_timestamp()
|
||||
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
||||
|
||||
# Get new/updated auctions
|
||||
cursor.execute("""
|
||||
SELECT * FROM auctions
|
||||
WHERE discovered_at IS NULL OR discovered_at > ?
|
||||
""", (last_sync,))
|
||||
new_auctions = [dict(row) for row in cursor.fetchall()]
|
||||
|
||||
if new_auctions:
|
||||
sql_file = OUTPUT_DIR / f'upsert_auctions_{timestamp}.sql'
|
||||
|
||||
with open(sql_file, 'w', encoding='utf-8') as f:
|
||||
f.write("-- UPSERT script for auctions (updates existing, inserts new)\n\n")
|
||||
|
||||
for auction in new_auctions:
|
||||
# Create INSERT OR REPLACE statement
|
||||
columns = list(auction.keys())
|
||||
placeholders = []
|
||||
|
||||
for col, val in auction.items():
|
||||
if val is None:
|
||||
placeholders.append("NULL")
|
||||
elif isinstance(val, (int, float)):
|
||||
placeholders.append(str(val))
|
||||
else:
|
||||
# Escape single quotes
|
||||
escaped = str(val).replace("'", "''")
|
||||
placeholders.append(f"'{escaped}'")
|
||||
|
||||
f.write(f"INSERT OR REPLACE INTO auctions ({', '.join(columns)})\n")
|
||||
f.write(f"VALUES ({', '.join(placeholders)});\n\n")
|
||||
|
||||
print(f"\n✓ Created UPSERT SQL script: {sql_file}")
|
||||
print(f" Server can execute this to avoid constraint errors")
|
||||
|
||||
conn.close()
|
||||
|
||||
def main():
|
||||
"""Main sync process"""
|
||||
print("="*60)
|
||||
print("DATABASE SYNC UTILITY")
|
||||
print("="*60)
|
||||
print(f"Database: {DB_PATH}")
|
||||
print(f"Output: {OUTPUT_DIR}")
|
||||
print("="*60)
|
||||
|
||||
# Step 1: Fill missing fields
|
||||
fill_missing_auction_fields()
|
||||
|
||||
# Step 2: Export incremental updates
|
||||
print("Exporting incremental updates...")
|
||||
results = export_incremental()
|
||||
|
||||
# Step 3: Create UPSERT SQL (prevents constraint errors on server)
|
||||
if results['auctions'] > 0:
|
||||
create_upsert_export()
|
||||
|
||||
# Step 4: Save sync timestamp
|
||||
current_time = int(datetime.now().timestamp())
|
||||
save_sync_timestamp(current_time)
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("SYNC COMPLETE")
|
||||
print("="*60)
|
||||
print(f"New auctions: {results['auctions']}")
|
||||
print(f"New lots: {results['lots']}")
|
||||
|
||||
if results['files']:
|
||||
print("\nFiles ready for server import:")
|
||||
for key, path in results['files'].items():
|
||||
print(f" {key}: {path}")
|
||||
|
||||
print("\nNext sync will only export records newer than:")
|
||||
print(f" {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
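create_upsert_export escapes values into literal SQL text, which is fragile for anything beyond simple strings; if the server side can run Python, an equivalent parameterized import (sketch only, assuming the auctions_update_*.json file written by export_incremental) avoids quoting issues entirely:

import json
import sqlite3

def upsert_auctions(db_path: str, json_path: str) -> None:
    """Apply an auctions_update_*.json export with INSERT OR REPLACE and bound parameters."""
    with open(json_path, encoding='utf-8') as f:
        rows = json.load(f)
    if not rows:
        return
    cols = list(rows[0].keys())
    placeholders = ', '.join('?' for _ in cols)
    sql = f"INSERT OR REPLACE INTO auctions ({', '.join(cols)}) VALUES ({placeholders})"
    with sqlite3.connect(db_path) as conn:
        conn.executemany(sql, [tuple(r[c] for c in cols) for r in rows])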
@@ -1,28 +0,0 @@
#!/usr/bin/env python3
"""Test auction data fetch"""
import asyncio
import json
import sys
sys.path.insert(0, 'src')

from graphql_client import fetch_auction_data, format_auction_data

async def main():
    auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa"

    print(f"Fetching auction: {auction_id}\n")
    auction_data = await fetch_auction_data(auction_id)

    if auction_data:
        print("Raw Auction Data:")
        print(json.dumps(auction_data, indent=2))

        print("\n\nFormatted:")
        formatted = format_auction_data(auction_data)
        print(f"Viewing: {formatted['viewing_time']}")
        print(f"Pickup: {formatted['pickup_date']}")
    else:
        print("No auction data returned")

if __name__ == "__main__":
    asyncio.run(main())
@@ -1,59 +0,0 @@
#!/usr/bin/env python3
"""Test if the auction query works at all"""
import asyncio
import aiohttp
import json

GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"

# Try a simpler query first
SIMPLE_QUERY = """
query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
  auction(id: $auctionId, locale: $locale, platform: $platform) {
    id
    displayId
    viewingDays {
      startDate
      endDate
      city
      countryCode
    }
    collectionDays {
      startDate
      endDate
      city
      countryCode
    }
  }
}
"""

async def main():
    auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa"

    variables = {
        "auctionId": auction_id,
        "locale": "nl",
        "platform": "TWK"
    }

    payload = {
        "query": SIMPLE_QUERY,
        "variables": variables
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
            print(f"Status: {response.status}")
            text = await response.text()
            print(f"Response: {text}")

            try:
                data = json.loads(text)
                print("\nParsed:")
                print(json.dumps(data, indent=2))
            except json.JSONDecodeError:
                pass

if __name__ == "__main__":
    asyncio.run(main())
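One thing the quick test above does not surface is that GraphQL reports query errors inside the body while still returning HTTP 200; a small wrapper (sketch only, reusing the GRAPHQL_ENDPOINT constant defined above) makes that failure mode explicit:

import aiohttp

async def graphql_post(query: str, variables: dict) -> dict:
    """POST a query to the storefront endpoint and raise if the response carries GraphQL errors."""
    async with aiohttp.ClientSession() as session:
        async with session.post(GRAPHQL_ENDPOINT, json={"query": query, "variables": variables}) as resp:
            resp.raise_for_status()
            data = await resp.json()
    if data.get("errors"):
        raise RuntimeError(f"GraphQL errors: {data['errors']}")
    return data.get("data", {})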
@@ -1,95 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test comprehensive data enrichment"""
|
||||
import asyncio
|
||||
import sys
|
||||
sys.path.insert(0, 'src')
|
||||
|
||||
from scraper import TroostwijkScraper
|
||||
|
||||
async def main():
|
||||
scraper = TroostwijkScraper()
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
page = await browser.new_page(
|
||||
viewport={'width': 1920, 'height': 1080},
|
||||
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
||||
)
|
||||
|
||||
# Test with lot that has bids
|
||||
lot_url = "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5"
|
||||
|
||||
print(f"Testing comprehensive extraction\n")
|
||||
result = await scraper.crawl_page(page, lot_url)
|
||||
|
||||
if result:
|
||||
print(f"\n{'='*60}")
|
||||
print("COMPREHENSIVE DATA EXTRACTION:")
|
||||
print(f"{'='*60}")
|
||||
print(f"Lot ID: {result.get('lot_id')}")
|
||||
print(f"Title: {result.get('title', '')[:50]}...")
|
||||
print(f"\n[Bidding Intelligence]")
|
||||
print(f" Status: {result.get('status')}")
|
||||
print(f" Current Bid: {result.get('current_bid')}")
|
||||
print(f" Starting Bid: {result.get('starting_bid')}")
|
||||
print(f" Bid Increment: EUR {result.get('bid_increment', 0):.2f}")
|
||||
print(f" Bid Count: {result.get('bid_count')}")
|
||||
print(f" First Bid: {result.get('first_bid_time', 'N/A')}")
|
||||
print(f" Last Bid: {result.get('last_bid_time', 'N/A')}")
|
||||
print(f" Bid Velocity: {result.get('bid_velocity', 0)} bids/hour")
|
||||
print(f"\n[Valuation Intelligence]")
|
||||
print(f" Brand: {result.get('brand', 'N/A')}")
|
||||
print(f" Model: {result.get('model', 'N/A')}")
|
||||
print(f" Year: {result.get('year_manufactured', 'N/A')}")
|
||||
print(f" Manufacturer: {result.get('manufacturer', 'N/A')}")
|
||||
print(f" Condition Score: {result.get('condition_score', 'N/A')}")
|
||||
print(f" Condition: {result.get('condition_description', 'N/A')}")
|
||||
print(f" Serial#: {result.get('serial_number', 'N/A')}")
|
||||
print(f" Damage: {result.get('damage_description', 'N/A')[:50] if result.get('damage_description') else 'N/A'}...")
|
||||
|
||||
await browser.close()
|
||||
|
||||
# Verify database
|
||||
import sqlite3
|
||||
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
|
||||
|
||||
# Check lot data
|
||||
cursor = conn.execute("""
|
||||
SELECT bid_velocity, first_bid_time, year_manufactured, condition_score
|
||||
FROM lots
|
||||
WHERE lot_id = ?
|
||||
""", (result.get('lot_id'),))
|
||||
row = cursor.fetchone()
|
||||
|
||||
if row:
|
||||
print(f"\n{'='*60}")
|
||||
print("DATABASE VERIFICATION (lots table):")
|
||||
print(f"{'='*60}")
|
||||
print(f" Bid Velocity: {row[0]}")
|
||||
print(f" First Bid Time: {row[1]}")
|
||||
print(f" Year: {row[2]}")
|
||||
print(f" Condition Score: {row[3]}")
|
||||
|
||||
# Check bid history
|
||||
cursor = conn.execute("""
|
||||
SELECT COUNT(*), MIN(bid_time), MAX(bid_time), SUM(is_autobid)
|
||||
FROM bid_history
|
||||
WHERE lot_id = ?
|
||||
""", (result.get('lot_id'),))
|
||||
row = cursor.fetchone()
|
||||
|
||||
if row and row[0] > 0:
|
||||
print(f"\n{'='*60}")
|
||||
print("DATABASE VERIFICATION (bid_history table):")
|
||||
print(f"{'='*60}")
|
||||
print(f" Total Bids Stored: {row[0]}")
|
||||
print(f" First Bid: {row[1]}")
|
||||
print(f" Last Bid: {row[2]}")
|
||||
print(f" Autobids: {row[3]}")
|
||||
|
||||
conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -1,49 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test concurrent image downloads"""
|
||||
import asyncio
|
||||
import time
|
||||
import sys
|
||||
sys.path.insert(0, 'src')
|
||||
|
||||
from scraper import TroostwijkScraper
|
||||
|
||||
async def main():
|
||||
scraper = TroostwijkScraper()
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
page = await browser.new_page(
|
||||
viewport={'width': 1920, 'height': 1080},
|
||||
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
||||
)
|
||||
|
||||
# Test with a lot that has multiple images
|
||||
lot_url = "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5"
|
||||
|
||||
print(f"Testing concurrent image downloads\n")
|
||||
print(f"Lot: {lot_url}\n")
|
||||
|
||||
start_time = time.time()
|
||||
result = await scraper.crawl_page(page, lot_url)
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"TIMING RESULTS:")
|
||||
print(f"{'='*60}")
|
||||
print(f"Total time: {elapsed:.2f}s")
|
||||
|
||||
image_count = len(result.get('images', []))
|
||||
print(f"Images: {image_count}")
|
||||
|
||||
if image_count > 1:
|
||||
print(f"Time per image: {elapsed/image_count:.2f}s (if sequential)")
|
||||
print(f"Actual time: {elapsed:.2f}s (concurrent!)")
|
||||
speedup = (image_count * 0.5) / elapsed if elapsed > 0 else 1
|
||||
print(f"Speedup factor: {speedup:.1f}x")
|
||||
|
||||
await browser.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -1,66 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test the full scraper with one lot"""
|
||||
import asyncio
|
||||
import sys
|
||||
sys.path.insert(0, 'src')
|
||||
|
||||
from scraper import TroostwijkScraper
|
||||
|
||||
async def main():
|
||||
scraper = TroostwijkScraper()
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
page = await browser.new_page(
|
||||
viewport={'width': 1920, 'height': 1080},
|
||||
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
||||
)
|
||||
|
||||
# Test with a known lot
|
||||
lot_url = "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5"
|
||||
|
||||
print(f"Testing with: {lot_url}\n")
|
||||
result = await scraper.crawl_page(page, lot_url)
|
||||
|
||||
if result:
|
||||
print(f"\n{'='*60}")
|
||||
print("FINAL RESULT:")
|
||||
print(f"{'='*60}")
|
||||
print(f"Lot ID: {result.get('lot_id')}")
|
||||
print(f"Title: {result.get('title', '')[:50]}...")
|
||||
print(f"Current Bid: {result.get('current_bid')}")
|
||||
print(f"Starting Bid: {result.get('starting_bid')}")
|
||||
print(f"Minimum Bid: {result.get('minimum_bid')}")
|
||||
print(f"Bid Count: {result.get('bid_count')}")
|
||||
print(f"Closing Time: {result.get('closing_time')}")
|
||||
print(f"Viewing Time: {result.get('viewing_time', 'N/A')}")
|
||||
print(f"Pickup Date: {result.get('pickup_date', 'N/A')}")
|
||||
print(f"Location: {result.get('location')}")
|
||||
|
||||
await browser.close()
|
||||
|
||||
# Verify database
|
||||
import sqlite3
|
||||
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
|
||||
cursor = conn.execute("""
|
||||
SELECT current_bid, starting_bid, minimum_bid, bid_count, closing_time
|
||||
FROM lots
|
||||
WHERE lot_id = 'A1-28505-5'
|
||||
""")
|
||||
row = cursor.fetchone()
|
||||
conn.close()
|
||||
|
||||
if row:
|
||||
print(f"\n{'='*60}")
|
||||
print("DATABASE VERIFICATION:")
|
||||
print(f"{'='*60}")
|
||||
print(f"Current Bid: {row[0]}")
|
||||
print(f"Starting Bid: {row[1]}")
|
||||
print(f"Minimum Bid: {row[2]}")
|
||||
print(f"Bid Count: {row[3]}")
|
||||
print(f"Closing Time: {row[4]}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -1,32 +0,0 @@
#!/usr/bin/env python3
"""Test the updated scraper with GraphQL integration"""
import asyncio
import json
import sys
sys.path.insert(0, 'src')

from graphql_client import fetch_lot_bidding_data, format_bid_data

async def main():
    # Test with known lot ID
    lot_id = "A1-28505-5"

    print(f"Testing GraphQL API with lot: {lot_id}\n")

    bidding_data = await fetch_lot_bidding_data(lot_id)

    if bidding_data:
        print("Raw GraphQL Response:")
        print("="*60)
        print(json.dumps(bidding_data, indent=2))

        print("\n\nFormatted Data:")
        print("="*60)
        formatted = format_bid_data(bidding_data)
        for key, value in formatted.items():
            print(f"  {key}: {value}")
    else:
        print("Failed to fetch bidding data")

if __name__ == "__main__":
    asyncio.run(main())
@@ -1,43 +0,0 @@
#!/usr/bin/env python3
"""Test scraping a single live lot page"""
import asyncio
import sqlite3
import sys
sys.path.insert(0, 'src')

from scraper import TroostwijkScraper
from playwright.async_api import async_playwright

async def main():
    scraper = TroostwijkScraper()

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        # Get a lot URL from the database
        conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
        cursor = conn.execute("SELECT url FROM lots LIMIT 1")
        row = cursor.fetchone()
        conn.close()

        if not row:
            print("No lots in database")
            return

        lot_url = row[0]
        print(f"Fetching: {lot_url}\n")

        result = await scraper.crawl_page(page, lot_url)

        if result:
            print("\nExtracted Data:")
            print(f"  current_bid: {result.get('current_bid')}")
            print(f"  bid_count: {result.get('bid_count')}")
            print(f"  closing_time: {result.get('closing_time')}")

        await browser.close()

if __name__ == "__main__":
    asyncio.run(main())
@@ -1,64 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test the new fields extraction"""
|
||||
import asyncio
|
||||
import sys
|
||||
sys.path.insert(0, 'src')
|
||||
|
||||
from scraper import TroostwijkScraper
|
||||
|
||||
async def main():
|
||||
scraper = TroostwijkScraper()
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
page = await browser.new_page(
|
||||
viewport={'width': 1920, 'height': 1080},
|
||||
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
||||
)
|
||||
|
||||
# Test with lot that has attributes
|
||||
lot_url = "https://www.troostwijkauctions.com/l/47-5kg-hexagon-dumbbell-%25282x%2529-A1-40668-34"
|
||||
|
||||
print(f"Testing new fields with: {lot_url}\n")
|
||||
result = await scraper.crawl_page(page, lot_url)
|
||||
|
||||
if result:
|
||||
print(f"\n{'='*60}")
|
||||
print("EXTRACTED FIELDS:")
|
||||
print(f"{'='*60}")
|
||||
print(f"Lot ID: {result.get('lot_id')}")
|
||||
print(f"Title: {result.get('title', '')[:50]}...")
|
||||
print(f"Status: {result.get('status')}")
|
||||
print(f"Brand: {result.get('brand')}")
|
||||
print(f"Model: {result.get('model')}")
|
||||
print(f"Viewing Time: {result.get('viewing_time', 'N/A')}")
|
||||
print(f"Pickup Date: {result.get('pickup_date', 'N/A')}")
|
||||
print(f"Attributes: {result.get('attributes_json', '')[:100]}...")
|
||||
|
||||
await browser.close()
|
||||
|
||||
# Verify database
|
||||
import sqlite3
|
||||
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
|
||||
cursor = conn.execute("""
|
||||
SELECT status, brand, model, viewing_time, pickup_date
|
||||
FROM lots
|
||||
WHERE lot_id = ?
|
||||
""", (result.get('lot_id'),))
|
||||
row = cursor.fetchone()
|
||||
conn.close()
|
||||
|
||||
if row:
|
||||
print(f"\n{'='*60}")
|
||||
print("DATABASE VERIFICATION:")
|
||||
print(f"{'='*60}")
|
||||
print(f"Status: {row[0]}")
|
||||
print(f"Brand: {row[1]}")
|
||||
print(f"Model: {row[2]}")
|
||||
print(f"Viewing: {row[3][:100] if row[3] else 'N/A'}...")
|
||||
print(f"Pickup: {row[4][:100] if row[4] else 'N/A'}...")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
306 validate_data.py
@@ -1,306 +0,0 @@
|
||||
"""
|
||||
Validate data quality and completeness in the database.
|
||||
Checks if scraped data matches expectations and API capabilities.
|
||||
"""
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
|
||||
|
||||
import sqlite3
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Tuple
|
||||
from cache import CacheManager
|
||||
|
||||
cache = CacheManager()
|
||||
DB_PATH = cache.db_path
|

def get_db_stats() -> Dict:
    """Get comprehensive database statistics"""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    stats = {}

    # Total counts
    stats['total_auctions'] = cursor.execute("SELECT COUNT(*) FROM auctions").fetchone()[0]
    stats['total_lots'] = cursor.execute("SELECT COUNT(*) FROM lots").fetchone()[0]
    stats['total_images'] = cursor.execute("SELECT COUNT(*) FROM images").fetchone()[0]
    stats['total_bid_history'] = cursor.execute("SELECT COUNT(*) FROM bid_history").fetchone()[0]

    # Auctions completeness
    cursor.execute("""
        SELECT
            COUNT(*) as total,
            SUM(CASE WHEN title IS NOT NULL AND title != '' THEN 1 ELSE 0 END) as has_title,
            SUM(CASE WHEN lots_count IS NOT NULL THEN 1 ELSE 0 END) as has_lots_count,
            SUM(CASE WHEN closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_closing_time,
            SUM(CASE WHEN first_lot_closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_first_lot_closing
        FROM auctions
    """)
    row = cursor.fetchone()
    stats['auctions'] = {
        'total': row[0],
        'has_title': row[1],
        'has_lots_count': row[2],
        'has_closing_time': row[3],
        'has_first_lot_closing': row[4]
    }

    # Lots completeness - Core fields
    cursor.execute("""
        SELECT
            COUNT(*) as total,
            SUM(CASE WHEN title IS NOT NULL AND title != '' THEN 1 ELSE 0 END) as has_title,
            SUM(CASE WHEN current_bid IS NOT NULL THEN 1 ELSE 0 END) as has_current_bid,
            SUM(CASE WHEN starting_bid IS NOT NULL THEN 1 ELSE 0 END) as has_starting_bid,
            SUM(CASE WHEN minimum_bid IS NOT NULL THEN 1 ELSE 0 END) as has_minimum_bid,
            SUM(CASE WHEN bid_count IS NOT NULL AND bid_count > 0 THEN 1 ELSE 0 END) as has_bids,
            SUM(CASE WHEN closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_closing_time,
            SUM(CASE WHEN status IS NOT NULL AND status != '' THEN 1 ELSE 0 END) as has_status
        FROM lots
    """)
    row = cursor.fetchone()
    stats['lots_core'] = {
        'total': row[0],
        'has_title': row[1],
        'has_current_bid': row[2],
        'has_starting_bid': row[3],
        'has_minimum_bid': row[4],
        'has_bids': row[5],
        'has_closing_time': row[6],
        'has_status': row[7]
    }

    # Lots completeness - Enriched fields
    cursor.execute("""
        SELECT
            COUNT(*) as total,
            SUM(CASE WHEN brand IS NOT NULL AND brand != '' THEN 1 ELSE 0 END) as has_brand,
            SUM(CASE WHEN model IS NOT NULL AND model != '' THEN 1 ELSE 0 END) as has_model,
            SUM(CASE WHEN manufacturer IS NOT NULL AND manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer,
            SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year,
            SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition_score,
            SUM(CASE WHEN condition_description IS NOT NULL AND condition_description != '' THEN 1 ELSE 0 END) as has_condition_desc,
            SUM(CASE WHEN serial_number IS NOT NULL AND serial_number != '' THEN 1 ELSE 0 END) as has_serial,
            SUM(CASE WHEN damage_description IS NOT NULL AND damage_description != '' THEN 1 ELSE 0 END) as has_damage
        FROM lots
    """)
    row = cursor.fetchone()
    stats['lots_enriched'] = {
        'total': row[0],
        'has_brand': row[1],
        'has_model': row[2],
        'has_manufacturer': row[3],
        'has_year': row[4],
        'has_condition_score': row[5],
        'has_condition_desc': row[6],
        'has_serial': row[7],
        'has_damage': row[8]
    }

    # Lots completeness - Bid intelligence
    cursor.execute("""
        SELECT
            COUNT(*) as total,
            SUM(CASE WHEN first_bid_time IS NOT NULL THEN 1 ELSE 0 END) as has_first_bid_time,
            SUM(CASE WHEN last_bid_time IS NOT NULL THEN 1 ELSE 0 END) as has_last_bid_time,
            SUM(CASE WHEN bid_velocity IS NOT NULL THEN 1 ELSE 0 END) as has_bid_velocity,
            SUM(CASE WHEN bid_increment IS NOT NULL THEN 1 ELSE 0 END) as has_bid_increment
        FROM lots
    """)
    row = cursor.fetchone()
    stats['lots_bid_intelligence'] = {
        'total': row[0],
        'has_first_bid_time': row[1],
        'has_last_bid_time': row[2],
        'has_bid_velocity': row[3],
        'has_bid_increment': row[4]
    }

    # Bid history stats
    cursor.execute("""
        SELECT
            COUNT(DISTINCT lot_id) as lots_with_history,
            COUNT(*) as total_bids,
            SUM(CASE WHEN is_autobid = 1 THEN 1 ELSE 0 END) as autobids,
            SUM(CASE WHEN bidder_id IS NOT NULL THEN 1 ELSE 0 END) as has_bidder_id
        FROM bid_history
    """)
    row = cursor.fetchone()
    stats['bid_history'] = {
        'lots_with_history': row[0],
        'total_bids': row[1],
        'autobids': row[2],
        'has_bidder_id': row[3]
    }

    # Image stats
    cursor.execute("""
        SELECT
            COUNT(DISTINCT lot_id) as lots_with_images,
            COUNT(*) as total_images,
            SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded_images,
            SUM(CASE WHEN local_path IS NOT NULL THEN 1 ELSE 0 END) as has_local_path
        FROM images
    """)
    row = cursor.fetchone()
    stats['images'] = {
        'lots_with_images': row[0],
        'total_images': row[1],
        'downloaded_images': row[2],
        'has_local_path': row[3]
    }

    conn.close()
    return stats
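
# Illustrative use of the stats dict built above (keys as populated in
# get_db_stats); e.g. the completeness share for one lot field:
#   s = get_db_stats()
#   pct = s['lots_core']['has_closing_time'] / max(s['lots_core']['total'], 1) * 100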

def check_data_quality() -> List[Tuple[str, str, str]]:
    """Check for data quality issues"""
    issues = []
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    # Check for lots without auction
    cursor.execute("""
        SELECT COUNT(*) FROM lots
        WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
    """)
    orphaned_lots = cursor.fetchone()[0]
    if orphaned_lots > 0:
        issues.append(("ERROR", "Orphaned Lots", f"{orphaned_lots} lots without matching auction"))

    # Check for lots with bids but no bid history
    cursor.execute("""
        SELECT COUNT(*) FROM lots
        WHERE bid_count > 0
        AND lot_id NOT IN (SELECT DISTINCT lot_id FROM bid_history)
    """)
    missing_history = cursor.fetchone()[0]
    if missing_history > 0:
        issues.append(("WARNING", "Missing Bid History", f"{missing_history} lots have bids but no bid history records"))

    # Check for lots with closing time in past but still active
    # ('gesloten' is Dutch for 'closed')
    cursor.execute("""
        SELECT COUNT(*) FROM lots
        WHERE closing_time IS NOT NULL
        AND closing_time < datetime('now')
        AND status NOT LIKE '%gesloten%'
    """)
    past_closing = cursor.fetchone()[0]
    if past_closing > 0:
        issues.append(("INFO", "Past Closing Time", f"{past_closing} lots have closing time in past"))

    # Check for duplicate lot_ids
    cursor.execute("""
        SELECT lot_id, COUNT(*) FROM lots
        GROUP BY lot_id
        HAVING COUNT(*) > 1
    """)
    duplicates = cursor.fetchall()
    if duplicates:
        issues.append(("ERROR", "Duplicate Lot IDs", f"{len(duplicates)} duplicate lot_id values found"))

    # Check for lots without images
    cursor.execute("""
        SELECT COUNT(*) FROM lots
        WHERE lot_id NOT IN (SELECT DISTINCT lot_id FROM images)
    """)
    no_images = cursor.fetchone()[0]
    if no_images > 0:
        issues.append(("WARNING", "No Images", f"{no_images} lots have no images"))

    conn.close()
    return issues
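
# Each reported issue is a (severity, category, message) tuple, e.g.
#   ("WARNING", "No Images", "12 lots have no images")   # the count is illustrative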

def print_validation_report():
    """Print comprehensive validation report"""
    print("=" * 80)
    print("DATABASE VALIDATION REPORT")
    print("=" * 80)
    print()

    stats = get_db_stats()

    # Overall counts
    print("OVERALL COUNTS:")
    print(f" Auctions: {stats['total_auctions']:,}")
    print(f" Lots: {stats['total_lots']:,}")
    print(f" Images: {stats['total_images']:,}")
    print(f" Bid History Records: {stats['total_bid_history']:,}")
    print()

    # Auctions completeness
    print("AUCTIONS COMPLETENESS:")
    a = stats['auctions']
    print(f" Title: {a['has_title']:,} / {a['total']:,} ({a['has_title']/a['total']*100:.1f}%)")
    print(f" Lots Count: {a['has_lots_count']:,} / {a['total']:,} ({a['has_lots_count']/a['total']*100:.1f}%)")
    print(f" Closing Time: {a['has_closing_time']:,} / {a['total']:,} ({a['has_closing_time']/a['total']*100:.1f}%)")
    print(f" First Lot Closing: {a['has_first_lot_closing']:,} / {a['total']:,} ({a['has_first_lot_closing']/a['total']*100:.1f}%)")
    print()

    # Lots core completeness
    print("LOTS CORE FIELDS:")
    l = stats['lots_core']
    print(f" Title: {l['has_title']:,} / {l['total']:,} ({l['has_title']/l['total']*100:.1f}%)")
    print(f" Current Bid: {l['has_current_bid']:,} / {l['total']:,} ({l['has_current_bid']/l['total']*100:.1f}%)")
    print(f" Starting Bid: {l['has_starting_bid']:,} / {l['total']:,} ({l['has_starting_bid']/l['total']*100:.1f}%)")
    print(f" Minimum Bid: {l['has_minimum_bid']:,} / {l['total']:,} ({l['has_minimum_bid']/l['total']*100:.1f}%)")
    print(f" Has Bids (>0): {l['has_bids']:,} / {l['total']:,} ({l['has_bids']/l['total']*100:.1f}%)")
    print(f" Closing Time: {l['has_closing_time']:,} / {l['total']:,} ({l['has_closing_time']/l['total']*100:.1f}%)")
    print(f" Status: {l['has_status']:,} / {l['total']:,} ({l['has_status']/l['total']*100:.1f}%)")
    print()

    # Lots enriched fields
    print("LOTS ENRICHED FIELDS:")
    e = stats['lots_enriched']
    print(f" Brand: {e['has_brand']:,} / {e['total']:,} ({e['has_brand']/e['total']*100:.1f}%)")
    print(f" Model: {e['has_model']:,} / {e['total']:,} ({e['has_model']/e['total']*100:.1f}%)")
    print(f" Manufacturer: {e['has_manufacturer']:,} / {e['total']:,} ({e['has_manufacturer']/e['total']*100:.1f}%)")
    print(f" Year: {e['has_year']:,} / {e['total']:,} ({e['has_year']/e['total']*100:.1f}%)")
    print(f" Condition Score: {e['has_condition_score']:,} / {e['total']:,} ({e['has_condition_score']/e['total']*100:.1f}%)")
    print(f" Condition Desc: {e['has_condition_desc']:,} / {e['total']:,} ({e['has_condition_desc']/e['total']*100:.1f}%)")
    print(f" Serial Number: {e['has_serial']:,} / {e['total']:,} ({e['has_serial']/e['total']*100:.1f}%)")
    print(f" Damage Desc: {e['has_damage']:,} / {e['total']:,} ({e['has_damage']/e['total']*100:.1f}%)")
    print()

    # Bid intelligence
    print("LOTS BID INTELLIGENCE:")
    b = stats['lots_bid_intelligence']
    print(f" First Bid Time: {b['has_first_bid_time']:,} / {b['total']:,} ({b['has_first_bid_time']/b['total']*100:.1f}%)")
    print(f" Last Bid Time: {b['has_last_bid_time']:,} / {b['total']:,} ({b['has_last_bid_time']/b['total']*100:.1f}%)")
    print(f" Bid Velocity: {b['has_bid_velocity']:,} / {b['total']:,} ({b['has_bid_velocity']/b['total']*100:.1f}%)")
    print(f" Bid Increment: {b['has_bid_increment']:,} / {b['total']:,} ({b['has_bid_increment']/b['total']*100:.1f}%)")
    print()

    # Bid history
    print("BID HISTORY:")
    h = stats['bid_history']
    print(f" Lots with History: {h['lots_with_history']:,}")
    print(f" Total Bid Records: {h['total_bids']:,}")
    print(f" Autobids: {h['autobids']:,} ({h['autobids']/max(h['total_bids'],1)*100:.1f}%)")
    print(f" Has Bidder ID: {h['has_bidder_id']:,} ({h['has_bidder_id']/max(h['total_bids'],1)*100:.1f}%)")
    print()

    # Images
    print("IMAGES:")
    i = stats['images']
    print(f" Lots with Images: {i['lots_with_images']:,}")
    print(f" Total Images: {i['total_images']:,}")
    print(f" Downloaded: {i['downloaded_images']:,} ({i['downloaded_images']/max(i['total_images'],1)*100:.1f}%)")
    print(f" Has Local Path: {i['has_local_path']:,} ({i['has_local_path']/max(i['total_images'],1)*100:.1f}%)")
    print()

    # Data quality issues
    print("=" * 80)
    print("DATA QUALITY ISSUES:")
    print("=" * 80)
    issues = check_data_quality()
    if issues:
        for severity, category, message in issues:
            print(f" [{severity}] {category}: {message}")
    else:
        print(" No issues found!")
    print()


if __name__ == "__main__":
    print_validation_report()
@@ -1,92 +0,0 @@
#!/usr/bin/env python3
"""
Verification script to check image download status and duplicates
Run this after deployment to verify the scraper is working correctly
"""
import sqlite3
import sys
from pathlib import Path

DB_PATH = "/mnt/okcomputer/output/cache.db"
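# Deployment-specific absolute path; adjust if the scraper writes cache.db elsewhere.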

def verify_database():
    """Run verification queries on the database"""

    if not Path(DB_PATH).exists():
        print(f"❌ Database not found: {DB_PATH}")
        sys.exit(1)

    conn = sqlite3.connect(DB_PATH)

    print("=" * 60)
    print("IMAGE DOWNLOAD VERIFICATION")
    print("=" * 60)

    # Check download success rate
    # (SUM(downloaded) counts rows only because downloaded is stored as 0/1)
    print("\n[*] Download Success Rate:")
    cursor = conn.execute("""
        SELECT
            COUNT(*) as total_images,
            SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded,
            SUM(CASE WHEN downloaded = 0 THEN 1 ELSE 0 END) as failed,
            ROUND(100.0 * SUM(downloaded) / COUNT(*), 2) as success_rate
        FROM images
    """)
    row = cursor.fetchone()
    print(f" Total images: {row[0]:,}")
    print(f" Downloaded: {row[1]:,}")
    print(f" Not downloaded: {row[2]:,}")
    print(f" Success rate: {row[3]}%")

    # Check for duplicates
    print("\n[*] Duplicate Check:")
    cursor = conn.execute("""
        SELECT lot_id, url, COUNT(*) as dup_count
        FROM images
        GROUP BY lot_id, url
        HAVING COUNT(*) > 1
        LIMIT 5
    """)
    duplicates = cursor.fetchall()

    if duplicates:
        print(f" [!] Found {len(duplicates)} duplicate entries!")
        for lot_id, url, count in duplicates:
            print(f" {lot_id}: {url[:50]}... (x{count})")
    else:
        print(" [+] No duplicates found!")

    # Verify file system
    print("\n[*] File System Verification:")
    cursor = conn.execute("""
        SELECT COUNT(*)
        FROM images
        WHERE downloaded = 1
        AND local_path IS NOT NULL
        AND local_path != ''
    """)
    files_with_path = cursor.fetchone()[0]
    print(f" Images with local_path: {files_with_path:,}")

    # Sample some downloaded images
    print("\n[*] Sample Downloaded Images:")
    cursor = conn.execute("""
        SELECT lot_id, local_path
        FROM images
        WHERE downloaded = 1
        AND local_path IS NOT NULL
        LIMIT 5
    """)
    samples = cursor.fetchall()
    for lot_id, path in samples:
        exists = "[+]" if Path(path).exists() else "[!]"
        print(f" {exists} {lot_id}: {path}")

    conn.close()

    print("\n" + "=" * 60)
    print("VERIFICATION COMPLETE")
    print("=" * 60)


if __name__ == "__main__":
    verify_database()