diff --git a/check_apollo_state.py b/check_apollo_state.py
deleted file mode 100644
index 287981a..0000000
--- a/check_apollo_state.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/env python3
-"""Check for Apollo state or other embedded data"""
-import asyncio
-import json
-import re
-from playwright.async_api import async_playwright
-
-async def main():
- async with async_playwright() as p:
- browser = await p.chromium.launch(headless=True)
- page = await browser.new_page()
-
- await page.goto("https://www.troostwijkauctions.com/a/woonunits-generatoren-reinigingsmachines-en-zakelijke-goederen-A1-37889", wait_until='networkidle')
- content = await page.content()
-
- # Look for embedded data structures
- patterns = [
- (r'', "NEXT_DATA"),
- (r'window\.__APOLLO_STATE__\s*=\s*({.+?});', "APOLLO_STATE"),
- (r'"lots"\s*:\s*\[(.+?)\]', "LOTS_ARRAY"),
- ]
-
- for pattern, name in patterns:
- match = re.search(pattern, content, re.DOTALL)
- if match:
- print(f"\n{'='*60}")
- print(f"FOUND: {name}")
- print(f"{'='*60}")
- try:
- if name == "LOTS_ARRAY":
- print(f"Preview: {match.group(1)[:500]}")
- else:
- data = json.loads(match.group(1))
- print(json.dumps(data, indent=2)[:2000])
- except:
- print(f"Preview: {match.group(1)[:1000]}")
-
- # Also check for any script tags with "lot" and "bid" and "end"
- print(f"\n{'='*60}")
- print("SEARCHING FOR LOT DATA IN ALL SCRIPTS")
- print(f"{'='*60}")
-
- scripts = re.findall(r'<script[^>]*>(.+?)</script>', content, re.DOTALL)
- for i, script in enumerate(scripts):
- if all(term in script.lower() for term in ['lot', 'bid', 'end']):
- print(f"\nScript #{i} (first 500 chars):")
- print(script[:500])
- if i > 3: # Limit output
- break
-
- await browser.close()
-
-if __name__ == "__main__":
- asyncio.run(main())
diff --git a/check_data.py b/check_data.py
deleted file mode 100644
index c35f646..0000000
--- a/check_data.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/env python3
-"""Check current data quality in cache.db"""
-import sqlite3
-
-conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
-
-print("=" * 60)
-print("CURRENT DATA QUALITY CHECK")
-print("=" * 60)
-
-# Check lots table
-print("\n[*] Sample Lot Data:")
-cursor = conn.execute("""
- SELECT lot_id, current_bid, bid_count, closing_time
- FROM lots
- LIMIT 10
-""")
-for row in cursor:
- print(f" Lot: {row[0]}")
- print(f" Current Bid: {row[1]}")
- print(f" Bid Count: {row[2]}")
- print(f" Closing Time: {row[3]}")
-
-# Check auctions table
-print("\n[*] Sample Auction Data:")
-cursor = conn.execute("""
- SELECT auction_id, title, closing_time, first_lot_closing_time
- FROM auctions
- LIMIT 5
-""")
-for row in cursor:
- print(f" Auction: {row[0]}")
- print(f" Title: {row[1][:50]}...")
- print(f" Closing Time: {row[2] if len(row) > 2 else 'N/A'}")
- print(f" First Lot Closing: {row[3]}")
-
-# Data completeness stats
-print("\n[*] Data Completeness:")
-cursor = conn.execute("""
- SELECT
- COUNT(*) as total,
- SUM(CASE WHEN current_bid IS NULL OR current_bid = '' THEN 1 ELSE 0 END) as missing_current_bid,
- SUM(CASE WHEN closing_time IS NULL OR closing_time = '' THEN 1 ELSE 0 END) as missing_closing_time,
- SUM(CASE WHEN bid_count IS NULL OR bid_count = 0 THEN 1 ELSE 0 END) as zero_bid_count
- FROM lots
-""")
-row = cursor.fetchone()
-print(f" Total lots: {row[0]:,}")
-print(f" Missing current_bid: {row[1]:,} ({100*row[1]/row[0]:.1f}%)")
-print(f" Missing closing_time: {row[2]:,} ({100*row[2]/row[0]:.1f}%)")
-print(f" Zero bid_count: {row[3]:,} ({100*row[3]/row[0]:.1f}%)")
-
-conn.close()
-print("\n" + "=" * 60)
diff --git a/check_graphql_full.py b/check_graphql_full.py
deleted file mode 100644
index 09dc901..0000000
--- a/check_graphql_full.py
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/usr/bin/env python3
-"""Check if GraphQL has viewing/pickup data"""
-import asyncio
-import json
-import sys
-sys.path.insert(0, 'src')
-
-from graphql_client import GRAPHQL_ENDPOINT
-import aiohttp
-
-# Expanded query to check for all available fields
-EXTENDED_QUERY = """
-query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
- lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
- lot {
- id
- displayId
- auctionId
- currentBidAmount { cents currency }
- initialAmount { cents currency }
- nextMinimalBid { cents currency }
- bidsCount
- startDate
- endDate
-
- # Try to find viewing/pickup fields
- viewingDays { startDate endDate city countryCode }
- collectionDays { startDate endDate city countryCode }
- pickupDays { startDate endDate city countryCode }
- }
- auction {
- id
- displayId
- viewingDays { startDate endDate city countryCode }
- collectionDays { startDate endDate city countryCode }
- }
- }
-}
-"""
-
-async def main():
- variables = {
- "lotDisplayId": "A1-28505-5",
- "locale": "nl",
- "platform": "TWK"
- }
-
- payload = {
- "query": EXTENDED_QUERY,
- "variables": variables
- }
-
- try:
- async with aiohttp.ClientSession() as session:
- async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
- if response.status == 200:
- data = await response.json()
- print("Full GraphQL Response:")
- print(json.dumps(data, indent=2))
- else:
- print(f"Error: {response.status}")
- print(await response.text())
- except Exception as e:
- print(f"Exception: {e}")
-
-if __name__ == "__main__":
- asyncio.run(main())
diff --git a/check_lot_auction_link.py b/check_lot_auction_link.py
deleted file mode 100644
index 9268e0d..0000000
--- a/check_lot_auction_link.py
+++ /dev/null
@@ -1,72 +0,0 @@
-"""Check how lots link to auctions"""
-import sys
-import os
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
-
-from cache import CacheManager
-import sqlite3
-import zlib
-import json
-import re
-
-cache = CacheManager()
-conn = sqlite3.connect(cache.db_path)
-cursor = conn.cursor()
-
-# Get a lot page from cache
-cursor.execute("SELECT url, content FROM cache WHERE url LIKE '%/l/%' LIMIT 1")
-url, content_blob = cursor.fetchone()
-content = zlib.decompress(content_blob).decode('utf-8')
-
-# Extract __NEXT_DATA__
-match = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', content, re.DOTALL)
-data = json.loads(match.group(1))
-
-props = data.get('props', {}).get('pageProps', {})
-print("PageProps keys:", list(props.keys()))
-
-lot = props.get('lot', {})
-print("\nLot data:")
-print(f" displayId: {lot.get('displayId')}")
-print(f" auctionId (UUID): {lot.get('auctionId')}")
-
-# Check if auction data is also included
-auction = props.get('auction')
-if auction:
- print("\nAuction data IS included in lot page!")
- print(f" Auction displayId: {auction.get('displayId')}")
- print(f" Auction id (UUID): {auction.get('id')}")
- print(f" Auction name: {auction.get('name', '')[:60]}")
-else:
- print("\nAuction data NOT included in lot page")
- print("Need to look up auction by UUID")
-
-# Check if we can find the auction by UUID
-lot_auction_uuid = lot.get('auctionId')
-if lot_auction_uuid:
- # Try to find auction page with this UUID
- cursor.execute("""
- SELECT url, content FROM cache
- WHERE url LIKE '%/a/%'
- LIMIT 10
- """)
-
- found_match = False
- for auction_url, auction_content_blob in cursor.fetchall():
- auction_content = zlib.decompress(auction_content_blob).decode('utf-8')
- match = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', auction_content, re.DOTALL)
- if match:
- auction_data = json.loads(match.group(1))
- auction_obj = auction_data.get('props', {}).get('pageProps', {}).get('auction', {})
- if auction_obj.get('id') == lot_auction_uuid:
- print(f"\n✓ Found matching auction!")
- print(f" Auction displayId: {auction_obj.get('displayId')}")
- print(f" Auction UUID: {auction_obj.get('id')}")
- print(f" Auction URL: {auction_url}")
- found_match = True
- break
-
- if not found_match:
- print(f"\n✗ Could not find auction with UUID {lot_auction_uuid} in first 10 cached auctions")
-
-conn.close()
diff --git a/check_viewing_data.py b/check_viewing_data.py
deleted file mode 100644
index e8e3405..0000000
--- a/check_viewing_data.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env python3
-"""Check viewing time data"""
-import sqlite3
-
-conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
-
-# Check if viewing_time has data
-cursor = conn.execute("""
- SELECT viewing_time, pickup_date
- FROM lots
- WHERE viewing_time IS NOT NULL AND viewing_time != ''
- LIMIT 5
-""")
-
-rows = cursor.fetchall()
-print("Existing viewing_time data:")
-for r in rows:
- print(f" Viewing: {r[0]}")
- print(f" Pickup: {r[1]}")
- print()
-
-# Check overall completeness
-cursor = conn.execute("""
- SELECT
- COUNT(*) as total,
- SUM(CASE WHEN viewing_time IS NOT NULL AND viewing_time != '' THEN 1 ELSE 0 END) as has_viewing,
- SUM(CASE WHEN pickup_date IS NOT NULL AND pickup_date != '' THEN 1 ELSE 0 END) as has_pickup
- FROM lots
-""")
-row = cursor.fetchone()
-print(f"Completeness:")
-print(f" Total lots: {row[0]}")
-print(f" Has viewing_time: {row[1]} ({100*row[1]/row[0]:.1f}%)")
-print(f" Has pickup_date: {row[2]} ({100*row[2]/row[0]:.1f}%)")
-
-conn.close()
diff --git a/check_viewing_time.py b/check_viewing_time.py
deleted file mode 100644
index 4688b54..0000000
--- a/check_viewing_time.py
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/usr/bin/env python3
-"""Check if viewing time is in the GraphQL response"""
-import asyncio
-import json
-from playwright.async_api import async_playwright
-
-async def main():
- async with async_playwright() as p:
- browser = await p.chromium.launch(headless=True)
- page = await browser.new_page()
-
- responses = []
-
- async def capture_response(response):
- if 'graphql' in response.url and 'LotBiddingData' in await response.text():
- try:
- body = await response.json()
- responses.append(body)
- except:
- pass
-
- page.on('response', capture_response)
-
- await page.goto("https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5", wait_until='networkidle')
- await asyncio.sleep(2)
-
- if responses:
- print("Full LotBiddingData Response:")
- print("="*60)
- print(json.dumps(responses[0], indent=2))
-
- await browser.close()
-
-if __name__ == "__main__":
- asyncio.run(main())
diff --git a/debug_lot_structure.py b/debug_lot_structure.py
deleted file mode 100644
index 8a8148d..0000000
--- a/debug_lot_structure.py
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/usr/bin/env python3
-"""Debug lot data structure from cached page"""
-import sqlite3
-import zlib
-import json
-import re
-import sys
-sys.path.insert(0, 'src')
-
-from parse import DataParser
-
-conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
-
-# Get a recent lot page
-cursor = conn.execute("""
- SELECT url, content
- FROM cache
- WHERE url LIKE '%/l/%'
- ORDER BY timestamp DESC
- LIMIT 1
-""")
-
-row = cursor.fetchone()
-if not row:
- print("No lot pages found")
- exit(1)
-
-url, content_blob = row
-content = zlib.decompress(content_blob).decode('utf-8')
-
-parser = DataParser()
-result = parser.parse_page(content, url)
-
-if result:
- print(f"URL: {url}")
- print(f"\nParsed Data:")
- print(f" type: {result.get('type')}")
- print(f" lot_id: {result.get('lot_id')}")
- print(f" title: {result.get('title', '')[:50]}...")
- print(f" current_bid: {result.get('current_bid')}")
- print(f" bid_count: {result.get('bid_count')}")
- print(f" closing_time: {result.get('closing_time')}")
- print(f" location: {result.get('location')}")
-
-# Also dump the raw JSON
-match = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', content, re.DOTALL)
-if match:
- data = json.loads(match.group(1))
- page_props = data.get('props', {}).get('pageProps', {})
-
- if 'lot' in page_props:
- lot = page_props['lot']
- print(f"\nRAW __NEXT_DATA__.lot keys: {list(lot.keys())}")
- print(f"\nSearching for bid/timing fields...")
-
- # Deep search for these fields
- def deep_search(obj, prefix=""):
- if isinstance(obj, dict):
- for k, v in obj.items():
- if any(term in k.lower() for term in ['bid', 'end', 'close', 'date', 'time']):
- print(f" {prefix}{k}: {v}")
- if isinstance(v, (dict, list)):
- deep_search(v, prefix + k + ".")
- elif isinstance(obj, list) and len(obj) > 0:
- deep_search(obj[0], prefix + "[0].")
-
- deep_search(lot)
-
-conn.close()
diff --git a/deep_inspect_lot.py b/deep_inspect_lot.py
deleted file mode 100644
index 64bd218..0000000
--- a/deep_inspect_lot.py
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/env python3
-"""Deep inspect lot JSON for viewing/pickup data"""
-import sqlite3
-import zlib
-import json
-import re
-
-conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
-
-cursor = conn.execute("""
- SELECT url, content
- FROM cache
- WHERE url LIKE '%/l/%'
- ORDER BY timestamp DESC
- LIMIT 1
-""")
-
-row = cursor.fetchone()
-url, content_blob = row
-content = zlib.decompress(content_blob).decode('utf-8')
-
-match = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', content, re.DOTALL)
-data = json.loads(match.group(1))
-lot = data.get('props', {}).get('pageProps', {}).get('lot', {})
-
-print(f"Inspecting: {url}\n")
-
-# Check onboarding
-if 'onboarding' in lot:
- print("ONBOARDING:")
- print(json.dumps(lot['onboarding'], indent=2))
- print()
-
-# Check attributes
-if 'attributes' in lot:
- print("ATTRIBUTES:")
- attrs = lot['attributes']
- print(json.dumps(attrs[:3] if isinstance(attrs, list) else attrs, indent=2))
- print()
-
-# Check condition
-if 'condition' in lot:
- print("CONDITION:")
- print(json.dumps(lot['condition'], indent=2))
- print()
-
-# Check appearance
-if 'appearance' in lot:
- print("APPEARANCE:")
- print(json.dumps(lot['appearance'], indent=2))
- print()
-
-# Check location
-if 'location' in lot:
- print("LOCATION:")
- print(json.dumps(lot['location'], indent=2))
- print()
-
-# Check for any field with "view", "pick", "collect", "date", "time"
-print("\nFIELDS WITH VIEWING/PICKUP/TIME:")
-for key in lot.keys():
- if any(term in key.lower() for term in ['view', 'pick', 'collect', 'date', 'time', 'day']):
- print(f" {key}: {lot[key]}")
-
-conn.close()
diff --git a/enrich_existing_lots.py b/enrich_existing_lots.py
deleted file mode 100644
index a43bff6..0000000
--- a/enrich_existing_lots.py
+++ /dev/null
@@ -1,120 +0,0 @@
-"""
-Enrich existing lots with new intelligence fields:
-- followers_count
-- estimated_min_price / estimated_max_price
-- lot_condition
-- appearance
-
-Reads from cached lot pages __NEXT_DATA__ JSON
-"""
-import sys
-import os
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
-
-import asyncio
-from cache import CacheManager
-import sqlite3
-import zlib
-import json
-import re
-from graphql_client import fetch_lot_bidding_data, format_bid_data
-
-async def enrich_existing_lots():
- """Enrich existing lots with new fields from GraphQL API"""
- cache = CacheManager()
- conn = sqlite3.connect(cache.db_path)
- cursor = conn.cursor()
-
- # Get all lot IDs
- cursor.execute("SELECT lot_id FROM lots")
- lot_ids = [r[0] for r in cursor.fetchall()]
-
- print(f"Found {len(lot_ids)} lots to enrich")
- print("Fetching enrichment data from GraphQL API...")
- print("This will take ~{:.1f} minutes (0.5s rate limit)".format(len(lot_ids) * 0.5 / 60))
-
- enriched = 0
- failed = 0
- no_data = 0
-
- for i, lot_id in enumerate(lot_ids):
- if (i + 1) % 10 == 0:
- print(f"Progress: {i+1}/{len(lot_ids)} ({enriched} enriched, {no_data} no data, {failed} failed)", end='\r')
-
- try:
- # Fetch from GraphQL API
- bidding_data = await fetch_lot_bidding_data(lot_id)
-
- if bidding_data:
- formatted_data = format_bid_data(bidding_data)
-
- # Update lot with new fields
- cursor.execute("""
- UPDATE lots
- SET followers_count = ?,
- estimated_min_price = ?,
- estimated_max_price = ?,
- lot_condition = ?,
- appearance = ?
- WHERE lot_id = ?
- """, (
- formatted_data.get('followers_count', 0),
- formatted_data.get('estimated_min_price'),
- formatted_data.get('estimated_max_price'),
- formatted_data.get('lot_condition', ''),
- formatted_data.get('appearance', ''),
- lot_id
- ))
-
- enriched += 1
-
- # Commit every 50 lots
- if enriched % 50 == 0:
- conn.commit()
-
- else:
- no_data += 1
-
- # Rate limit
- await asyncio.sleep(0.5)
-
- except Exception as e:
- failed += 1
- continue
-
- conn.commit()
-
- print(f"\n\nComplete!")
- print(f"Total lots: {len(lot_ids)}")
- print(f"Enriched: {enriched}")
- print(f"No data: {no_data}")
- print(f"Failed: {failed}")
-
- # Show statistics
- cursor.execute("SELECT COUNT(*) FROM lots WHERE followers_count > 0")
- with_followers = cursor.fetchone()[0]
-
- cursor.execute("SELECT COUNT(*) FROM lots WHERE estimated_min_price IS NOT NULL")
- with_estimates = cursor.fetchone()[0]
-
- cursor.execute("SELECT COUNT(*) FROM lots WHERE lot_condition IS NOT NULL AND lot_condition != ''")
- with_condition = cursor.fetchone()[0]
-
- print(f"\nEnrichment statistics:")
- print(f" Lots with followers_count: {with_followers} ({with_followers/len(lot_ids)*100:.1f}%)")
- print(f" Lots with estimated prices: {with_estimates} ({with_estimates/len(lot_ids)*100:.1f}%)")
- print(f" Lots with condition: {with_condition} ({with_condition/len(lot_ids)*100:.1f}%)")
-
- conn.close()
-
-if __name__ == "__main__":
- print("WARNING: This will make ~16,800 API calls at 0.5s intervals (~2.3 hours)")
- print("Press Ctrl+C to cancel, or wait 5 seconds to continue...")
- import time
- try:
- time.sleep(5)
- except KeyboardInterrupt:
- print("\nCancelled")
- sys.exit(0)
-
- asyncio.run(enrich_existing_lots())
diff --git a/explore_api_fields.py b/explore_api_fields.py
deleted file mode 100644
index db34e17..0000000
--- a/explore_api_fields.py
+++ /dev/null
@@ -1,370 +0,0 @@
-"""
-Explore API responses to identify additional fields available for intelligence.
-Tests GraphQL and REST API responses for field coverage.
-"""
-import asyncio
-import sys
-import os
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
-
-import json
-import aiohttp
-from graphql_client import fetch_lot_bidding_data, GRAPHQL_ENDPOINT
-from bid_history_client import fetch_bid_history, BID_HISTORY_ENDPOINT
-
-async def explore_graphql_schema():
- """Query GraphQL schema to see all available fields"""
- print("=" * 80)
- print("GRAPHQL SCHEMA EXPLORATION")
- print("=" * 80)
-
- # Introspection query for LotDetails type
- introspection_query = """
- query IntrospectionQuery {
- __type(name: "LotDetails") {
- name
- fields {
- name
- type {
- name
- kind
- ofType {
- name
- kind
- }
- }
- }
- }
- }
- """
-
- async with aiohttp.ClientSession() as session:
- try:
- async with session.post(
- GRAPHQL_ENDPOINT,
- json={
- "query": introspection_query,
- "variables": {}
- },
- headers={"Content-Type": "application/json"}
- ) as response:
- if response.status == 200:
- data = await response.json()
- lot_type = data.get('data', {}).get('__type')
- if lot_type:
- print("\nLotDetails available fields:")
- for field in lot_type.get('fields', []):
- field_name = field['name']
- field_type = field['type'].get('name') or field['type'].get('ofType', {}).get('name', 'Complex')
- print(f" - {field_name}: {field_type}")
- print()
- else:
- print(f"Failed with status {response.status}")
- except Exception as e:
- print(f"Error: {e}")
-
- # Also try Lot type
- introspection_query_lot = """
- query IntrospectionQuery {
- __type(name: "Lot") {
- name
- fields {
- name
- type {
- name
- kind
- ofType {
- name
- kind
- }
- }
- }
- }
- }
- """
-
- async with aiohttp.ClientSession() as session:
- try:
- async with session.post(
- GRAPHQL_ENDPOINT,
- json={
- "query": introspection_query_lot,
- "variables": {}
- },
- headers={"Content-Type": "application/json"}
- ) as response:
- if response.status == 200:
- data = await response.json()
- lot_type = data.get('data', {}).get('__type')
- if lot_type:
- print("\nLot type available fields:")
- for field in lot_type.get('fields', []):
- field_name = field['name']
- field_type = field['type'].get('name') or field['type'].get('ofType', {}).get('name', 'Complex')
- print(f" - {field_name}: {field_type}")
- print()
- except Exception as e:
- print(f"Error: {e}")
-
-async def test_graphql_full_query():
- """Test a comprehensive GraphQL query to see all returned data"""
- print("=" * 80)
- print("GRAPHQL FULL QUERY TEST")
- print("=" * 80)
-
- # Test with a real lot ID
- lot_id = "A1-34731-107" # Example from database
-
- comprehensive_query = """
- query ComprehensiveLotQuery($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
- lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
- lot {
- id
- displayId
- title
- description
- currentBidAmount { cents currency }
- initialAmount { cents currency }
- nextMinimalBid { cents currency }
- bidsCount
- startDate
- endDate
- minimumBidAmountMet
- lotNumber
- auctionId
- lotState
- location {
- city
- countryCode
- }
- viewingDays {
- city
- countryCode
- addressLine1
- addressLine2
- endDate
- startDate
- }
- collectionDays {
- city
- countryCode
- addressLine1
- addressLine2
- endDate
- startDate
- }
- images {
- url
- thumbnailUrl
- }
- attributes {
- name
- value
- }
- }
- }
- }
- """
-
- async with aiohttp.ClientSession() as session:
- try:
- async with session.post(
- GRAPHQL_ENDPOINT,
- json={
- "query": comprehensive_query,
- "variables": {
- "lotDisplayId": lot_id,
- "locale": "nl_NL",
- "platform": "WEB"
- }
- },
- headers={"Content-Type": "application/json"}
- ) as response:
- if response.status == 200:
- data = await response.json()
- print(f"\nFull GraphQL response for {lot_id}:")
- print(json.dumps(data, indent=2))
- print()
- else:
- print(f"Failed with status {response.status}")
- print(await response.text())
- except Exception as e:
- print(f"Error: {e}")
-
-async def test_bid_history_response():
- """Test bid history API to see all returned fields"""
- print("=" * 80)
- print("BID HISTORY API TEST")
- print("=" * 80)
-
- # Get a lot with bids from database
- import sqlite3
- from cache import CacheManager
-
- cache = CacheManager()
- conn = sqlite3.connect(cache.db_path)
- cursor = conn.cursor()
-
- # Find a lot with bids
- cursor.execute("""
- SELECT lot_id, url FROM lots
- WHERE bid_count > 0
- ORDER BY bid_count DESC
- LIMIT 1
- """)
- result = cursor.fetchone()
-
- if result:
- lot_id, url = result
- # Extract UUID from URL
- import re
- match = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', content, re.DOTALL)
-
- if not match:
- continue
-
- data = json.loads(match.group(1))
- lot = data.get('props', {}).get('pageProps', {}).get('lot', {})
-
- if not lot:
- continue
-
- lot_display_id = lot.get('displayId')
- lot_uuid = lot.get('id')
-
- if lot_display_id and lot_uuid:
- lot_id_to_uuid[lot_display_id] = lot_uuid
-
- except:
- continue
-
- print(f"\n\nBuilt UUID mapping for {len(lot_id_to_uuid)} lots")
-
- # Fetch bid history for each lot
- print("\nFetching bid history from API...")
-
- fetched = 0
- failed = 0
- no_uuid = 0
-
- for lot_id, bid_count in lots_to_fetch:
- lot_uuid = lot_id_to_uuid.get(lot_id)
-
- if not lot_uuid:
- no_uuid += 1
- continue
-
- try:
- print(f"\nFetching bid history for {lot_id} ({bid_count} bids)...")
- bid_history = await fetch_bid_history(lot_uuid)
-
- if bid_history:
- bid_data = parse_bid_history(bid_history, lot_id)
-
- # Update lots table with bid intelligence
- cursor.execute("""
- UPDATE lots
- SET first_bid_time = ?,
- last_bid_time = ?,
- bid_velocity = ?
- WHERE lot_id = ?
- """, (
- bid_data['first_bid_time'],
- bid_data['last_bid_time'],
- bid_data['bid_velocity'],
- lot_id
- ))
-
- # Save bid history records
- cache.save_bid_history(lot_id, bid_data['bid_records'])
-
- fetched += 1
- print(f" Saved {len(bid_data['bid_records'])} bid records")
- print(f" Bid velocity: {bid_data['bid_velocity']:.2f} bids/hour")
-
- # Commit every 10 lots
- if fetched % 10 == 0:
- conn.commit()
- print(f"\nProgress: {fetched}/{len(lots_to_fetch)} lots processed...")
-
- # Rate limit to be respectful
- await asyncio.sleep(0.5)
-
- else:
- failed += 1
-
- except Exception as e:
- print(f" Error fetching bid history for {lot_id}: {e}")
- failed += 1
- continue
-
- conn.commit()
-
- print(f"\n\nComplete!")
- print(f"Total lots to process: {len(lots_to_fetch)}")
- print(f"Successfully fetched: {fetched}")
- print(f"Failed: {failed}")
- print(f"No UUID found: {no_uuid}")
-
- # Verify fix
- cursor.execute("""
- SELECT COUNT(DISTINCT lot_id) FROM bid_history
- """)
- lots_with_history = cursor.fetchone()[0]
-
- cursor.execute("""
- SELECT COUNT(*) FROM lots WHERE bid_count > 0
- """)
- lots_with_bids = cursor.fetchone()[0]
-
- print(f"\nLots with bids: {lots_with_bids}")
- print(f"Lots with bid history: {lots_with_history}")
- print(f"Coverage: {lots_with_history/lots_with_bids*100:.1f}%")
-
- conn.close()
-
-if __name__ == "__main__":
- asyncio.run(fetch_missing_bid_history())
diff --git a/find_api_endpoint.py b/find_api_endpoint.py
deleted file mode 100644
index 30f8e9e..0000000
--- a/find_api_endpoint.py
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/usr/bin/env python3
-"""Find the API endpoint by monitoring network requests"""
-import asyncio
-import json
-from playwright.async_api import async_playwright
-
-async def main():
- async with async_playwright() as p:
- browser = await p.chromium.launch(headless=True)
- page = await browser.new_page()
-
- requests = []
- responses = []
-
- async def log_request(request):
- if any(term in request.url for term in ['api', 'graphql', 'lot', 'auction', 'bid']):
- requests.append({
- 'url': request.url,
- 'method': request.method,
- 'headers': dict(request.headers),
- 'post_data': request.post_data
- })
-
- async def log_response(response):
- if any(term in response.url for term in ['api', 'graphql', 'lot', 'auction', 'bid']):
- try:
- body = await response.text()
- responses.append({
- 'url': response.url,
- 'status': response.status,
- 'body': body[:1000]
- })
- except:
- pass
-
- page.on('request', log_request)
- page.on('response', log_response)
-
- print("Loading lot page...")
- await page.goto("https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102", wait_until='networkidle')
-
- # Wait for dynamic content
- await asyncio.sleep(3)
-
- print(f"\nFound {len(requests)} relevant requests")
- print(f"Found {len(responses)} relevant responses\n")
-
- for req in requests[:10]:
- print(f"REQUEST: {req['method']} {req['url']}")
- if req['post_data']:
- print(f" POST DATA: {req['post_data'][:200]}")
-
- print("\n" + "="*60 + "\n")
-
- for resp in responses[:10]:
- print(f"RESPONSE: {resp['url']}")
- print(f" Status: {resp['status']}")
- print(f" Body: {resp['body'][:300]}")
- print()
-
- await browser.close()
-
-if __name__ == "__main__":
- asyncio.run(main())
diff --git a/find_api_valid_lot.py b/find_api_valid_lot.py
deleted file mode 100644
index 25574b5..0000000
--- a/find_api_valid_lot.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/usr/bin/env python3
-"""Find API endpoint using a valid lot from database"""
-import asyncio
-import sqlite3
-from playwright.async_api import async_playwright
-
-# Get a valid lot URL
-conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
-cursor = conn.execute("SELECT url FROM lots WHERE url LIKE '%/l/%' LIMIT 5")
-lot_urls = [row[0] for row in cursor.fetchall()]
-conn.close()
-
-async def main():
- async with async_playwright() as p:
- browser = await p.chromium.launch(headless=True)
- page = await browser.new_page()
-
- api_calls = []
-
- async def log_response(response):
- url = response.url
- # Look for API calls
- if ('api' in url.lower() or 'graphql' in url.lower() or
- '/v2/' in url or '/v3/' in url or '/v4/' in url or
- 'query' in url.lower() or 'mutation' in url.lower()):
- try:
- body = await response.text()
- api_calls.append({
- 'url': url,
- 'status': response.status,
- 'body': body
- })
- print(f"\nAPI: {url}")
- except:
- pass
-
- page.on('response', log_response)
-
- for lot_url in lot_urls[:2]:
- print(f"\n{'='*60}")
- print(f"Loading: {lot_url}")
- print(f"{'='*60}")
-
- try:
- await page.goto(lot_url, wait_until='networkidle', timeout=30000)
- await asyncio.sleep(2)
-
- # Check if page has bid info
- content = await page.content()
- if 'currentBid' in content or 'Current bid' in content or 'Huidig bod' in content:
- print("[+] Page contains bid information")
- break
- except Exception as e:
- print(f"[!] Error: {e}")
- continue
-
- print(f"\n\n{'='*60}")
- print(f"CAPTURED {len(api_calls)} API CALLS")
- print(f"{'='*60}")
-
- for call in api_calls:
- print(f"\n{call['url']}")
- print(f"Status: {call['status']}")
- if 'json' in call['body'][:100].lower() or call['body'].startswith('{'):
- print(f"Body (first 500 chars): {call['body'][:500]}")
-
- await browser.close()
-
-if __name__ == "__main__":
- asyncio.run(main())
diff --git a/find_auction_with_lots.py b/find_auction_with_lots.py
deleted file mode 100644
index 4bed970..0000000
--- a/find_auction_with_lots.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/env python3
-"""Find an auction page with lots data"""
-import sqlite3
-import zlib
-import json
-import re
-
-conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
-
-cursor = conn.execute("""
- SELECT url, content
- FROM cache
- WHERE url LIKE '%/a/%'
-""")
-
-for row in cursor:
- url, content_blob = row
- content = zlib.decompress(content_blob).decode('utf-8')
-
- match = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', content, re.DOTALL)
- if not match:
- continue
-
- data = json.loads(match.group(1))
- page_props = data.get('props', {}).get('pageProps', {})
-
- if 'auction' in page_props:
- auction = page_props['auction']
- lots = auction.get('lots', [])
-
- if lots and len(lots) > 0:
- print(f"Found auction with {len(lots)} lots: {url}\n")
-
- lot = lots[0]
- print(f"SAMPLE LOT FROM AUCTION.LOTS[]:")
- print(f" displayId: {lot.get('displayId')}")
- print(f" title: {lot.get('title', '')[:50]}...")
- print(f" urlSlug: {lot.get('urlSlug')}")
- print(f"\nBIDDING FIELDS:")
- for key in ['currentBid', 'highestBid', 'startingBid', 'minimumBidAmount', 'bidCount', 'numberOfBids']:
- print(f" {key}: {lot.get(key)}")
- print(f"\nTIMING FIELDS:")
- for key in ['endDate', 'startDate', 'closingTime']:
- print(f" {key}: {lot.get(key)}")
- print(f"\nALL KEYS: {list(lot.keys())[:30]}...")
- break
-
-conn.close()
diff --git a/fix_auctions_table.py b/fix_auctions_table.py
deleted file mode 100644
index 4ca5154..0000000
--- a/fix_auctions_table.py
+++ /dev/null
@@ -1,155 +0,0 @@
-"""
-Fix auctions table by replacing with correct data from cached auction pages.
-The auctions table currently has wrong auction_ids (numeric instead of displayId).
-"""
-import sys
-import os
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
-
-from cache import CacheManager
-import sqlite3
-import zlib
-import json
-import re
-from datetime import datetime
-
-def fix_auctions_table():
- """Rebuild auctions table from cached auction pages"""
- cache = CacheManager()
- conn = sqlite3.connect(cache.db_path)
- cursor = conn.cursor()
-
- # Clear existing auctions table
- print("Clearing auctions table...")
- cursor.execute("DELETE FROM auctions")
- conn.commit()
-
- # Get all auction pages from cache
- cursor.execute("""
- SELECT url, content
- FROM cache
- WHERE url LIKE '%/a/%'
- """)
-
- auction_pages = cursor.fetchall()
- print(f"Found {len(auction_pages)} auction pages in cache")
-
- total = 0
- inserted = 0
- errors = 0
-
- print("Extracting auction data from cached pages...")
-
- for url, content_blob in auction_pages:
- total += 1
-
- if total % 10 == 0:
- print(f"Processed {total}/{len(auction_pages)}...", end='\r')
-
- try:
- # Decompress and parse __NEXT_DATA__
- content = zlib.decompress(content_blob).decode('utf-8')
- match = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', content, re.DOTALL)
-
- if not match:
- errors += 1
- continue
-
- data = json.loads(match.group(1))
- page_props = data.get('props', {}).get('pageProps', {})
- auction = page_props.get('auction', {})
-
- if not auction:
- errors += 1
- continue
-
- # Extract auction data
- auction_id = auction.get('displayId')
- if not auction_id:
- errors += 1
- continue
-
- title = auction.get('name', '')
-
- # Get location
- location = ''
- viewing_days = auction.get('viewingDays', [])
- if viewing_days and isinstance(viewing_days, list) and len(viewing_days) > 0:
- loc = viewing_days[0]
- city = loc.get('city', '')
- country = loc.get('countryCode', '').upper()
- location = f"{city}, {country}" if city and country else (city or country)
-
- lots_count = auction.get('lotCount', 0)
-
- # Get first lot closing time
- first_lot_closing = ''
- min_end_date = auction.get('minEndDate', '')
- if min_end_date:
- # Format timestamp
- try:
- dt = datetime.fromisoformat(min_end_date.replace('Z', '+00:00'))
- first_lot_closing = dt.strftime('%Y-%m-%d %H:%M:%S')
- except:
- first_lot_closing = min_end_date
-
- scraped_at = datetime.now().isoformat()
-
- # Insert into auctions table
- cursor.execute("""
- INSERT OR REPLACE INTO auctions
- (auction_id, url, title, location, lots_count, first_lot_closing_time, scraped_at)
- VALUES (?, ?, ?, ?, ?, ?, ?)
- """, (auction_id, url, title, location, lots_count, first_lot_closing, scraped_at))
-
- inserted += 1
-
- except Exception as e:
- errors += 1
- continue
-
- conn.commit()
-
- print(f"\n\nComplete!")
- print(f"Total auction pages processed: {total}")
- print(f"Auctions inserted: {inserted}")
- print(f"Errors: {errors}")
-
- # Verify fix
- cursor.execute("SELECT COUNT(*) FROM auctions")
- total_auctions = cursor.fetchone()[0]
- print(f"\nTotal auctions in table: {total_auctions}")
-
- cursor.execute("""
- SELECT COUNT(*) FROM lots
- WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
- AND auction_id != ''
- """)
- orphaned = cursor.fetchone()[0]
-
- print(f"Orphaned lots remaining: {orphaned}")
-
- if orphaned == 0:
- print("\nSUCCESS! All lots now have matching auctions!")
- else:
- # Show sample of remaining orphans
- cursor.execute("""
- SELECT lot_id, auction_id FROM lots
- WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
- AND auction_id != ''
- LIMIT 5
- """)
- print("\nSample remaining orphaned lots:")
- for lot_id, auction_id in cursor.fetchall():
- print(f" {lot_id} -> auction_id: {auction_id}")
-
- # Show what auction_ids we do have
- cursor.execute("SELECT auction_id FROM auctions LIMIT 10")
- print("\nSample auction_ids in auctions table:")
- for row in cursor.fetchall():
- print(f" {row[0]}")
-
- conn.close()
-
-if __name__ == "__main__":
- fix_auctions_table()
diff --git a/fix_orphaned_lots.py b/fix_orphaned_lots.py
deleted file mode 100644
index c38b969..0000000
--- a/fix_orphaned_lots.py
+++ /dev/null
@@ -1,136 +0,0 @@
-"""
-Fix orphaned lots by updating auction_id from UUID to displayId.
-This migration reads cached lot pages and extracts the correct auction displayId.
-"""
-import sys
-import os
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
-
-from cache import CacheManager
-import sqlite3
-import zlib
-import json
-import re
-
-def fix_orphaned_lots():
- """Update lot auction_id from UUID to auction displayId"""
- cache = CacheManager()
- conn = sqlite3.connect(cache.db_path)
- cursor = conn.cursor()
-
- # Get all lots that need fixing (have UUID auction_id)
- cursor.execute("""
- SELECT l.lot_id, l.auction_id
- FROM lots l
- WHERE length(l.auction_id) > 20 -- UUID is longer than displayId like "A1-12345"
- """)
-
- lots_to_fix = {lot_id: auction_uuid for lot_id, auction_uuid in cursor.fetchall()}
- print(f"Found {len(lots_to_fix)} lots with UUID auction_id that need fixing")
-
- if not lots_to_fix:
- print("No lots to fix!")
- conn.close()
- return
-
- # Build mapping from lot displayId to auction displayId from cached pages
- print("Building lot displayId -> auction displayId mapping from cache...")
-
- cursor.execute("""
- SELECT url, content
- FROM cache
- WHERE url LIKE '%/l/%'
- """)
-
- lot_to_auction_map = {}
- total = 0
- errors = 0
-
- for url, content_blob in cursor:
- total += 1
-
- if total % 100 == 0:
- print(f"Processing cached pages... {total}", end='\r')
-
- try:
- # Decompress and parse __NEXT_DATA__
- content = zlib.decompress(content_blob).decode('utf-8')
- match = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', content, re.DOTALL)
-
- if not match:
- continue
-
- data = json.loads(match.group(1))
- page_props = data.get('props', {}).get('pageProps', {})
-
- lot = page_props.get('lot', {})
- auction = page_props.get('auction', {})
-
- if not lot or not auction:
- continue
-
- lot_display_id = lot.get('displayId')
- auction_display_id = auction.get('displayId')
-
- if lot_display_id and auction_display_id:
- lot_to_auction_map[lot_display_id] = auction_display_id
-
- except Exception as e:
- errors += 1
- continue
-
- print(f"\n\nBuilt mapping for {len(lot_to_auction_map)} lots")
- print(f"Errors while parsing: {errors}")
-
- # Now update the lots table
- print("\nUpdating lots table...")
- updated = 0
- not_found = 0
-
- for lot_id, old_auction_uuid in lots_to_fix.items():
- if lot_id in lot_to_auction_map:
- new_auction_id = lot_to_auction_map[lot_id]
- cursor.execute("""
- UPDATE lots
- SET auction_id = ?
- WHERE lot_id = ?
- """, (new_auction_id, lot_id))
- updated += 1
- else:
- not_found += 1
-
- if (updated + not_found) % 100 == 0:
- print(f"Updated: {updated}, not found: {not_found}", end='\r')
-
- conn.commit()
-
- print(f"\n\nComplete!")
- print(f"Total cached pages processed: {total}")
- print(f"Lots updated with auction displayId: {updated}")
- print(f"Lots not found in cache: {not_found}")
- print(f"Parse errors: {errors}")
-
- # Verify fix
- cursor.execute("""
- SELECT COUNT(*) FROM lots
- WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
- """)
- orphaned = cursor.fetchone()[0]
-
- print(f"\nOrphaned lots remaining: {orphaned}")
-
- if orphaned > 0:
- # Show sample of remaining orphans
- cursor.execute("""
- SELECT lot_id, auction_id FROM lots
- WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
- LIMIT 5
- """)
- print("\nSample remaining orphaned lots:")
- for lot_id, auction_id in cursor.fetchall():
- print(f" {lot_id} -> auction_id: {auction_id}")
-
- conn.close()
-
-if __name__ == "__main__":
- fix_orphaned_lots()
diff --git a/inspect_cached_page.py b/inspect_cached_page.py
deleted file mode 100644
index ac67672..0000000
--- a/inspect_cached_page.py
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/usr/bin/env python3
-"""Extract and inspect __NEXT_DATA__ from a cached lot page"""
-import sqlite3
-import zlib
-import json
-import re
-
-conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
-
-# Get a cached auction page
-cursor = conn.execute("""
- SELECT url, content
- FROM cache
- WHERE url LIKE '%/a/%'
- LIMIT 1
-""")
-
-row = cursor.fetchone()
-if not row:
- print("No cached lot pages found")
- exit(1)
-
-url, content_blob = row
-print(f"Inspecting: {url}\n")
-
-# Decompress
-content = zlib.decompress(content_blob).decode('utf-8')
-
-# Extract __NEXT_DATA__
-match = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', content, re.DOTALL)
-if not match:
- print("No __NEXT_DATA__ found")
- exit(1)
-
-data = json.loads(match.group(1))
-page_props = data.get('props', {}).get('pageProps', {})
-
-if 'auction' in page_props:
- auction = page_props['auction']
- print("AUCTION DATA STRUCTURE:")
- print("=" * 60)
- print(f"displayId: {auction.get('displayId')}")
- print(f"name: {auction.get('name', '')[:50]}...")
- print(f"lots count: {len(auction.get('lots', []))}")
-
- if auction.get('lots'):
- lot = auction['lots'][0]
- print(f"\nFIRST LOT STRUCTURE:")
- print(f" displayId: {lot.get('displayId')}")
- print(f" title: {lot.get('title', '')[:50]}...")
- print(f"\n BIDDING:")
- print(f" currentBid: {lot.get('currentBid')}")
- print(f" highestBid: {lot.get('highestBid')}")
- print(f" startingBid: {lot.get('startingBid')}")
- print(f" minimumBidAmount: {lot.get('minimumBidAmount')}")
- print(f" bidCount: {lot.get('bidCount')}")
- print(f" numberOfBids: {lot.get('numberOfBids')}")
- print(f" TIMING:")
- print(f" endDate: {lot.get('endDate')}")
- print(f" startDate: {lot.get('startDate')}")
- print(f" closingTime: {lot.get('closingTime')}")
- print(f" ALL KEYS: {list(lot.keys())}")
-
- print(f"\nAUCTION TIMING:")
- print(f" minEndDate: {auction.get('minEndDate')}")
- print(f" maxEndDate: {auction.get('maxEndDate')}")
- print(f" ALL KEYS: {list(auction.keys())}")
-
-conn.close()
diff --git a/inspect_lot_html.py b/inspect_lot_html.py
deleted file mode 100644
index 3aa8f05..0000000
--- a/inspect_lot_html.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env python3
-"""Inspect a lot page HTML to find viewing_time and pickup_date"""
-import asyncio
-from playwright.async_api import async_playwright
-
-async def main():
- async with async_playwright() as p:
- browser = await p.chromium.launch(headless=True)
- page = await browser.new_page()
-
- # Use the known lot
- await page.goto("https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102", wait_until='networkidle')
- content = await page.content()
-
- print("Searching for patterns...")
- print("="*60)
-
- # Search for viewing time patterns
- import re
- patterns = {
- 'Bezichtigingen': r'Bezichtigingen.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
- 'viewing': r'(?i)viewing.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
- 'Ophalen': r'Ophalen.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
- 'pickup': r'(?i)pickup.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
- 'Status': r'Status\s+([^<]+)',
- }
-
- for name, pattern in patterns.items():
- matches = re.findall(pattern, content, re.DOTALL | re.MULTILINE)
- if matches:
- print(f"\n{name}:")
- for match in matches[:3]:
- print(f" {match[:200]}")
-
- # Also look for structured data
- print("\n\nSearching for 'Bezichtigingen' section:")
- bez_match = re.search(r'Bezichtigingen.*?<.*?>(.*?)</div>', content, re.DOTALL)
- if bez_match:
- print(bez_match.group(0)[:500])
-
- print("\n\nSearching for 'Ophalen' section:")
- oph_match = re.search(r'Ophalen.*?<.*?>(.*?)</div>', content, re.DOTALL)
- if oph_match:
- print(oph_match.group(0)[:500])
-
- await browser.close()
-
-if __name__ == "__main__":
- asyncio.run(main())
diff --git a/intercept_api.py b/intercept_api.py
deleted file mode 100644
index 43667e7..0000000
--- a/intercept_api.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env python3
-"""Intercept API calls to find where lot data comes from"""
-import asyncio
-import json
-from playwright.async_api import async_playwright
-
-async def main():
- async with async_playwright() as p:
- browser = await p.chromium.launch(headless=False)
- page = await browser.new_page()
-
- # Track API calls
- api_calls = []
-
- async def handle_response(response):
- if 'api' in response.url.lower() or 'graphql' in response.url.lower():
- try:
- body = await response.json()
- api_calls.append({
- 'url': response.url,
- 'status': response.status,
- 'body': body
- })
- print(f"\nAPI CALL: {response.url}")
- print(f"Status: {response.status}")
- if 'lot' in response.url.lower() or 'auction' in response.url.lower():
- print(f"Body preview: {json.dumps(body, indent=2)[:500]}")
- except:
- pass
-
- page.on('response', handle_response)
-
- # Visit auction page
- print("Loading auction page...")
- await page.goto("https://www.troostwijkauctions.com/a/woonunits-generatoren-reinigingsmachines-en-zakelijke-goederen-A1-37889", wait_until='networkidle')
-
- # Wait a bit for lazy loading
- await asyncio.sleep(5)
-
- print(f"\n\nCaptured {len(api_calls)} API calls")
-
- await browser.close()
-
-if __name__ == "__main__":
- asyncio.run(main())
diff --git a/migrate_existing_data.py b/migrate_existing_data.py
deleted file mode 100644
index e390a24..0000000
--- a/migrate_existing_data.py
+++ /dev/null
@@ -1,148 +0,0 @@
-#!/usr/bin/env python3
-"""
-Migrate existing lot data to extract missing enriched fields
-"""
-import sqlite3
-import json
-import re
-from datetime import datetime
-import sys
-sys.path.insert(0, 'src')
-
-from graphql_client import extract_enriched_attributes, extract_attributes_from_lot_json
-
-DB_PATH = "/mnt/okcomputer/output/cache.db"
-
-def migrate_lot_attributes():
- """Extract attributes from cached lot pages"""
- print("="*60)
- print("MIGRATING EXISTING LOT DATA")
- print("="*60)
-
- conn = sqlite3.connect(DB_PATH)
-
- # Get cached lot pages
- cursor = conn.execute("""
- SELECT url, content, timestamp
- FROM cache
- WHERE url LIKE '%/l/%'
- ORDER BY timestamp DESC
- """)
-
- import zlib
- updated_count = 0
-
- for url, content_blob, timestamp in cursor:
- try:
- # Get lot_id from URL
- lot_id_match = re.search(r'/l/.*?([A-Z]\d+-\d+-\d+)', url)
- if not lot_id_match:
- lot_id_match = re.search(r'([A-Z]\d+-\d+-\d+)', url)
- if not lot_id_match:
- continue
-
- lot_id = lot_id_match.group(1)
-
- # Check if lot exists in database
- lot_cursor = conn.execute("SELECT lot_id, title, description FROM lots WHERE lot_id = ?", (lot_id,))
- lot_row = lot_cursor.fetchone()
- if not lot_row:
- continue
-
- _, title, description = lot_row
-
- # Decompress and parse __NEXT_DATA__
- content = zlib.decompress(content_blob).decode('utf-8')
- match = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', content, re.DOTALL)
- if not match:
- continue
-
- data = json.loads(match.group(1))
- lot_json = data.get('props', {}).get('pageProps', {}).get('lot', {})
- if not lot_json:
- continue
-
- # Extract basic attributes
- attrs = extract_attributes_from_lot_json(lot_json)
-
- # Extract enriched attributes
- page_data = {'title': title, 'description': description, 'brand': attrs.get('brand', '')}
- enriched = extract_enriched_attributes(lot_json, page_data)
-
- # Merge
- all_attrs = {**attrs, **enriched}
-
- # Update database
- conn.execute("""
- UPDATE lots
- SET brand = ?,
- model = ?,
- attributes_json = ?,
- year_manufactured = ?,
- condition_score = ?,
- condition_description = ?,
- serial_number = ?,
- manufacturer = ?,
- damage_description = ?
- WHERE lot_id = ?
- """, (
- all_attrs.get('brand', ''),
- all_attrs.get('model', ''),
- all_attrs.get('attributes_json', ''),
- all_attrs.get('year_manufactured'),
- all_attrs.get('condition_score'),
- all_attrs.get('condition_description', ''),
- all_attrs.get('serial_number', ''),
- all_attrs.get('manufacturer', ''),
- all_attrs.get('damage_description', ''),
- lot_id
- ))
-
- updated_count += 1
- if updated_count % 100 == 0:
- print(f" Processed {updated_count} lots...")
- conn.commit()
-
- except Exception as e:
- print(f" Error processing {url}: {e}")
- continue
-
- conn.commit()
- print(f"\n✓ Updated {updated_count} lots with enriched attributes")
-
- # Show stats
- cursor = conn.execute("""
- SELECT
- COUNT(*) as total,
- SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year,
- SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition,
- SUM(CASE WHEN manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer,
- SUM(CASE WHEN brand != '' THEN 1 ELSE 0 END) as has_brand,
- SUM(CASE WHEN model != '' THEN 1 ELSE 0 END) as has_model
- FROM lots
- """)
- stats = cursor.fetchone()
-
- print(f"\nENRICHMENT STATISTICS:")
- print(f" Total lots: {stats[0]:,}")
- print(f" Has year: {stats[1]:,} ({100*stats[1]/stats[0]:.1f}%)")
- print(f" Has condition: {stats[2]:,} ({100*stats[2]/stats[0]:.1f}%)")
- print(f" Has manufacturer: {stats[3]:,} ({100*stats[3]/stats[0]:.1f}%)")
- print(f" Has brand: {stats[4]:,} ({100*stats[4]/stats[0]:.1f}%)")
- print(f" Has model: {stats[5]:,} ({100*stats[5]/stats[0]:.1f}%)")
-
- conn.close()
-
-
-def main():
- print("\nStarting migration of existing data...")
- print(f"Database: {DB_PATH}\n")
-
- migrate_lot_attributes()
-
- print(f"\n{'='*60}")
- print("MIGRATION COMPLETE")
- print(f"{'='*60}\n")
-
-if __name__ == "__main__":
- main()
diff --git a/scrape_fresh_auction.py b/scrape_fresh_auction.py
deleted file mode 100644
index 61d6d22..0000000
--- a/scrape_fresh_auction.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/env python3
-"""Scrape a fresh auction page to see the lots array structure"""
-import asyncio
-import json
-import re
-from playwright.async_api import async_playwright
-
-async def main():
- async with async_playwright() as p:
- browser = await p.chromium.launch(headless=True)
- page = await browser.new_page()
-
- # Get first auction
- await page.goto("https://www.troostwijkauctions.com/auctions", wait_until='networkidle')
- content = await page.content()
-
- # Find first auction link
- match = re.search(r'href="(/a/[^"]+)"', content)
- if not match:
- print("No auction found")
- return
-
- auction_url = f"https://www.troostwijkauctions.com{match.group(1)}"
- print(f"Scraping: {auction_url}\n")
-
- await page.goto(auction_url, wait_until='networkidle')
- content = await page.content()
-
- # Extract __NEXT_DATA__
- match = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', content, re.DOTALL)
- if not match:
- print("No __NEXT_DATA__ found")
- return
-
- data = json.loads(match.group(1))
- page_props = data.get('props', {}).get('pageProps', {})
-
- if 'auction' in page_props:
- auction = page_props['auction']
- print(f"Auction: {auction.get('name', '')[:50]}...")
- print(f"Lots in array: {len(auction.get('lots', []))}")
-
- if auction.get('lots'):
- lot = auction['lots'][0]
- print(f"\nFIRST LOT:")
- print(json.dumps(lot, indent=2)[:1500])
-
- await browser.close()
-
-if __name__ == "__main__":
- asyncio.run(main())
diff --git a/search_cached_viewing.py b/search_cached_viewing.py
deleted file mode 100644
index a5e2441..0000000
--- a/search_cached_viewing.py
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/env python3
-"""Search cached pages for viewing/pickup text"""
-import sqlite3
-import zlib
-import re
-
-conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
-
-cursor = conn.execute("""
- SELECT url, content
- FROM cache
- WHERE url LIKE '%/l/%'
- ORDER BY timestamp DESC
- LIMIT 20
-""")
-
-for url, content_blob in cursor:
- try:
- content = zlib.decompress(content_blob).decode('utf-8')
-
- # Look for viewing/pickup patterns
- if 'bezichtig' in content.lower() or 'ophalen' in content.lower():
- print(f"\n{'='*60}")
- print(f"URL: {url}")
- print(f"{'='*60}")
-
- # Extract sections with context
- patterns = [
- (r'(Bezichtigingen?.*?(?:\n.*?){0,5})', 'VIEWING'),
- (r'(Ophalen.*?(?:\n.*?){0,5})', 'PICKUP'),
- ]
-
- for pattern, label in patterns:
- matches = re.findall(pattern, content, re.IGNORECASE | re.DOTALL)
- if matches:
- print(f"\n{label}:")
- for match in matches[:1]: # First match
- # Clean up HTML
- clean = re.sub(r'<[^>]+>', ' ', match)
- clean = re.sub(r'\s+', ' ', clean).strip()
- print(f" {clean[:200]}")
-
- break # Found one, that's enough
- except:
- continue
-
-conn.close()
diff --git a/setup_windows_task.ps1 b/setup_windows_task.ps1
deleted file mode 100644
index e30f4a5..0000000
--- a/setup_windows_task.ps1
+++ /dev/null
@@ -1,47 +0,0 @@
-# PowerShell script to create Windows Task Scheduler job for Scaev Monitor
-# Run as Administrator
-
-$TaskName = "ScaevAuctionMonitor"
-$ScriptPath = "C:\vibe\scaev\src\monitor.py"
-$PythonPath = "python3" # Adjust if needed
-$WorkingDir = "C:\vibe\scaev"
-
-# Create the action (run Python script)
-$Action = New-ScheduledTaskAction -Execute $PythonPath `
- -Argument "$ScriptPath 30" `
- -WorkingDirectory $WorkingDir
-
-# Trigger: On system startup
-$TriggerStartup = New-ScheduledTaskTrigger -AtStartup
-
-# Settings
-$Settings = New-ScheduledTaskSettingsSet `
- -AllowStartIfOnBatteries `
- -DontStopIfGoingOnBatteries `
- -StartWhenAvailable `
- -RestartCount 3 `
- -RestartInterval (New-TimeSpan -Minutes 5)
-
-# Principal: Run with highest privileges
-$Principal = New-ScheduledTaskPrincipal -UserId "SYSTEM" -LogonType ServiceAccount -RunLevel Highest
-
-# Register the task
-Register-ScheduledTask `
- -TaskName $TaskName `
- -Action $Action `
- -Trigger $TriggerStartup `
- -Settings $Settings `
- -Principal $Principal `
- -Description "Scaev auction monitor - polls for new auctions every 30 minutes" `
- -Force
-
-Write-Host "`nTask '$TaskName' created successfully!" -ForegroundColor Green
-Write-Host "`nTo manage the task:"
-Write-Host " 1. Open Task Scheduler (taskschd.msc)"
-Write-Host " 2. Find 'ScaevAuctionMonitor' in Task Scheduler Library"
-Write-Host " 3. Right-click to Run, Stop, or Disable"
-Write-Host "`nOr use PowerShell commands:"
-Write-Host " Start-ScheduledTask -TaskName '$TaskName'"
-Write-Host " Stop-ScheduledTask -TaskName '$TaskName'"
-Write-Host " Disable-ScheduledTask -TaskName '$TaskName'"
-Write-Host " Get-ScheduledTask -TaskName '$TaskName' | Get-ScheduledTaskInfo"
diff --git a/show_migration_stats.py b/show_migration_stats.py
deleted file mode 100644
index a04b962..0000000
--- a/show_migration_stats.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env python3
-"""Show migration statistics"""
-import sqlite3
-
-conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
-
-cursor = conn.execute("""
- SELECT
- COUNT(*) as total,
- SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year,
- SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition,
- SUM(CASE WHEN manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer,
- SUM(CASE WHEN brand != '' THEN 1 ELSE 0 END) as has_brand,
- SUM(CASE WHEN model != '' THEN 1 ELSE 0 END) as has_model
- FROM lots
-""")
-
-stats = cursor.fetchone()
-
-print("="*60)
-print("MIGRATION RESULTS")
-print("="*60)
-print(f"\nTotal lots: {stats[0]:,}")
-print(f"Has year: {stats[1]:,} ({100*stats[1]/stats[0]:.1f}%)")
-print(f"Has condition: {stats[2]:,} ({100*stats[2]/stats[0]:.1f}%)")
-print(f"Has manufacturer: {stats[3]:,} ({100*stats[3]/stats[0]:.1f}%)")
-print(f"Has brand: {stats[4]:,} ({100*stats[4]/stats[0]:.1f}%)")
-print(f"Has model: {stats[5]:,} ({100*stats[5]/stats[0]:.1f}%)")
-
-# Show sample enriched data
-print(f"\n{'='*60}")
-print("SAMPLE ENRICHED LOTS")
-print(f"{'='*60}")
-
-cursor = conn.execute("""
- SELECT lot_id, year_manufactured, manufacturer, model, condition_score
- FROM lots
- WHERE year_manufactured IS NOT NULL OR manufacturer != ''
- LIMIT 5
-""")
-
-for row in cursor:
- print(f"\n{row[0]}:")
- print(f" Year: {row[1]}")
- print(f" Manufacturer: {row[2]}")
- print(f" Model: {row[3]}")
- print(f" Condition: {row[4]}")
-
-conn.close()
diff --git a/src/cache.py b/src/cache.py
index 169fe74..8d182a0 100644
--- a/src/cache.py
+++ b/src/cache.py
@@ -19,8 +19,9 @@ class CacheManager:
         self._init_db()
 
     def _init_db(self):
- """Initialize cache and data storage database"""
+ """Initialize cache and data storage database with consolidated schema"""
with sqlite3.connect(self.db_path) as conn:
+ # Cache table
conn.execute("""
CREATE TABLE IF NOT EXISTS cache (
url TEXT PRIMARY KEY,
@@ -32,6 +33,8 @@ class CacheManager:
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_timestamp ON cache(timestamp)
""")
+
+ # Auctions table - consolidated schema
conn.execute("""
CREATE TABLE IF NOT EXISTS auctions (
auction_id TEXT PRIMARY KEY,
@@ -40,9 +43,18 @@ class CacheManager:
location TEXT,
lots_count INTEGER,
first_lot_closing_time TEXT,
- scraped_at TEXT
+ scraped_at TEXT,
+ city TEXT,
+ country TEXT,
+ type TEXT,
+ lot_count INTEGER DEFAULT 0,
+ closing_time TEXT,
+ discovered_at INTEGER
)
""")
+ conn.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country)")
+
+ # Lots table - consolidated schema with all fields from working database
conn.execute("""
CREATE TABLE IF NOT EXISTS lots (
lot_id TEXT PRIMARY KEY,
@@ -50,8 +62,6 @@ class CacheManager:
url TEXT UNIQUE,
title TEXT,
current_bid TEXT,
- starting_bid TEXT,
- minimum_bid TEXT,
bid_count INTEGER,
closing_time TEXT,
viewing_time TEXT,
@@ -60,9 +70,54 @@ class CacheManager:
description TEXT,
category TEXT,
scraped_at TEXT,
+ sale_id INTEGER,
+ manufacturer TEXT,
+ type TEXT,
+ year INTEGER,
+ currency TEXT DEFAULT 'EUR',
+ closing_notified INTEGER DEFAULT 0,
+ starting_bid TEXT,
+ minimum_bid TEXT,
+ status TEXT,
+ brand TEXT,
+ model TEXT,
+ attributes_json TEXT,
+ first_bid_time TEXT,
+ last_bid_time TEXT,
+ bid_velocity REAL,
+ bid_increment REAL,
+ year_manufactured INTEGER,
+ condition_score REAL,
+ condition_description TEXT,
+ serial_number TEXT,
+ damage_description TEXT,
+ followers_count INTEGER DEFAULT 0,
+ estimated_min_price REAL,
+ estimated_max_price REAL,
+ lot_condition TEXT,
+ appearance TEXT,
+ estimated_min REAL,
+ estimated_max REAL,
+ next_bid_step_cents INTEGER,
+ condition TEXT,
+ category_path TEXT,
+ city_location TEXT,
+ country_code TEXT,
+ bidding_status TEXT,
+ packaging TEXT,
+ quantity INTEGER,
+ vat REAL,
+ buyer_premium_percentage REAL,
+ remarks TEXT,
+ reserve_price REAL,
+ reserve_met INTEGER,
+ view_count INTEGER,
FOREIGN KEY (auction_id) REFERENCES auctions(auction_id)
)
""")
+ conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_sale_id ON lots(sale_id)")
+
+ # Images table
conn.execute("""
CREATE TABLE IF NOT EXISTS images (
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -70,86 +125,28 @@ class CacheManager:
url TEXT,
local_path TEXT,
downloaded INTEGER DEFAULT 0,
+ labels TEXT,
+ processed_at INTEGER,
FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
)
""")
+ conn.execute("CREATE INDEX IF NOT EXISTS idx_images_lot_id ON images(lot_id)")
- # Add new columns to auctions table if they don't exist
- cursor = conn.execute("PRAGMA table_info(auctions)")
- auction_columns = {row[1] for row in cursor.fetchall()}
+ # Remove duplicate (lot_id, url) rows (keep the lowest id) before creating the unique index
+ conn.execute("""
+ DELETE FROM images
+ WHERE id NOT IN (
+ SELECT MIN(id)
+ FROM images
+ GROUP BY lot_id, url
+ )
+ """)
+ conn.execute("""
+ CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url
+ ON images(lot_id, url)
+ """)
- if 'city' not in auction_columns:
- conn.execute("ALTER TABLE auctions ADD COLUMN city TEXT")
- if 'country' not in auction_columns:
- conn.execute("ALTER TABLE auctions ADD COLUMN country TEXT")
- if 'type' not in auction_columns:
- conn.execute("ALTER TABLE auctions ADD COLUMN type TEXT")
- if 'lot_count' not in auction_columns:
- conn.execute("ALTER TABLE auctions ADD COLUMN lot_count INTEGER DEFAULT 0")
- if 'closing_time' not in auction_columns:
- conn.execute("ALTER TABLE auctions ADD COLUMN closing_time TEXT")
- if 'discovered_at' not in auction_columns:
- conn.execute("ALTER TABLE auctions ADD COLUMN discovered_at INTEGER")
-
- # Add index for country filtering
- conn.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country)")
-
- # Add new columns to lots table if they don't exist
- cursor = conn.execute("PRAGMA table_info(lots)")
- columns = {row[1] for row in cursor.fetchall()}
-
- if 'starting_bid' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN starting_bid TEXT")
- if 'minimum_bid' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN minimum_bid TEXT")
- if 'status' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN status TEXT")
- if 'brand' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN brand TEXT")
- if 'model' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN model TEXT")
- if 'attributes_json' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN attributes_json TEXT")
-
- # Bidding intelligence fields
- if 'first_bid_time' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN first_bid_time TEXT")
- if 'last_bid_time' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN last_bid_time TEXT")
- if 'bid_velocity' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN bid_velocity REAL")
- if 'bid_increment' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN bid_increment REAL")
-
- # Valuation intelligence fields
- if 'year_manufactured' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN year_manufactured INTEGER")
- if 'condition_score' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN condition_score REAL")
- if 'condition_description' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN condition_description TEXT")
- if 'serial_number' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN serial_number TEXT")
- if 'manufacturer' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN manufacturer TEXT")
- if 'damage_description' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN damage_description TEXT")
-
- # NEW: High-value API fields
- if 'followers_count' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN followers_count INTEGER DEFAULT 0")
- if 'estimated_min_price' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN estimated_min_price REAL")
- if 'estimated_max_price' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN estimated_max_price REAL")
- if 'lot_condition' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN lot_condition TEXT")
- if 'appearance' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN appearance TEXT")
- if 'scraped_at_timestamp' not in columns:
- conn.execute("ALTER TABLE lots ADD COLUMN scraped_at_timestamp INTEGER")
-
- # Create bid_history table
+ # Bid history table
conn.execute("""
CREATE TABLE IF NOT EXISTS bid_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -163,33 +160,15 @@ class CacheManager:
FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
)
""")
-
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_bid_history_lot_time
ON bid_history(lot_id, bid_time)
""")
-
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_bid_history_bidder
ON bid_history(bidder_id)
""")
- # Remove duplicates before creating unique index
- # Keep the row with the smallest id (first occurrence) for each (lot_id, url) pair
- conn.execute("""
- DELETE FROM images
- WHERE id NOT IN (
- SELECT MIN(id)
- FROM images
- GROUP BY lot_id, url
- )
- """)
-
- # Now create the unique index
- conn.execute("""
- CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url
- ON images(lot_id, url)
- """)
conn.commit()
def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
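
Two hedged notes on the schema change above. First, CREATE TABLE IF NOT EXISTS does not add columns to an existing table, so with the ALTER TABLE migration block removed, the consolidated columns are only present in databases that already carry them (the "working database" the comment refers to) or in a freshly created file. Second, the new unique index on images(lot_id, url) lets insert paths delegate de-duplication to SQLite; its leading column also covers lookups by lot_id, which makes the separate idx_images_lot_id arguably redundant. A sketch of an insert path that leans on that index follows; the CacheManager insert methods are not shown in this diff, so the function name and column subset are illustrative, not the repository's API:

import sqlite3

def save_image_urls(db_path: str, lot_id: str, urls: list) -> int:
    """Insert image URLs for a lot; idx_unique_lot_url silently drops duplicates."""
    with sqlite3.connect(db_path) as conn:
        before = conn.total_changes
        conn.executemany(
            "INSERT OR IGNORE INTO images (lot_id, url) VALUES (?, ?)",
            [(lot_id, url) for url in urls],
        )
        return conn.total_changes - before  # rows actually inserted this call
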
diff --git a/sync_updates.py b/sync_updates.py
deleted file mode 100644
index 56a284f..0000000
--- a/sync_updates.py
+++ /dev/null
@@ -1,256 +0,0 @@
-#!/usr/bin/env python3
-"""
-Sync local database updates to server-compatible format
-Creates incremental exports with only NEW or UPDATED records
-"""
-
-import sqlite3
-import json
-import csv
-from datetime import datetime
-from pathlib import Path
-
-DB_PATH = "C:/mnt/okcomputer/output/cache.db"
-OUTPUT_DIR = Path("C:/mnt/okcomputer/output")
-
-def fill_missing_auction_fields():
- """Fill in missing fields in auctions table from scraped data"""
- conn = sqlite3.connect(DB_PATH)
- cursor = conn.cursor()
-
- print("Filling missing auction fields...")
-
- # Update closing_time from first_lot_closing_time
- cursor.execute("""
- UPDATE auctions
- SET closing_time = first_lot_closing_time
- WHERE closing_time IS NULL AND first_lot_closing_time IS NOT NULL
- """)
- updated = cursor.rowcount
- print(f" ✓ Updated {updated} closing_time fields")
-
- # Parse location to extract city and country
- cursor.execute("""
- SELECT auction_id, location
- FROM auctions
- WHERE location IS NOT NULL AND (city IS NULL OR country IS NULL)
- """)
- locations = cursor.fetchall()
-
- city_updates = 0
- for auction_id, location in locations:
- if not location:
- continue
-
- # Parse "City, COUNTRY" or "City, Region, COUNTRY"
- parts = [p.strip() for p in location.split(',')]
- if len(parts) >= 2:
- city = parts[0]
- country = parts[-1]
-
- cursor.execute("""
- UPDATE auctions
- SET city = ?, country = ?
- WHERE auction_id = ?
- """, (city, country, auction_id))
- city_updates += 1
-
- print(f" ✓ Updated {city_updates} city/country fields")
-
- # Set type to 'online' for all (Troostwijk is online platform)
- cursor.execute("""
- UPDATE auctions
- SET type = 'online'
- WHERE type IS NULL
- """)
- type_updates = cursor.rowcount
- print(f" ✓ Updated {type_updates} type fields")
-
- conn.commit()
- conn.close()
-
- print(f"✓ Auction fields updated\n")
-
-def get_last_sync_timestamp():
- """Get timestamp of last successful sync"""
- sync_file = OUTPUT_DIR / ".last_sync"
- if sync_file.exists():
- return int(sync_file.read_text().strip())
- return 0
-
-def save_sync_timestamp(timestamp: int):
- """Save timestamp of successful sync"""
- sync_file = OUTPUT_DIR / ".last_sync"
- sync_file.write_text(str(timestamp))
-
-def export_incremental():
- """Export only records that are new or updated since last sync"""
- conn = sqlite3.connect(DB_PATH)
- conn.row_factory = sqlite3.Row
- cursor = conn.cursor()
-
- last_sync = get_last_sync_timestamp()
- current_time = int(datetime.now().timestamp())
-
- print(f"Last sync: {datetime.fromtimestamp(last_sync).strftime('%Y-%m-%d %H:%M:%S') if last_sync else 'Never'}")
- print(f"Current time: {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")
-
- # Get new/updated auctions
- cursor.execute("""
- SELECT * FROM auctions
- WHERE discovered_at IS NULL OR discovered_at > ?
- ORDER BY auction_id
- """, (last_sync,))
- new_auctions = [dict(row) for row in cursor.fetchall()]
-
- # Get new/updated lots
- cursor.execute("""
- SELECT * FROM lots
- WHERE scraped_at_timestamp IS NULL OR scraped_at_timestamp > ?
- ORDER BY lot_id
- """, (last_sync,))
- new_lots = [dict(row) for row in cursor.fetchall()]
-
- conn.close()
-
- # Export to timestamped files
- timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
-
- results = {
- 'auctions': 0,
- 'lots': 0,
- 'files': {}
- }
-
- # Export auctions if any new
- if new_auctions:
- auctions_csv = OUTPUT_DIR / f'auctions_update_{timestamp}.csv'
- auctions_json = OUTPUT_DIR / f'auctions_update_{timestamp}.json'
-
- with open(auctions_csv, 'w', newline='', encoding='utf-8') as f:
- writer = csv.DictWriter(f, fieldnames=new_auctions[0].keys())
- writer.writeheader()
- writer.writerows(new_auctions)
-
- with open(auctions_json, 'w', encoding='utf-8') as f:
- json.dump(new_auctions, f, indent=2, ensure_ascii=False)
-
- results['auctions'] = len(new_auctions)
- results['files']['auctions_csv'] = str(auctions_csv)
- results['files']['auctions_json'] = str(auctions_json)
-
- print(f"\n✓ Exported {len(new_auctions)} new/updated auctions")
- print(f" CSV: {auctions_csv}")
- print(f" JSON: {auctions_json}")
-
- # Export lots if any new
- if new_lots:
- lots_csv = OUTPUT_DIR / f'lots_update_{timestamp}.csv'
- lots_json = OUTPUT_DIR / f'lots_update_{timestamp}.json'
-
- with open(lots_csv, 'w', newline='', encoding='utf-8') as f:
- writer = csv.DictWriter(f, fieldnames=new_lots[0].keys())
- writer.writeheader()
- writer.writerows(new_lots)
-
- with open(lots_json, 'w', encoding='utf-8') as f:
- json.dump(new_lots, f, indent=2, ensure_ascii=False)
-
- results['lots'] = len(new_lots)
- results['files']['lots_csv'] = str(lots_csv)
- results['files']['lots_json'] = str(lots_json)
-
- print(f"\n✓ Exported {len(new_lots)} new/updated lots")
- print(f" CSV: {lots_csv}")
- print(f" JSON: {lots_json}")
-
- if not new_auctions and not new_lots:
- print("\n✓ No new updates since last sync")
-
- return results
-
-def create_upsert_export():
- """Create SQL script for server to UPSERT (update or insert) data"""
- conn = sqlite3.connect(DB_PATH)
- conn.row_factory = sqlite3.Row
- cursor = conn.cursor()
-
- last_sync = get_last_sync_timestamp()
- timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
-
- # Get new/updated auctions
- cursor.execute("""
- SELECT * FROM auctions
- WHERE discovered_at IS NULL OR discovered_at > ?
- """, (last_sync,))
- new_auctions = [dict(row) for row in cursor.fetchall()]
-
- if new_auctions:
- sql_file = OUTPUT_DIR / f'upsert_auctions_{timestamp}.sql'
-
- with open(sql_file, 'w', encoding='utf-8') as f:
- f.write("-- UPSERT script for auctions (updates existing, inserts new)\n\n")
-
- for auction in new_auctions:
- # Create INSERT OR REPLACE statement
- columns = list(auction.keys())
- placeholders = []
-
- for col, val in auction.items():
- if val is None:
- placeholders.append("NULL")
- elif isinstance(val, (int, float)):
- placeholders.append(str(val))
- else:
- # Escape single quotes
- escaped = str(val).replace("'", "''")
- placeholders.append(f"'{escaped}'")
-
- f.write(f"INSERT OR REPLACE INTO auctions ({', '.join(columns)})\n")
- f.write(f"VALUES ({', '.join(placeholders)});\n\n")
-
- print(f"\n✓ Created UPSERT SQL script: {sql_file}")
- print(f" Server can execute this to avoid constraint errors")
-
- conn.close()
-
-def main():
- """Main sync process"""
- print("="*60)
- print("DATABASE SYNC UTILITY")
- print("="*60)
- print(f"Database: {DB_PATH}")
- print(f"Output: {OUTPUT_DIR}")
- print("="*60)
-
- # Step 1: Fill missing fields
- fill_missing_auction_fields()
-
- # Step 2: Export incremental updates
- print("Exporting incremental updates...")
- results = export_incremental()
-
- # Step 3: Create UPSERT SQL (prevents constraint errors on server)
- if results['auctions'] > 0:
- create_upsert_export()
-
- # Step 4: Save sync timestamp
- current_time = int(datetime.now().timestamp())
- save_sync_timestamp(current_time)
-
- print("\n" + "="*60)
- print("SYNC COMPLETE")
- print("="*60)
- print(f"New auctions: {results['auctions']}")
- print(f"New lots: {results['lots']}")
-
- if results['files']:
- print("\nFiles ready for server import:")
- for key, path in results['files'].items():
- print(f" {key}: {path}")
-
- print("\nNext sync will only export records newer than:")
- print(f" {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")
-
-if __name__ == "__main__":
- main()
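
If the server-side import target is also SQLite, the string-built UPSERT script above can be replaced by parameterised statements, which sidesteps the manual quote-escaping entirely. A sketch under that assumption (the server schema and import path are not described in this diff):

import sqlite3

def upsert_auctions(server_db_path: str, auctions: list) -> None:
    """Apply INSERT OR REPLACE for each exported auction dict, binding values safely."""
    if not auctions:
        return
    columns = list(auctions[0].keys())
    placeholders = ", ".join("?" for _ in columns)
    sql = f"INSERT OR REPLACE INTO auctions ({', '.join(columns)}) VALUES ({placeholders})"
    with sqlite3.connect(server_db_path) as conn:
        conn.executemany(sql, [tuple(a[c] for c in columns) for a in auctions])
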
diff --git a/test_auction_fetch.py b/test_auction_fetch.py
deleted file mode 100644
index 1888978..0000000
--- a/test_auction_fetch.py
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env python3
-"""Test auction data fetch"""
-import asyncio
-import json
-import sys
-sys.path.insert(0, 'src')
-
-from graphql_client import fetch_auction_data, format_auction_data
-
-async def main():
- auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa"
-
- print(f"Fetching auction: {auction_id}\n")
- auction_data = await fetch_auction_data(auction_id)
-
- if auction_data:
- print("Raw Auction Data:")
- print(json.dumps(auction_data, indent=2))
-
- print("\n\nFormatted:")
- formatted = format_auction_data(auction_data)
- print(f"Viewing: {formatted['viewing_time']}")
- print(f"Pickup: {formatted['pickup_date']}")
- else:
- print("No auction data returned")
-
-if __name__ == "__main__":
- asyncio.run(main())
diff --git a/test_auction_query.py b/test_auction_query.py
deleted file mode 100644
index bfc8b08..0000000
--- a/test_auction_query.py
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/usr/bin/env python3
-"""Test if the auction query works at all"""
-import asyncio
-import aiohttp
-import json
-
-GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"
-
-# Try a simpler query first
-SIMPLE_QUERY = """
-query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
- auction(id: $auctionId, locale: $locale, platform: $platform) {
- id
- displayId
- viewingDays {
- startDate
- endDate
- city
- countryCode
- }
- collectionDays {
- startDate
- endDate
- city
- countryCode
- }
- }
-}
-"""
-
-async def main():
- auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa"
-
- variables = {
- "auctionId": auction_id,
- "locale": "nl",
- "platform": "TWK"
- }
-
- payload = {
- "query": SIMPLE_QUERY,
- "variables": variables
- }
-
- async with aiohttp.ClientSession() as session:
- async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
- print(f"Status: {response.status}")
- text = await response.text()
- print(f"Response: {text}")
-
- try:
- data = await response.json()
- print(f"\nParsed:")
- print(json.dumps(data, indent=2))
- except:
- pass
-
-if __name__ == "__main__":
- asyncio.run(main())
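
The test above posts the query and prints whatever comes back, swallowing JSON failures with a bare except. A reusable helper would surface both HTTP and GraphQL-level failures explicitly; here is a minimal sketch against the same endpoint, where the "errors" key check follows the general GraphQL response convention rather than anything confirmed for this particular API:

import aiohttp

GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"

async def graphql_post(query: str, variables: dict) -> dict:
    async with aiohttp.ClientSession() as session:
        async with session.post(
            GRAPHQL_ENDPOINT,
            json={"query": query, "variables": variables},
            timeout=aiohttp.ClientTimeout(total=30),
        ) as response:
            response.raise_for_status()   # fail loudly on HTTP errors
            payload = await response.json()
    if payload.get("errors"):              # GraphQL-level errors still return HTTP 200
        raise RuntimeError(payload["errors"])
    return payload["data"]
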
diff --git a/test_comprehensive.py b/test_comprehensive.py
deleted file mode 100644
index 561f4da..0000000
--- a/test_comprehensive.py
+++ /dev/null
@@ -1,95 +0,0 @@
-#!/usr/bin/env python3
-"""Test comprehensive data enrichment"""
-import asyncio
-import sys
-sys.path.insert(0, 'src')
-
-from scraper import TroostwijkScraper
-
-async def main():
- scraper = TroostwijkScraper()
-
- from playwright.async_api import async_playwright
-
- async with async_playwright() as p:
- browser = await p.chromium.launch(headless=True)
- page = await browser.new_page(
- viewport={'width': 1920, 'height': 1080},
- user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
- )
-
- # Test with lot that has bids
- lot_url = "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5"
-
- print(f"Testing comprehensive extraction\n")
- result = await scraper.crawl_page(page, lot_url)
-
- if result:
- print(f"\n{'='*60}")
- print("COMPREHENSIVE DATA EXTRACTION:")
- print(f"{'='*60}")
- print(f"Lot ID: {result.get('lot_id')}")
- print(f"Title: {result.get('title', '')[:50]}...")
- print(f"\n[Bidding Intelligence]")
- print(f" Status: {result.get('status')}")
- print(f" Current Bid: {result.get('current_bid')}")
- print(f" Starting Bid: {result.get('starting_bid')}")
- print(f" Bid Increment: EUR {result.get('bid_increment', 0):.2f}")
- print(f" Bid Count: {result.get('bid_count')}")
- print(f" First Bid: {result.get('first_bid_time', 'N/A')}")
- print(f" Last Bid: {result.get('last_bid_time', 'N/A')}")
- print(f" Bid Velocity: {result.get('bid_velocity', 0)} bids/hour")
- print(f"\n[Valuation Intelligence]")
- print(f" Brand: {result.get('brand', 'N/A')}")
- print(f" Model: {result.get('model', 'N/A')}")
- print(f" Year: {result.get('year_manufactured', 'N/A')}")
- print(f" Manufacturer: {result.get('manufacturer', 'N/A')}")
- print(f" Condition Score: {result.get('condition_score', 'N/A')}")
- print(f" Condition: {result.get('condition_description', 'N/A')}")
- print(f" Serial#: {result.get('serial_number', 'N/A')}")
- print(f" Damage: {result.get('damage_description', 'N/A')[:50] if result.get('damage_description') else 'N/A'}...")
-
- await browser.close()
-
- # Verify database
- import sqlite3
- conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
-
- # Check lot data
- cursor = conn.execute("""
- SELECT bid_velocity, first_bid_time, year_manufactured, condition_score
- FROM lots
- WHERE lot_id = ?
- """, (result.get('lot_id'),))
- row = cursor.fetchone()
-
- if row:
- print(f"\n{'='*60}")
- print("DATABASE VERIFICATION (lots table):")
- print(f"{'='*60}")
- print(f" Bid Velocity: {row[0]}")
- print(f" First Bid Time: {row[1]}")
- print(f" Year: {row[2]}")
- print(f" Condition Score: {row[3]}")
-
- # Check bid history
- cursor = conn.execute("""
- SELECT COUNT(*), MIN(bid_time), MAX(bid_time), SUM(is_autobid)
- FROM bid_history
- WHERE lot_id = ?
- """, (result.get('lot_id'),))
- row = cursor.fetchone()
-
- if row and row[0] > 0:
- print(f"\n{'='*60}")
- print("DATABASE VERIFICATION (bid_history table):")
- print(f"{'='*60}")
- print(f" Total Bids Stored: {row[0]}")
- print(f" First Bid: {row[1]}")
- print(f" Last Bid: {row[2]}")
- print(f" Autobids: {row[3]}")
-
- conn.close()
-
-if __name__ == "__main__":
- asyncio.run(main())
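
The bids/hour figure printed above can be reconstructed from bid_history alone. The scraper's own formula is not shown in this diff; the sketch below assumes bid_time is stored as an ISO-8601 string, which the TEXT columns suggest:

import sqlite3
from datetime import datetime

def bid_velocity(db_path: str, lot_id: str) -> float:
    """Bids per hour over the span between the first and last recorded bid."""
    conn = sqlite3.connect(db_path)
    count, first, last = conn.execute(
        "SELECT COUNT(*), MIN(bid_time), MAX(bid_time) FROM bid_history WHERE lot_id = ?",
        (lot_id,),
    ).fetchone()
    conn.close()
    if count < 2 or not first or not last:
        return 0.0
    # naive parse; adjust if the stored timestamps carry a trailing 'Z'
    hours = (datetime.fromisoformat(last) - datetime.fromisoformat(first)).total_seconds() / 3600
    return count / hours if hours > 0 else float(count)
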
diff --git a/test_concurrent_images.py b/test_concurrent_images.py
deleted file mode 100644
index 8d24e15..0000000
--- a/test_concurrent_images.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env python3
-"""Test concurrent image downloads"""
-import asyncio
-import time
-import sys
-sys.path.insert(0, 'src')
-
-from scraper import TroostwijkScraper
-
-async def main():
- scraper = TroostwijkScraper()
-
- from playwright.async_api import async_playwright
-
- async with async_playwright() as p:
- browser = await p.chromium.launch(headless=True)
- page = await browser.new_page(
- viewport={'width': 1920, 'height': 1080},
- user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
- )
-
- # Test with a lot that has multiple images
- lot_url = "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5"
-
- print(f"Testing concurrent image downloads\n")
- print(f"Lot: {lot_url}\n")
-
- start_time = time.time()
- result = await scraper.crawl_page(page, lot_url)
- elapsed = time.time() - start_time
-
- print(f"\n{'='*60}")
- print(f"TIMING RESULTS:")
- print(f"{'='*60}")
- print(f"Total time: {elapsed:.2f}s")
-
- image_count = len(result.get('images', []))
- print(f"Images: {image_count}")
-
- if image_count > 1:
- print(f"Time per image: {elapsed/image_count:.2f}s (if sequential)")
- print(f"Actual time: {elapsed:.2f}s (concurrent!)")
- speedup = (image_count * 0.5) / elapsed if elapsed > 0 else 1
- print(f"Speedup factor: {speedup:.1f}x")
-
- await browser.close()
-
-if __name__ == "__main__":
- asyncio.run(main())
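
For reference, the concurrency being timed above typically has the shape below: one shared session, a semaphore to cap parallelism, and asyncio.gather over the per-image tasks. The scraper's internal download code is not part of this diff, so the function name and output naming scheme here are illustrative only:

import asyncio
from pathlib import Path
import aiohttp

async def download_images(urls: list, out_dir: str, limit: int = 5) -> list:
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    sem = asyncio.Semaphore(limit)  # cap the number of in-flight requests

    async def fetch(session: aiohttp.ClientSession, i: int, url: str) -> Path:
        async with sem, session.get(url) as resp:
            resp.raise_for_status()
            path = out / f"image_{i:03d}.jpg"
            path.write_bytes(await resp.read())
            return path

    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, i, u) for i, u in enumerate(urls)))
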
diff --git a/test_full_scraper.py b/test_full_scraper.py
deleted file mode 100644
index bb7cacd..0000000
--- a/test_full_scraper.py
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/usr/bin/env python3
-"""Test the full scraper with one lot"""
-import asyncio
-import sys
-sys.path.insert(0, 'src')
-
-from scraper import TroostwijkScraper
-
-async def main():
- scraper = TroostwijkScraper()
-
- from playwright.async_api import async_playwright
-
- async with async_playwright() as p:
- browser = await p.chromium.launch(headless=True)
- page = await browser.new_page(
- viewport={'width': 1920, 'height': 1080},
- user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
- )
-
- # Test with a known lot
- lot_url = "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5"
-
- print(f"Testing with: {lot_url}\n")
- result = await scraper.crawl_page(page, lot_url)
-
- if result:
- print(f"\n{'='*60}")
- print("FINAL RESULT:")
- print(f"{'='*60}")
- print(f"Lot ID: {result.get('lot_id')}")
- print(f"Title: {result.get('title', '')[:50]}...")
- print(f"Current Bid: {result.get('current_bid')}")
- print(f"Starting Bid: {result.get('starting_bid')}")
- print(f"Minimum Bid: {result.get('minimum_bid')}")
- print(f"Bid Count: {result.get('bid_count')}")
- print(f"Closing Time: {result.get('closing_time')}")
- print(f"Viewing Time: {result.get('viewing_time', 'N/A')}")
- print(f"Pickup Date: {result.get('pickup_date', 'N/A')}")
- print(f"Location: {result.get('location')}")
-
- await browser.close()
-
- # Verify database
- import sqlite3
- conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
- cursor = conn.execute("""
- SELECT current_bid, starting_bid, minimum_bid, bid_count, closing_time
- FROM lots
- WHERE lot_id = 'A1-28505-5'
- """)
- row = cursor.fetchone()
- conn.close()
-
- if row:
- print(f"\n{'='*60}")
- print("DATABASE VERIFICATION:")
- print(f"{'='*60}")
- print(f"Current Bid: {row[0]}")
- print(f"Starting Bid: {row[1]}")
- print(f"Minimum Bid: {row[2]}")
- print(f"Bid Count: {row[3]}")
- print(f"Closing Time: {row[4]}")
-
-if __name__ == "__main__":
- asyncio.run(main())
diff --git a/test_graphql_scraper.py b/test_graphql_scraper.py
deleted file mode 100644
index 71eda86..0000000
--- a/test_graphql_scraper.py
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env python3
-"""Test the updated scraper with GraphQL integration"""
-import asyncio
-import sys
-sys.path.insert(0, 'src')
-
-from graphql_client import fetch_lot_bidding_data, format_bid_data
-
-async def main():
- # Test with known lot ID
- lot_id = "A1-28505-5"
-
- print(f"Testing GraphQL API with lot: {lot_id}\n")
-
- bidding_data = await fetch_lot_bidding_data(lot_id)
-
- if bidding_data:
- print("Raw GraphQL Response:")
- print("="*60)
- import json
- print(json.dumps(bidding_data, indent=2))
-
- print("\n\nFormatted Data:")
- print("="*60)
- formatted = format_bid_data(bidding_data)
- for key, value in formatted.items():
- print(f" {key}: {value}")
- else:
- print("Failed to fetch bidding data")
-
-if __name__ == "__main__":
- asyncio.run(main())
diff --git a/test_live_lot.py b/test_live_lot.py
deleted file mode 100644
index 78096ee..0000000
--- a/test_live_lot.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/env python3
-"""Test scraping a single live lot page"""
-import asyncio
-import sys
-sys.path.insert(0, 'src')
-
-from scraper import TroostwijkScraper
-
-async def main():
- scraper = TroostwijkScraper()
-
- from playwright.async_api import async_playwright
-
- async with async_playwright() as p:
- browser = await p.chromium.launch(headless=True)
- page = await browser.new_page()
-
- # Get a lot URL from the database
- import sqlite3
- conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
- cursor = conn.execute("SELECT url FROM lots LIMIT 1")
- row = cursor.fetchone()
- conn.close()
-
- if not row:
- print("No lots in database")
- return
-
- lot_url = row[0]
- print(f"Fetching: {lot_url}\n")
-
- result = await scraper.crawl_page(page, lot_url)
-
- if result:
- print(f"\nExtracted Data:")
- print(f" current_bid: {result.get('current_bid')}")
- print(f" bid_count: {result.get('bid_count')}")
- print(f" closing_time: {result.get('closing_time')}")
-
- await browser.close()
-
-if __name__ == "__main__":
- asyncio.run(main())
diff --git a/test_new_fields.py b/test_new_fields.py
deleted file mode 100644
index 4f49b43..0000000
--- a/test_new_fields.py
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/usr/bin/env python3
-"""Test the new fields extraction"""
-import asyncio
-import sys
-sys.path.insert(0, 'src')
-
-from scraper import TroostwijkScraper
-
-async def main():
- scraper = TroostwijkScraper()
-
- from playwright.async_api import async_playwright
-
- async with async_playwright() as p:
- browser = await p.chromium.launch(headless=True)
- page = await browser.new_page(
- viewport={'width': 1920, 'height': 1080},
- user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
- )
-
- # Test with lot that has attributes
- lot_url = "https://www.troostwijkauctions.com/l/47-5kg-hexagon-dumbbell-%25282x%2529-A1-40668-34"
-
- print(f"Testing new fields with: {lot_url}\n")
- result = await scraper.crawl_page(page, lot_url)
-
- if result:
- print(f"\n{'='*60}")
- print("EXTRACTED FIELDS:")
- print(f"{'='*60}")
- print(f"Lot ID: {result.get('lot_id')}")
- print(f"Title: {result.get('title', '')[:50]}...")
- print(f"Status: {result.get('status')}")
- print(f"Brand: {result.get('brand')}")
- print(f"Model: {result.get('model')}")
- print(f"Viewing Time: {result.get('viewing_time', 'N/A')}")
- print(f"Pickup Date: {result.get('pickup_date', 'N/A')}")
- print(f"Attributes: {result.get('attributes_json', '')[:100]}...")
-
- await browser.close()
-
- # Verify database
- import sqlite3
- conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
- cursor = conn.execute("""
- SELECT status, brand, model, viewing_time, pickup_date
- FROM lots
- WHERE lot_id = ?
- """, (result.get('lot_id'),))
- row = cursor.fetchone()
- conn.close()
-
- if row:
- print(f"\n{'='*60}")
- print("DATABASE VERIFICATION:")
- print(f"{'='*60}")
- print(f"Status: {row[0]}")
- print(f"Brand: {row[1]}")
- print(f"Model: {row[2]}")
- print(f"Viewing: {row[3][:100] if row[3] else 'N/A'}...")
- print(f"Pickup: {row[4][:100] if row[4] else 'N/A'}...")
-
-if __name__ == "__main__":
- asyncio.run(main())
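
Since attributes_json is stored as a serialized string, downstream consumers have to parse it back out. A small sketch; the exact keys the scraper stores are not visible in this diff:

import json
import sqlite3

def lot_attributes(db_path: str, lot_id: str) -> dict:
    """Return the parsed attributes_json for a lot, or an empty dict if absent/invalid."""
    conn = sqlite3.connect(db_path)
    row = conn.execute(
        "SELECT attributes_json FROM lots WHERE lot_id = ?", (lot_id,)
    ).fetchone()
    conn.close()
    if not row or not row[0]:
        return {}
    try:
        return json.loads(row[0])
    except json.JSONDecodeError:
        return {}
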
diff --git a/validate_data.py b/validate_data.py
deleted file mode 100644
index 78e3f22..0000000
--- a/validate_data.py
+++ /dev/null
@@ -1,306 +0,0 @@
-"""
-Validate data quality and completeness in the database.
-Checks if scraped data matches expectations and API capabilities.
-"""
-import sys
-import os
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
-
-import sqlite3
-from datetime import datetime
-from typing import Dict, List, Tuple
-from cache import CacheManager
-
-cache = CacheManager()
-DB_PATH = cache.db_path
-
-def get_db_stats() -> Dict:
- """Get comprehensive database statistics"""
- conn = sqlite3.connect(DB_PATH)
- cursor = conn.cursor()
-
- stats = {}
-
- # Total counts
- stats['total_auctions'] = cursor.execute("SELECT COUNT(*) FROM auctions").fetchone()[0]
- stats['total_lots'] = cursor.execute("SELECT COUNT(*) FROM lots").fetchone()[0]
- stats['total_images'] = cursor.execute("SELECT COUNT(*) FROM images").fetchone()[0]
- stats['total_bid_history'] = cursor.execute("SELECT COUNT(*) FROM bid_history").fetchone()[0]
-
- # Auctions completeness
- cursor.execute("""
- SELECT
- COUNT(*) as total,
- SUM(CASE WHEN title IS NOT NULL AND title != '' THEN 1 ELSE 0 END) as has_title,
- SUM(CASE WHEN lots_count IS NOT NULL THEN 1 ELSE 0 END) as has_lots_count,
- SUM(CASE WHEN closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_closing_time,
- SUM(CASE WHEN first_lot_closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_first_lot_closing
- FROM auctions
- """)
- row = cursor.fetchone()
- stats['auctions'] = {
- 'total': row[0],
- 'has_title': row[1],
- 'has_lots_count': row[2],
- 'has_closing_time': row[3],
- 'has_first_lot_closing': row[4]
- }
-
- # Lots completeness - Core fields
- cursor.execute("""
- SELECT
- COUNT(*) as total,
- SUM(CASE WHEN title IS NOT NULL AND title != '' THEN 1 ELSE 0 END) as has_title,
- SUM(CASE WHEN current_bid IS NOT NULL THEN 1 ELSE 0 END) as has_current_bid,
- SUM(CASE WHEN starting_bid IS NOT NULL THEN 1 ELSE 0 END) as has_starting_bid,
- SUM(CASE WHEN minimum_bid IS NOT NULL THEN 1 ELSE 0 END) as has_minimum_bid,
- SUM(CASE WHEN bid_count IS NOT NULL AND bid_count > 0 THEN 1 ELSE 0 END) as has_bids,
- SUM(CASE WHEN closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_closing_time,
- SUM(CASE WHEN status IS NOT NULL AND status != '' THEN 1 ELSE 0 END) as has_status
- FROM lots
- """)
- row = cursor.fetchone()
- stats['lots_core'] = {
- 'total': row[0],
- 'has_title': row[1],
- 'has_current_bid': row[2],
- 'has_starting_bid': row[3],
- 'has_minimum_bid': row[4],
- 'has_bids': row[5],
- 'has_closing_time': row[6],
- 'has_status': row[7]
- }
-
- # Lots completeness - Enriched fields
- cursor.execute("""
- SELECT
- COUNT(*) as total,
- SUM(CASE WHEN brand IS NOT NULL AND brand != '' THEN 1 ELSE 0 END) as has_brand,
- SUM(CASE WHEN model IS NOT NULL AND model != '' THEN 1 ELSE 0 END) as has_model,
- SUM(CASE WHEN manufacturer IS NOT NULL AND manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer,
- SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year,
- SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition_score,
- SUM(CASE WHEN condition_description IS NOT NULL AND condition_description != '' THEN 1 ELSE 0 END) as has_condition_desc,
- SUM(CASE WHEN serial_number IS NOT NULL AND serial_number != '' THEN 1 ELSE 0 END) as has_serial,
- SUM(CASE WHEN damage_description IS NOT NULL AND damage_description != '' THEN 1 ELSE 0 END) as has_damage
- FROM lots
- """)
- row = cursor.fetchone()
- stats['lots_enriched'] = {
- 'total': row[0],
- 'has_brand': row[1],
- 'has_model': row[2],
- 'has_manufacturer': row[3],
- 'has_year': row[4],
- 'has_condition_score': row[5],
- 'has_condition_desc': row[6],
- 'has_serial': row[7],
- 'has_damage': row[8]
- }
-
- # Lots completeness - Bid intelligence
- cursor.execute("""
- SELECT
- COUNT(*) as total,
- SUM(CASE WHEN first_bid_time IS NOT NULL THEN 1 ELSE 0 END) as has_first_bid_time,
- SUM(CASE WHEN last_bid_time IS NOT NULL THEN 1 ELSE 0 END) as has_last_bid_time,
- SUM(CASE WHEN bid_velocity IS NOT NULL THEN 1 ELSE 0 END) as has_bid_velocity,
- SUM(CASE WHEN bid_increment IS NOT NULL THEN 1 ELSE 0 END) as has_bid_increment
- FROM lots
- """)
- row = cursor.fetchone()
- stats['lots_bid_intelligence'] = {
- 'total': row[0],
- 'has_first_bid_time': row[1],
- 'has_last_bid_time': row[2],
- 'has_bid_velocity': row[3],
- 'has_bid_increment': row[4]
- }
-
- # Bid history stats
- cursor.execute("""
- SELECT
- COUNT(DISTINCT lot_id) as lots_with_history,
- COUNT(*) as total_bids,
- SUM(CASE WHEN is_autobid = 1 THEN 1 ELSE 0 END) as autobids,
- SUM(CASE WHEN bidder_id IS NOT NULL THEN 1 ELSE 0 END) as has_bidder_id
- FROM bid_history
- """)
- row = cursor.fetchone()
- stats['bid_history'] = {
- 'lots_with_history': row[0],
- 'total_bids': row[1],
- 'autobids': row[2],
- 'has_bidder_id': row[3]
- }
-
- # Image stats
- cursor.execute("""
- SELECT
- COUNT(DISTINCT lot_id) as lots_with_images,
- COUNT(*) as total_images,
- SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded_images,
- SUM(CASE WHEN local_path IS NOT NULL THEN 1 ELSE 0 END) as has_local_path
- FROM images
- """)
- row = cursor.fetchone()
- stats['images'] = {
- 'lots_with_images': row[0],
- 'total_images': row[1],
- 'downloaded_images': row[2],
- 'has_local_path': row[3]
- }
-
- conn.close()
- return stats
-
-def check_data_quality() -> List[Tuple[str, str, str]]:
- """Check for data quality issues"""
- issues = []
- conn = sqlite3.connect(DB_PATH)
- cursor = conn.cursor()
-
- # Check for lots without auction
- cursor.execute("""
- SELECT COUNT(*) FROM lots
- WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
- """)
- orphaned_lots = cursor.fetchone()[0]
- if orphaned_lots > 0:
- issues.append(("ERROR", "Orphaned Lots", f"{orphaned_lots} lots without matching auction"))
-
- # Check for lots with bids but no bid history
- cursor.execute("""
- SELECT COUNT(*) FROM lots
- WHERE bid_count > 0
- AND lot_id NOT IN (SELECT DISTINCT lot_id FROM bid_history)
- """)
- missing_history = cursor.fetchone()[0]
- if missing_history > 0:
- issues.append(("WARNING", "Missing Bid History", f"{missing_history} lots have bids but no bid history records"))
-
- # Check for lots with closing time in past but still active
- cursor.execute("""
- SELECT COUNT(*) FROM lots
- WHERE closing_time IS NOT NULL
- AND closing_time < datetime('now')
- AND status NOT LIKE '%gesloten%'
- """)
- past_closing = cursor.fetchone()[0]
- if past_closing > 0:
- issues.append(("INFO", "Past Closing Time", f"{past_closing} lots have closing time in past"))
-
- # Check for duplicate lot_ids
- cursor.execute("""
- SELECT lot_id, COUNT(*) FROM lots
- GROUP BY lot_id
- HAVING COUNT(*) > 1
- """)
- duplicates = cursor.fetchall()
- if duplicates:
- issues.append(("ERROR", "Duplicate Lot IDs", f"{len(duplicates)} duplicate lot_id values found"))
-
- # Check for lots without images
- cursor.execute("""
- SELECT COUNT(*) FROM lots
- WHERE lot_id NOT IN (SELECT DISTINCT lot_id FROM images)
- """)
- no_images = cursor.fetchone()[0]
- if no_images > 0:
- issues.append(("WARNING", "No Images", f"{no_images} lots have no images"))
-
- conn.close()
- return issues
-
-def print_validation_report():
- """Print comprehensive validation report"""
- print("=" * 80)
- print("DATABASE VALIDATION REPORT")
- print("=" * 80)
- print()
-
- stats = get_db_stats()
-
- # Overall counts
- print("OVERALL COUNTS:")
- print(f" Auctions: {stats['total_auctions']:,}")
- print(f" Lots: {stats['total_lots']:,}")
- print(f" Images: {stats['total_images']:,}")
- print(f" Bid History Records: {stats['total_bid_history']:,}")
- print()
-
- # Auctions completeness
- print("AUCTIONS COMPLETENESS:")
- a = stats['auctions']
- print(f" Title: {a['has_title']:,} / {a['total']:,} ({a['has_title']/a['total']*100:.1f}%)")
- print(f" Lots Count: {a['has_lots_count']:,} / {a['total']:,} ({a['has_lots_count']/a['total']*100:.1f}%)")
- print(f" Closing Time: {a['has_closing_time']:,} / {a['total']:,} ({a['has_closing_time']/a['total']*100:.1f}%)")
- print(f" First Lot Closing: {a['has_first_lot_closing']:,} / {a['total']:,} ({a['has_first_lot_closing']/a['total']*100:.1f}%)")
- print()
-
- # Lots core completeness
- print("LOTS CORE FIELDS:")
- l = stats['lots_core']
- print(f" Title: {l['has_title']:,} / {l['total']:,} ({l['has_title']/l['total']*100:.1f}%)")
- print(f" Current Bid: {l['has_current_bid']:,} / {l['total']:,} ({l['has_current_bid']/l['total']*100:.1f}%)")
- print(f" Starting Bid: {l['has_starting_bid']:,} / {l['total']:,} ({l['has_starting_bid']/l['total']*100:.1f}%)")
- print(f" Minimum Bid: {l['has_minimum_bid']:,} / {l['total']:,} ({l['has_minimum_bid']/l['total']*100:.1f}%)")
- print(f" Has Bids (>0): {l['has_bids']:,} / {l['total']:,} ({l['has_bids']/l['total']*100:.1f}%)")
- print(f" Closing Time: {l['has_closing_time']:,} / {l['total']:,} ({l['has_closing_time']/l['total']*100:.1f}%)")
- print(f" Status: {l['has_status']:,} / {l['total']:,} ({l['has_status']/l['total']*100:.1f}%)")
- print()
-
- # Lots enriched fields
- print("LOTS ENRICHED FIELDS:")
- e = stats['lots_enriched']
- print(f" Brand: {e['has_brand']:,} / {e['total']:,} ({e['has_brand']/e['total']*100:.1f}%)")
- print(f" Model: {e['has_model']:,} / {e['total']:,} ({e['has_model']/e['total']*100:.1f}%)")
- print(f" Manufacturer: {e['has_manufacturer']:,} / {e['total']:,} ({e['has_manufacturer']/e['total']*100:.1f}%)")
- print(f" Year: {e['has_year']:,} / {e['total']:,} ({e['has_year']/e['total']*100:.1f}%)")
- print(f" Condition Score: {e['has_condition_score']:,} / {e['total']:,} ({e['has_condition_score']/e['total']*100:.1f}%)")
- print(f" Condition Desc: {e['has_condition_desc']:,} / {e['total']:,} ({e['has_condition_desc']/e['total']*100:.1f}%)")
- print(f" Serial Number: {e['has_serial']:,} / {e['total']:,} ({e['has_serial']/e['total']*100:.1f}%)")
- print(f" Damage Desc: {e['has_damage']:,} / {e['total']:,} ({e['has_damage']/e['total']*100:.1f}%)")
- print()
-
- # Bid intelligence
- print("LOTS BID INTELLIGENCE:")
- b = stats['lots_bid_intelligence']
- print(f" First Bid Time: {b['has_first_bid_time']:,} / {b['total']:,} ({b['has_first_bid_time']/b['total']*100:.1f}%)")
- print(f" Last Bid Time: {b['has_last_bid_time']:,} / {b['total']:,} ({b['has_last_bid_time']/b['total']*100:.1f}%)")
- print(f" Bid Velocity: {b['has_bid_velocity']:,} / {b['total']:,} ({b['has_bid_velocity']/b['total']*100:.1f}%)")
- print(f" Bid Increment: {b['has_bid_increment']:,} / {b['total']:,} ({b['has_bid_increment']/b['total']*100:.1f}%)")
- print()
-
- # Bid history
- print("BID HISTORY:")
- h = stats['bid_history']
- print(f" Lots with History: {h['lots_with_history']:,}")
- print(f" Total Bid Records: {h['total_bids']:,}")
- print(f" Autobids: {h['autobids']:,} ({h['autobids']/max(h['total_bids'],1)*100:.1f}%)")
- print(f" Has Bidder ID: {h['has_bidder_id']:,} ({h['has_bidder_id']/max(h['total_bids'],1)*100:.1f}%)")
- print()
-
- # Images
- print("IMAGES:")
- i = stats['images']
- print(f" Lots with Images: {i['lots_with_images']:,}")
- print(f" Total Images: {i['total_images']:,}")
- print(f" Downloaded: {i['downloaded_images']:,} ({i['downloaded_images']/max(i['total_images'],1)*100:.1f}%)")
- print(f" Has Local Path: {i['has_local_path']:,} ({i['has_local_path']/max(i['total_images'],1)*100:.1f}%)")
- print()
-
- # Data quality issues
- print("=" * 80)
- print("DATA QUALITY ISSUES:")
- print("=" * 80)
- issues = check_data_quality()
- if issues:
- for severity, category, message in issues:
- print(f" [{severity}] {category}: {message}")
- else:
- print(" No issues found!")
- print()
-
-if __name__ == "__main__":
- print_validation_report()
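
One caveat on the report above: the auctions and lots sections divide by the raw row count, so an empty database raises ZeroDivisionError, while the bid-history and image sections already guard with max(..., 1). A small helper would make every percentage line uniform and safe, for example:

def pct(part: int, total: int) -> str:
    """Format 'part / total (x.x%)' without dividing by zero on an empty table."""
    return f"{part:,} / {total:,} ({100 * part / max(total, 1):.1f}%)"

# e.g. print(f"  Title: {pct(a['has_title'], a['total'])}")
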
diff --git a/verify_images.py b/verify_images.py
deleted file mode 100644
index c93064b..0000000
--- a/verify_images.py
+++ /dev/null
@@ -1,92 +0,0 @@
-#!/usr/bin/env python3
-"""
-Verification script to check image download status and duplicates
-Run this after deployment to verify the scraper is working correctly
-"""
-import sqlite3
-import sys
-from pathlib import Path
-
-DB_PATH = "/mnt/okcomputer/output/cache.db"
-
-def verify_database():
- """Run verification queries on the database"""
-
- if not Path(DB_PATH).exists():
- print(f"❌ Database not found: {DB_PATH}")
- sys.exit(1)
-
- conn = sqlite3.connect(DB_PATH)
-
- print("=" * 60)
- print("IMAGE DOWNLOAD VERIFICATION")
- print("=" * 60)
-
- # Check download success rate
- print("\n[*] Download Success Rate:")
- cursor = conn.execute("""
- SELECT
- COUNT(*) as total_images,
- SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded,
- SUM(CASE WHEN downloaded = 0 THEN 1 ELSE 0 END) as failed,
- ROUND(100.0 * SUM(downloaded) / COUNT(*), 2) as success_rate
- FROM images
- """)
- row = cursor.fetchone()
- print(f" Total images: {row[0]:,}")
- print(f" Downloaded: {row[1]:,}")
- print(f" Not downloaded: {row[2]:,}")
- print(f" Success rate: {row[3]}%")
-
- # Check for duplicates
- print("\n[*] Duplicate Check:")
- cursor = conn.execute("""
- SELECT lot_id, url, COUNT(*) as dup_count
- FROM images
- GROUP BY lot_id, url
- HAVING COUNT(*) > 1
- LIMIT 5
- """)
- duplicates = cursor.fetchall()
-
- if duplicates:
- print(f" [!] Found {len(duplicates)} duplicate entries!")
- for lot_id, url, count in duplicates:
- print(f" {lot_id}: {url[:50]}... (x{count})")
- else:
- print(" [+] No duplicates found!")
-
- # Verify file system
- print("\n[*] File System Verification:")
- cursor = conn.execute("""
- SELECT COUNT(*)
- FROM images
- WHERE downloaded = 1
- AND local_path IS NOT NULL
- AND local_path != ''
- """)
- files_with_path = cursor.fetchone()[0]
- print(f" Images with local_path: {files_with_path:,}")
-
- # Sample some downloaded images
- print("\n[*] Sample Downloaded Images:")
- cursor = conn.execute("""
- SELECT lot_id, local_path
- FROM images
- WHERE downloaded = 1
- AND local_path IS NOT NULL
- LIMIT 5
- """)
- samples = cursor.fetchall()
- for lot_id, path in samples:
- exists = "[+]" if Path(path).exists() else "[!]"
- print(f" {exists} {lot_id}: {path}")
-
- conn.close()
-
- print("\n" + "=" * 60)
- print("VERIFICATION COMPLETE")
- print("=" * 60)
-
-if __name__ == "__main__":
- verify_database()
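
A natural follow-up to the five-row sample check above is to reconcile the downloaded flag with the filesystem across all rows. A read-only sketch using the same database path; it reports mismatches but does not modify the database:

import sqlite3
from pathlib import Path

DB_PATH = "/mnt/okcomputer/output/cache.db"

conn = sqlite3.connect(DB_PATH)
rows = conn.execute(
    "SELECT lot_id, local_path FROM images WHERE downloaded = 1 AND local_path IS NOT NULL"
).fetchall()
conn.close()

missing = [(lot_id, path) for lot_id, path in rows if not Path(path).exists()]
print(f"Marked downloaded: {len(rows):,}, missing on disk: {len(missing):,}")
for lot_id, path in missing[:10]:
    print(f"  [!] {lot_id}: {path}")
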