diff --git a/check_apollo_state.py b/check_apollo_state.py
deleted file mode 100644
index 287981a..0000000
--- a/check_apollo_state.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/env python3
-"""Check for Apollo state or other embedded data"""
-import asyncio
-import json
-import re
-from playwright.async_api import async_playwright
-
-async def main():
-    async with async_playwright() as p:
-        browser = await p.chromium.launch(headless=True)
-        page = await browser.new_page()
-
-        await page.goto("https://www.troostwijkauctions.com/a/woonunits-generatoren-reinigingsmachines-en-zakelijke-goederen-A1-37889", wait_until='networkidle')
-        content = await page.content()
-
-        # Look for embedded data structures
-        patterns = [
-            (r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', "NEXT_DATA"),
-            (r'window\.__APOLLO_STATE__\s*=\s*({.+?});', "APOLLO_STATE"),
-            (r'"lots"\s*:\s*\[(.+?)\]', "LOTS_ARRAY"),
-        ]
-
-        for pattern, name in patterns:
-            match = re.search(pattern, content, re.DOTALL)
-            if match:
-                print(f"\n{'='*60}")
-                print(f"FOUND: {name}")
-                print(f"{'='*60}")
-                try:
-                    if name == "LOTS_ARRAY":
-                        print(f"Preview: {match.group(1)[:500]}")
-                    else:
-                        data = json.loads(match.group(1))
-                        print(json.dumps(data, indent=2)[:2000])
-                except:
-                    print(f"Preview: {match.group(1)[:1000]}")
-
-        # Also check for any script tags with "lot" and "bid" and "end"
-        print(f"\n{'='*60}")
-        print("SEARCHING FOR LOT DATA IN ALL SCRIPTS")
-        print(f"{'='*60}")
-
-        scripts = re.findall(r'<script[^>]*>(.+?)</script>', content, re.DOTALL)
-        for i, script in enumerate(scripts):
-            if all(term in script.lower() for term in ['lot', 'bid', 'end']):
-                print(f"\nScript #{i} (first 500 chars):")
-                print(script[:500])
-                if i > 3:  # Limit output
-                    break
-
-        await browser.close()
-
-if __name__ == "__main__":
-    asyncio.run(main())
diff --git a/check_data.py b/check_data.py
deleted file mode 100644
index c35f646..0000000
--- a/check_data.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/env python3
-"""Check current data quality in cache.db"""
-import sqlite3
-
-conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
-
-print("=" * 60)
-print("CURRENT DATA QUALITY CHECK")
-print("=" * 60)
-
-# Check lots table
-print("\n[*] Sample Lot Data:")
-cursor = conn.execute("""
-    SELECT lot_id, current_bid, bid_count, closing_time
-    FROM lots
-    LIMIT 10
-""")
-for row in cursor:
-    print(f" Lot: {row[0]}")
-    print(f" Current Bid: {row[1]}")
-    print(f" Bid Count: {row[2]}")
-    print(f" Closing Time: {row[3]}")
-
-# Check auctions table
-print("\n[*] Sample Auction Data:")
-cursor = conn.execute("""
-    SELECT auction_id, title, closing_time, first_lot_closing_time
-    FROM auctions
-    LIMIT 5
-""")
-for row in cursor:
-    print(f" Auction: {row[0]}")
-    print(f" Title: {row[1][:50]}...")
-    print(f" Closing Time: {row[2] if len(row) > 2 else 'N/A'}")
-    print(f" First Lot Closing: {row[3]}")
-
-# Data completeness stats
-print("\n[*] Data Completeness:")
-cursor = conn.execute("""
-    SELECT
-        COUNT(*) as total,
-        SUM(CASE WHEN current_bid IS NULL OR current_bid = '' THEN 1 ELSE 0 END) as missing_current_bid,
-        SUM(CASE WHEN closing_time IS NULL OR closing_time = '' THEN 1 ELSE 0 END) as missing_closing_time,
-        SUM(CASE WHEN bid_count IS NULL OR bid_count = 0 THEN 1 ELSE 0 END) as zero_bid_count
-    FROM lots
-""")
-row = cursor.fetchone()
-print(f" Total lots: {row[0]:,}")
-print(f" Missing current_bid: {row[1]:,} ({100*row[1]/row[0]:.1f}%)")
-print(f" Missing closing_time: {row[2]:,} ({100*row[2]/row[0]:.1f}%)")
-print(f" Zero bid_count: {row[3]:,} ({100*row[3]/row[0]:.1f}%)")
-
-conn.close()
-print("\n" + "=" * 60) diff --git a/check_graphql_full.py b/check_graphql_full.py deleted file mode 100644 index 09dc901..0000000 --- a/check_graphql_full.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python3 -"""Check if GraphQL has viewing/pickup data""" -import asyncio -import json -import sys -sys.path.insert(0, 'src') - -from graphql_client import GRAPHQL_ENDPOINT -import aiohttp - -# Expanded query to check for all available fields -EXTENDED_QUERY = """ -query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platform!) { - lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) { - lot { - id - displayId - auctionId - currentBidAmount { cents currency } - initialAmount { cents currency } - nextMinimalBid { cents currency } - bidsCount - startDate - endDate - - # Try to find viewing/pickup fields - viewingDays { startDate endDate city countryCode } - collectionDays { startDate endDate city countryCode } - pickupDays { startDate endDate city countryCode } - } - auction { - id - displayId - viewingDays { startDate endDate city countryCode } - collectionDays { startDate endDate city countryCode } - } - } -} -""" - -async def main(): - variables = { - "lotDisplayId": "A1-28505-5", - "locale": "nl", - "platform": "TWK" - } - - payload = { - "query": EXTENDED_QUERY, - "variables": variables - } - - try: - async with aiohttp.ClientSession() as session: - async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response: - if response.status == 200: - data = await response.json() - print("Full GraphQL Response:") - print(json.dumps(data, indent=2)) - else: - print(f"Error: {response.status}") - print(await response.text()) - except Exception as e: - print(f"Exception: {e}") - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/check_lot_auction_link.py b/check_lot_auction_link.py deleted file mode 100644 index 9268e0d..0000000 --- a/check_lot_auction_link.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Check how lots link to auctions""" -import sys -import os -sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) - -from cache import CacheManager -import sqlite3 -import zlib -import json -import re - -cache = CacheManager() -conn = sqlite3.connect(cache.db_path) -cursor = conn.cursor() - -# Get a lot page from cache -cursor.execute("SELECT url, content FROM cache WHERE url LIKE '%/l/%' LIMIT 1") -url, content_blob = cursor.fetchone() -content = zlib.decompress(content_blob).decode('utf-8') - -# Extract __NEXT_DATA__ -match = re.search(r']*id="__NEXT_DATA__"[^>]*>(.+?)', content, re.DOTALL) -data = json.loads(match.group(1)) - -props = data.get('props', {}).get('pageProps', {}) -print("PageProps keys:", list(props.keys())) - -lot = props.get('lot', {}) -print("\nLot data:") -print(f" displayId: {lot.get('displayId')}") -print(f" auctionId (UUID): {lot.get('auctionId')}") - -# Check if auction data is also included -auction = props.get('auction') -if auction: - print("\nAuction data IS included in lot page!") - print(f" Auction displayId: {auction.get('displayId')}") - print(f" Auction id (UUID): {auction.get('id')}") - print(f" Auction name: {auction.get('name', '')[:60]}") -else: - print("\nAuction data NOT included in lot page") - print("Need to look up auction by UUID") - -# Check if we can find the auction by UUID -lot_auction_uuid = lot.get('auctionId') -if lot_auction_uuid: - # Try to find auction page with this UUID - cursor.execute(""" - SELECT url, content FROM cache - WHERE url LIKE '%/a/%' - LIMIT 10 - """) - - 
-    found_match = False
-    for auction_url, auction_content_blob in cursor.fetchall():
-        auction_content = zlib.decompress(auction_content_blob).decode('utf-8')
-        match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', auction_content, re.DOTALL)
-        if match:
-            auction_data = json.loads(match.group(1))
-            auction_obj = auction_data.get('props', {}).get('pageProps', {}).get('auction', {})
-            if auction_obj.get('id') == lot_auction_uuid:
-                print(f"\n✓ Found matching auction!")
-                print(f" Auction displayId: {auction_obj.get('displayId')}")
-                print(f" Auction UUID: {auction_obj.get('id')}")
-                print(f" Auction URL: {auction_url}")
-                found_match = True
-                break
-
-    if not found_match:
-        print(f"\n✗ Could not find auction with UUID {lot_auction_uuid} in first 10 cached auctions")
-
-conn.close()
diff --git a/check_viewing_data.py b/check_viewing_data.py
deleted file mode 100644
index e8e3405..0000000
--- a/check_viewing_data.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env python3
-"""Check viewing time data"""
-import sqlite3
-
-conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
-
-# Check if viewing_time has data
-cursor = conn.execute("""
-    SELECT viewing_time, pickup_date
-    FROM lots
-    WHERE viewing_time IS NOT NULL AND viewing_time != ''
-    LIMIT 5
-""")
-
-rows = cursor.fetchall()
-print("Existing viewing_time data:")
-for r in rows:
-    print(f" Viewing: {r[0]}")
-    print(f" Pickup: {r[1]}")
-    print()
-
-# Check overall completeness
-cursor = conn.execute("""
-    SELECT
-        COUNT(*) as total,
-        SUM(CASE WHEN viewing_time IS NOT NULL AND viewing_time != '' THEN 1 ELSE 0 END) as has_viewing,
-        SUM(CASE WHEN pickup_date IS NOT NULL AND pickup_date != '' THEN 1 ELSE 0 END) as has_pickup
-    FROM lots
-""")
-row = cursor.fetchone()
-print(f"Completeness:")
-print(f" Total lots: {row[0]}")
-print(f" Has viewing_time: {row[1]} ({100*row[1]/row[0]:.1f}%)")
-print(f" Has pickup_date: {row[2]} ({100*row[2]/row[0]:.1f}%)")
-
-conn.close()
diff --git a/check_viewing_time.py b/check_viewing_time.py
deleted file mode 100644
index 4688b54..0000000
--- a/check_viewing_time.py
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/usr/bin/env python3
-"""Check if viewing time is in the GraphQL response"""
-import asyncio
-import json
-from playwright.async_api import async_playwright
-
-async def main():
-    async with async_playwright() as p:
-        browser = await p.chromium.launch(headless=True)
-        page = await browser.new_page()
-
-        responses = []
-
-        async def capture_response(response):
-            if 'graphql' in response.url and 'LotBiddingData' in await response.text():
-                try:
-                    body = await response.json()
-                    responses.append(body)
-                except:
-                    pass
-
-        page.on('response', capture_response)
-
-        await page.goto("https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5", wait_until='networkidle')
-        await asyncio.sleep(2)
-
-        if responses:
-            print("Full LotBiddingData Response:")
-            print("="*60)
-            print(json.dumps(responses[0], indent=2))
-
-        await browser.close()
-
-if __name__ == "__main__":
-    asyncio.run(main())
diff --git a/debug_lot_structure.py b/debug_lot_structure.py
deleted file mode 100644
index 8a8148d..0000000
--- a/debug_lot_structure.py
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/usr/bin/env python3
-"""Debug lot data structure from cached page"""
-import sqlite3
-import zlib
-import json
-import re
-import sys
-sys.path.insert(0, 'src')
-
-from parse import DataParser
-
-conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
-
-# Get a recent lot page
-cursor = conn.execute("""
-    SELECT url, content
-    FROM cache
-
WHERE url LIKE '%/l/%' - ORDER BY timestamp DESC - LIMIT 1 -""") - -row = cursor.fetchone() -if not row: - print("No lot pages found") - exit(1) - -url, content_blob = row -content = zlib.decompress(content_blob).decode('utf-8') - -parser = DataParser() -result = parser.parse_page(content, url) - -if result: - print(f"URL: {url}") - print(f"\nParsed Data:") - print(f" type: {result.get('type')}") - print(f" lot_id: {result.get('lot_id')}") - print(f" title: {result.get('title', '')[:50]}...") - print(f" current_bid: {result.get('current_bid')}") - print(f" bid_count: {result.get('bid_count')}") - print(f" closing_time: {result.get('closing_time')}") - print(f" location: {result.get('location')}") - -# Also dump the raw JSON -match = re.search(r']*id="__NEXT_DATA__"[^>]*>(.+?)', content, re.DOTALL) -if match: - data = json.loads(match.group(1)) - page_props = data.get('props', {}).get('pageProps', {}) - - if 'lot' in page_props: - lot = page_props['lot'] - print(f"\nRAW __NEXT_DATA__.lot keys: {list(lot.keys())}") - print(f"\nSearching for bid/timing fields...") - - # Deep search for these fields - def deep_search(obj, prefix=""): - if isinstance(obj, dict): - for k, v in obj.items(): - if any(term in k.lower() for term in ['bid', 'end', 'close', 'date', 'time']): - print(f" {prefix}{k}: {v}") - if isinstance(v, (dict, list)): - deep_search(v, prefix + k + ".") - elif isinstance(obj, list) and len(obj) > 0: - deep_search(obj[0], prefix + "[0].") - - deep_search(lot) - -conn.close() diff --git a/deep_inspect_lot.py b/deep_inspect_lot.py deleted file mode 100644 index 64bd218..0000000 --- a/deep_inspect_lot.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 -"""Deep inspect lot JSON for viewing/pickup data""" -import sqlite3 -import zlib -import json -import re - -conn = sqlite3.connect('/mnt/okcomputer/output/cache.db') - -cursor = conn.execute(""" - SELECT url, content - FROM cache - WHERE url LIKE '%/l/%' - ORDER BY timestamp DESC - LIMIT 1 -""") - -row = cursor.fetchone() -url, content_blob = row -content = zlib.decompress(content_blob).decode('utf-8') - -match = re.search(r']*id="__NEXT_DATA__"[^>]*>(.+?)', content, re.DOTALL) -data = json.loads(match.group(1)) -lot = data.get('props', {}).get('pageProps', {}).get('lot', {}) - -print(f"Inspecting: {url}\n") - -# Check onboarding -if 'onboarding' in lot: - print("ONBOARDING:") - print(json.dumps(lot['onboarding'], indent=2)) - print() - -# Check attributes -if 'attributes' in lot: - print("ATTRIBUTES:") - attrs = lot['attributes'] - print(json.dumps(attrs[:3] if isinstance(attrs, list) else attrs, indent=2)) - print() - -# Check condition -if 'condition' in lot: - print("CONDITION:") - print(json.dumps(lot['condition'], indent=2)) - print() - -# Check appearance -if 'appearance' in lot: - print("APPEARANCE:") - print(json.dumps(lot['appearance'], indent=2)) - print() - -# Check location -if 'location' in lot: - print("LOCATION:") - print(json.dumps(lot['location'], indent=2)) - print() - -# Check for any field with "view", "pick", "collect", "date", "time" -print("\nFIELDS WITH VIEWING/PICKUP/TIME:") -for key in lot.keys(): - if any(term in key.lower() for term in ['view', 'pick', 'collect', 'date', 'time', 'day']): - print(f" {key}: {lot[key]}") - -conn.close() diff --git a/enrich_existing_lots.py b/enrich_existing_lots.py deleted file mode 100644 index a43bff6..0000000 --- a/enrich_existing_lots.py +++ /dev/null @@ -1,120 +0,0 @@ -""" -Enrich existing lots with new intelligence fields: -- followers_count -- estimated_min_price / 
estimated_max_price -- lot_condition -- appearance - -Reads from cached lot pages __NEXT_DATA__ JSON -""" -import sys -import os -sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) - -import asyncio -from cache import CacheManager -import sqlite3 -import zlib -import json -import re -from graphql_client import fetch_lot_bidding_data, format_bid_data - -async def enrich_existing_lots(): - """Enrich existing lots with new fields from GraphQL API""" - cache = CacheManager() - conn = sqlite3.connect(cache.db_path) - cursor = conn.cursor() - - # Get all lot IDs - cursor.execute("SELECT lot_id FROM lots") - lot_ids = [r[0] for r in cursor.fetchall()] - - print(f"Found {len(lot_ids)} lots to enrich") - print("Fetching enrichment data from GraphQL API...") - print("This will take ~{:.1f} minutes (0.5s rate limit)".format(len(lot_ids) * 0.5 / 60)) - - enriched = 0 - failed = 0 - no_data = 0 - - for i, lot_id in enumerate(lot_ids): - if (i + 1) % 10 == 0: - print(f"Progress: {i+1}/{len(lot_ids)} ({enriched} enriched, {no_data} no data, {failed} failed)", end='\r') - - try: - # Fetch from GraphQL API - bidding_data = await fetch_lot_bidding_data(lot_id) - - if bidding_data: - formatted_data = format_bid_data(bidding_data) - - # Update lot with new fields - cursor.execute(""" - UPDATE lots - SET followers_count = ?, - estimated_min_price = ?, - estimated_max_price = ?, - lot_condition = ?, - appearance = ? - WHERE lot_id = ? - """, ( - formatted_data.get('followers_count', 0), - formatted_data.get('estimated_min_price'), - formatted_data.get('estimated_max_price'), - formatted_data.get('lot_condition', ''), - formatted_data.get('appearance', ''), - lot_id - )) - - enriched += 1 - - # Commit every 50 lots - if enriched % 50 == 0: - conn.commit() - - else: - no_data += 1 - - # Rate limit - await asyncio.sleep(0.5) - - except Exception as e: - failed += 1 - continue - - conn.commit() - - print(f"\n\nComplete!") - print(f"Total lots: {len(lot_ids)}") - print(f"Enriched: {enriched}") - print(f"No data: {no_data}") - print(f"Failed: {failed}") - - # Show statistics - cursor.execute("SELECT COUNT(*) FROM lots WHERE followers_count > 0") - with_followers = cursor.fetchone()[0] - - cursor.execute("SELECT COUNT(*) FROM lots WHERE estimated_min_price IS NOT NULL") - with_estimates = cursor.fetchone()[0] - - cursor.execute("SELECT COUNT(*) FROM lots WHERE lot_condition IS NOT NULL AND lot_condition != ''") - with_condition = cursor.fetchone()[0] - - print(f"\nEnrichment statistics:") - print(f" Lots with followers_count: {with_followers} ({with_followers/len(lot_ids)*100:.1f}%)") - print(f" Lots with estimated prices: {with_estimates} ({with_estimates/len(lot_ids)*100:.1f}%)") - print(f" Lots with condition: {with_condition} ({with_condition/len(lot_ids)*100:.1f}%)") - - conn.close() - -if __name__ == "__main__": - print("WARNING: This will make ~16,800 API calls at 0.5s intervals (~2.3 hours)") - print("Press Ctrl+C to cancel, or wait 5 seconds to continue...") - import time - try: - time.sleep(5) - except KeyboardInterrupt: - print("\nCancelled") - sys.exit(0) - - asyncio.run(enrich_existing_lots()) diff --git a/explore_api_fields.py b/explore_api_fields.py deleted file mode 100644 index db34e17..0000000 --- a/explore_api_fields.py +++ /dev/null @@ -1,370 +0,0 @@ -""" -Explore API responses to identify additional fields available for intelligence. -Tests GraphQL and REST API responses for field coverage. 
-""" -import asyncio -import sys -import os -sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) - -import json -import aiohttp -from graphql_client import fetch_lot_bidding_data, GRAPHQL_ENDPOINT -from bid_history_client import fetch_bid_history, BID_HISTORY_ENDPOINT - -async def explore_graphql_schema(): - """Query GraphQL schema to see all available fields""" - print("=" * 80) - print("GRAPHQL SCHEMA EXPLORATION") - print("=" * 80) - - # Introspection query for LotDetails type - introspection_query = """ - query IntrospectionQuery { - __type(name: "LotDetails") { - name - fields { - name - type { - name - kind - ofType { - name - kind - } - } - } - } - } - """ - - async with aiohttp.ClientSession() as session: - try: - async with session.post( - GRAPHQL_ENDPOINT, - json={ - "query": introspection_query, - "variables": {} - }, - headers={"Content-Type": "application/json"} - ) as response: - if response.status == 200: - data = await response.json() - lot_type = data.get('data', {}).get('__type') - if lot_type: - print("\nLotDetails available fields:") - for field in lot_type.get('fields', []): - field_name = field['name'] - field_type = field['type'].get('name') or field['type'].get('ofType', {}).get('name', 'Complex') - print(f" - {field_name}: {field_type}") - print() - else: - print(f"Failed with status {response.status}") - except Exception as e: - print(f"Error: {e}") - - # Also try Lot type - introspection_query_lot = """ - query IntrospectionQuery { - __type(name: "Lot") { - name - fields { - name - type { - name - kind - ofType { - name - kind - } - } - } - } - } - """ - - async with aiohttp.ClientSession() as session: - try: - async with session.post( - GRAPHQL_ENDPOINT, - json={ - "query": introspection_query_lot, - "variables": {} - }, - headers={"Content-Type": "application/json"} - ) as response: - if response.status == 200: - data = await response.json() - lot_type = data.get('data', {}).get('__type') - if lot_type: - print("\nLot type available fields:") - for field in lot_type.get('fields', []): - field_name = field['name'] - field_type = field['type'].get('name') or field['type'].get('ofType', {}).get('name', 'Complex') - print(f" - {field_name}: {field_type}") - print() - except Exception as e: - print(f"Error: {e}") - -async def test_graphql_full_query(): - """Test a comprehensive GraphQL query to see all returned data""" - print("=" * 80) - print("GRAPHQL FULL QUERY TEST") - print("=" * 80) - - # Test with a real lot ID - lot_id = "A1-34731-107" # Example from database - - comprehensive_query = """ - query ComprehensiveLotQuery($lotDisplayId: String!, $locale: String!, $platform: Platform!) 
{ - lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) { - lot { - id - displayId - title - description - currentBidAmount { cents currency } - initialAmount { cents currency } - nextMinimalBid { cents currency } - bidsCount - startDate - endDate - minimumBidAmountMet - lotNumber - auctionId - lotState - location { - city - countryCode - } - viewingDays { - city - countryCode - addressLine1 - addressLine2 - endDate - startDate - } - collectionDays { - city - countryCode - addressLine1 - addressLine2 - endDate - startDate - } - images { - url - thumbnailUrl - } - attributes { - name - value - } - } - } - } - """ - - async with aiohttp.ClientSession() as session: - try: - async with session.post( - GRAPHQL_ENDPOINT, - json={ - "query": comprehensive_query, - "variables": { - "lotDisplayId": lot_id, - "locale": "nl_NL", - "platform": "WEB" - } - }, - headers={"Content-Type": "application/json"} - ) as response: - if response.status == 200: - data = await response.json() - print(f"\nFull GraphQL response for {lot_id}:") - print(json.dumps(data, indent=2)) - print() - else: - print(f"Failed with status {response.status}") - print(await response.text()) - except Exception as e: - print(f"Error: {e}") - -async def test_bid_history_response(): - """Test bid history API to see all returned fields""" - print("=" * 80) - print("BID HISTORY API TEST") - print("=" * 80) - - # Get a lot with bids from database - import sqlite3 - from cache import CacheManager - - cache = CacheManager() - conn = sqlite3.connect(cache.db_path) - cursor = conn.cursor() - - # Find a lot with bids - cursor.execute(""" - SELECT lot_id, url FROM lots - WHERE bid_count > 0 - ORDER BY bid_count DESC - LIMIT 1 - """) - result = cursor.fetchone() - - if result: - lot_id, url = result - # Extract UUID from URL - import re - match = re.search(r']*id="__NEXT_DATA__"[^>]*>', url) - # We need to get UUID from cached page - cursor.execute("SELECT content FROM cache WHERE url = ?", (url,)) - page_result = cursor.fetchone() - - if page_result: - import zlib - content = zlib.decompress(page_result[0]).decode('utf-8') - match = re.search(r'"lot":\s*\{[^}]*"id":\s*"([^"]+)"', content) - if match: - lot_uuid = match.group(1) - print(f"\nTesting with lot {lot_id} (UUID: {lot_uuid})") - - # Fetch bid history - bid_history = await fetch_bid_history(lot_uuid) - if bid_history: - print(f"\nBid history sample (first 3 records):") - for i, bid in enumerate(bid_history[:3]): - print(f"\nBid {i+1}:") - print(json.dumps(bid, indent=2)) - - print(f"\n\nAll available fields in bid records:") - if bid_history: - all_keys = set() - for bid in bid_history: - all_keys.update(bid.keys()) - for key in sorted(all_keys): - print(f" - {key}") - else: - print("No bid history found") - - conn.close() - -async def check_auction_api(): - """Check if there's an auction details API""" - print("=" * 80) - print("AUCTION API EXPLORATION") - print("=" * 80) - - auction_query = """ - query AuctionDetails($auctionId: String!, $locale: String!, $platform: Platform!) 
{ - auctionDetails(auctionId: $auctionId, locale: $locale, platform: $platform) { - auction { - id - title - description - startDate - endDate - firstLotEndDate - location { - city - countryCode - } - viewingDays { - city - countryCode - startDate - endDate - addressLine1 - addressLine2 - } - collectionDays { - city - countryCode - startDate - endDate - addressLine1 - addressLine2 - } - } - } - } - """ - - # Get an auction ID from database - import sqlite3 - from cache import CacheManager - - cache = CacheManager() - conn = sqlite3.connect(cache.db_path) - cursor = conn.cursor() - - # Get auction ID from a lot - cursor.execute("SELECT DISTINCT auction_id FROM lots WHERE auction_id IS NOT NULL LIMIT 1") - result = cursor.fetchone() - - if result: - auction_id = result[0] - print(f"\nTesting with auction {auction_id}") - - async with aiohttp.ClientSession() as session: - try: - async with session.post( - GRAPHQL_ENDPOINT, - json={ - "query": auction_query, - "variables": { - "auctionId": auction_id, - "locale": "nl_NL", - "platform": "WEB" - } - }, - headers={"Content-Type": "application/json"} - ) as response: - if response.status == 200: - data = await response.json() - print("\nAuction API response:") - print(json.dumps(data, indent=2)) - else: - print(f"Failed with status {response.status}") - print(await response.text()) - except Exception as e: - print(f"Error: {e}") - - conn.close() - -async def main(): - """Run all API explorations""" - await explore_graphql_schema() - await test_graphql_full_query() - await test_bid_history_response() - await check_auction_api() - - print("\n" + "=" * 80) - print("SUMMARY: AVAILABLE DATA FIELDS") - print("=" * 80) - print(""" - CURRENTLY CAPTURED: - - Lot bidding data: current_bid, starting_bid, minimum_bid, bid_count, closing_time - - Lot attributes: brand, model, manufacturer, year, condition, serial_number - - Bid history: bid_amount, bid_time, bidder_id, is_autobid - - Bid intelligence: first_bid_time, last_bid_time, bid_velocity, bid_increment - - Images: URLs and local paths - - POTENTIALLY AVAILABLE (TO CHECK): - - Viewing/collection times with full address and date ranges - - Lot location details (city, country) - - Lot state/status - - Image thumbnails - - More detailed attributes - - NOT AVAILABLE: - - Watch count (not exposed in API) - - Reserve price (not exposed in API) - - Estimated min/max value (not exposed in API) - - Bidder identities (anonymized) - """) - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/explore_auction_schema.py b/explore_auction_schema.py deleted file mode 100644 index f6ef30a..0000000 --- a/explore_auction_schema.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python3 -"""Explore the actual auction schema""" -import asyncio -import aiohttp -import json - -GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql" - -# Try different field structures -QUERIES = { - "viewingDays_simple": """ -query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) { - auction(id: $auctionId, locale: $locale, platform: $platform) { - viewingDays { - city - countryCode - } - } -} -""", - "viewingDays_with_times": """ -query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) { - auction(id: $auctionId, locale: $locale, platform: $platform) { - viewingDays { - from - to - city - } - } -} -""", - "full_auction": """ -query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) 
{ - auction(id: $auctionId, locale: $locale, platform: $platform) { - id - displayId - biddingStatus - buyersPremium - viewingDays { - city - countryCode - from - to - } - collectionDays { - city - countryCode - from - to - } - } -} -""" -} - -async def test_query(name, query, auction_id): - variables = { - "auctionId": auction_id, - "locale": "nl", - "platform": "TWK" - } - - payload = { - "query": query, - "variables": variables - } - - async with aiohttp.ClientSession() as session: - async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response: - data = await response.json() - - print(f"\n{'='*60}") - print(f"QUERY: {name}") - print(f"{'='*60}") - - if 'errors' in data: - print("ERRORS:") - for error in data['errors']: - print(f" {error}") - else: - print("SUCCESS:") - print(json.dumps(data, indent=2)) - -async def main(): - # Test with the auction we know exists - auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa" - - for name, query in QUERIES.items(): - await test_query(name, query, auction_id) - await asyncio.sleep(0.5) - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/export_incremental.py b/export_incremental.py deleted file mode 100644 index f368d98..0000000 --- a/export_incremental.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -""" -Export only NEW auctions/lots that haven't been sent to server yet -Prevents UNIQUE constraint errors on server import -""" - -import sqlite3 -import json -import csv -from datetime import datetime -from pathlib import Path - -DB_PATH = "C:/mnt/okcomputer/output/cache.db" -OUTPUT_DIR = Path("C:/mnt/okcomputer/output") -SYNC_STATE_FILE = OUTPUT_DIR / ".server_sync_state" - -def get_last_export_timestamp(): - """Get timestamp of last successful export to server""" - if SYNC_STATE_FILE.exists(): - return int(SYNC_STATE_FILE.read_text().strip()) - return 0 - -def save_export_timestamp(timestamp: int): - """Save timestamp of successful export""" - SYNC_STATE_FILE.write_text(str(timestamp)) - -def export_new_data(): - """Export only records that are NEW since last server import""" - conn = sqlite3.connect(DB_PATH) - conn.row_factory = sqlite3.Row - cursor = conn.cursor() - - last_export = get_last_export_timestamp() - current_time = int(datetime.now().timestamp()) - - print("="*60) - print("INCREMENTAL EXPORT FOR SERVER") - print("="*60) - print(f"Last export: {datetime.fromtimestamp(last_export).strftime('%Y-%m-%d %H:%M:%S') if last_export else 'Never (will export ALL)'}") - print(f"Current time: {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}") - print() - - # Get new auctions (discovered_at > last_export) - if last_export == 0: - # First run: export all - cursor.execute("SELECT * FROM auctions ORDER BY auction_id") - else: - # Subsequent runs: only new ones - cursor.execute(""" - SELECT * FROM auctions - WHERE discovered_at > ? - ORDER BY auction_id - """, (last_export,)) - - new_auctions = [dict(row) for row in cursor.fetchall()] - - # Get new lots (scraped_at_timestamp > last_export) - if last_export == 0: - cursor.execute("SELECT * FROM lots ORDER BY lot_id") - else: - cursor.execute(""" - SELECT * FROM lots - WHERE scraped_at_timestamp > ? 
- ORDER BY lot_id - """, (last_export,)) - - new_lots = [dict(row) for row in cursor.fetchall()] - - conn.close() - - # Export to server-ready files - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - files_created = [] - - # Export auctions - if new_auctions: - auctions_csv = OUTPUT_DIR / f'auctions_{timestamp}.csv' - auctions_json = OUTPUT_DIR / f'auctions_{timestamp}.json' - - with open(auctions_csv, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=new_auctions[0].keys()) - writer.writeheader() - writer.writerows(new_auctions) - - with open(auctions_json, 'w', encoding='utf-8') as f: - json.dump(new_auctions, f, indent=2, ensure_ascii=False) - - files_created.extend([auctions_csv, auctions_json]) - print(f"✓ Exported {len(new_auctions)} auctions") - print(f" CSV: {auctions_csv}") - print(f" JSON: {auctions_json}") - else: - print("✓ No new auctions to export") - - # Export lots - if new_lots: - lots_csv = OUTPUT_DIR / f'lots_{timestamp}.csv' - lots_json = OUTPUT_DIR / f'lots_{timestamp}.json' - - with open(lots_csv, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=new_lots[0].keys()) - writer.writeheader() - writer.writerows(new_lots) - - with open(lots_json, 'w', encoding='utf-8') as f: - json.dump(new_lots, f, indent=2, ensure_ascii=False) - - files_created.extend([lots_csv, lots_json]) - print(f"✓ Exported {len(new_lots)} lots") - print(f" CSV: {lots_csv}") - print(f" JSON: {lots_json}") - else: - print("✓ No new lots to export") - - # Save sync state - if new_auctions or new_lots: - save_export_timestamp(current_time) - print() - print("="*60) - print("EXPORT COMPLETE") - print("="*60) - print(f"New auctions: {len(new_auctions)}") - print(f"New lots: {len(new_lots)}") - print() - print("Next export will only include records newer than:") - print(f" {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}") - else: - print() - print("="*60) - print("NOTHING TO EXPORT") - print("="*60) - print("All data already exported to server") - - return { - 'auctions': len(new_auctions), - 'lots': len(new_lots), - 'files': [str(f) for f in files_created] - } - -if __name__ == "__main__": - export_new_data() diff --git a/extract_graphql_query.py b/extract_graphql_query.py deleted file mode 100644 index c9f2ec9..0000000 --- a/extract_graphql_query.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 -"""Extract the GraphQL query being used""" -import asyncio -import json -from playwright.async_api import async_playwright - -async def main(): - async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) - page = await browser.new_page() - - graphql_requests = [] - - async def capture_request(request): - if 'graphql' in request.url: - graphql_requests.append({ - 'url': request.url, - 'method': request.method, - 'post_data': request.post_data, - 'headers': dict(request.headers) - }) - - page.on('request', capture_request) - - await page.goto("https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5", wait_until='networkidle') - await asyncio.sleep(2) - - print(f"Captured {len(graphql_requests)} GraphQL requests\n") - - for i, req in enumerate(graphql_requests): - print(f"{'='*60}") - print(f"REQUEST #{i+1}") - print(f"{'='*60}") - print(f"URL: {req['url']}") - print(f"Method: {req['method']}") - - if req['post_data']: - try: - data = json.loads(req['post_data']) - print(f"\nQuery Name: {data.get('operationName', 'N/A')}") - print(f"\nVariables:") - 
print(json.dumps(data.get('variables', {}), indent=2)) - print(f"\nQuery:") - print(data.get('query', '')[:1000]) - except: - print(f"\nPOST Data: {req['post_data'][:500]}") - - print() - - await browser.close() - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/extract_viewing_from_html.py b/extract_viewing_from_html.py deleted file mode 100644 index a5dabb5..0000000 --- a/extract_viewing_from_html.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -"""Find viewing/pickup in actual HTML""" -import asyncio -from playwright.async_api import async_playwright -import re - -async def main(): - async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) - page = await browser.new_page() - - # Try a lot that should have viewing times - await page.goto("https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102", wait_until='networkidle') - - # Get text content - text_content = await page.evaluate("document.body.innerText") - - print("Searching for viewing/pickup patterns...\n") - - # Look for "Bezichtigingen" section - lines = text_content.split('\n') - for i, line in enumerate(lines): - if 'bezichtig' in line.lower() or 'viewing' in line.lower(): - # Print surrounding context - context = lines[max(0, i-1):min(len(lines), i+5)] - print("FOUND Bezichtigingen:") - for c in context: - print(f" {c}") - print() - break - - # Look for "Ophalen" section - for i, line in enumerate(lines): - if 'ophalen' in line.lower() or 'collection' in line.lower() or 'pickup' in line.lower(): - context = lines[max(0, i-1):min(len(lines), i+5)] - print("FOUND Ophalen:") - for c in context: - print(f" {c}") - print() - break - - await browser.close() - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/fetch_missing_bid_history.py b/fetch_missing_bid_history.py deleted file mode 100644 index c38ab12..0000000 --- a/fetch_missing_bid_history.py +++ /dev/null @@ -1,166 +0,0 @@ -""" -Fetch bid history for existing lots that have bids but no bid history records. -Reads cached lot pages to get lot UUIDs, then calls bid history API. 
-""" -import sys -import os -sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) - -import asyncio -from cache import CacheManager -import sqlite3 -import zlib -import json -import re -from bid_history_client import fetch_bid_history, parse_bid_history - -async def fetch_missing_bid_history(): - """Fetch bid history for lots that have bids but no history records""" - cache = CacheManager() - conn = sqlite3.connect(cache.db_path) - cursor = conn.cursor() - - # Get lots with bids but no bid history - cursor.execute(""" - SELECT l.lot_id, l.bid_count - FROM lots l - WHERE l.bid_count > 0 - AND l.lot_id NOT IN (SELECT DISTINCT lot_id FROM bid_history) - ORDER BY l.bid_count DESC - """) - - lots_to_fetch = cursor.fetchall() - print(f"Found {len(lots_to_fetch)} lots with bids but no bid history") - - if not lots_to_fetch: - print("No lots to process!") - conn.close() - return - - # Build mapping from lot_id to lot UUID from cached pages - print("Building lot_id -> UUID mapping from cache...") - - cursor.execute(""" - SELECT url, content - FROM cache - WHERE url LIKE '%/l/%' - """) - - lot_id_to_uuid = {} - total_cached = 0 - - for url, content_blob in cursor: - total_cached += 1 - - if total_cached % 100 == 0: - print(f"Processed {total_cached} cached pages...", end='\r') - - try: - content = zlib.decompress(content_blob).decode('utf-8') - match = re.search(r']*id="__NEXT_DATA__"[^>]*>(.+?)', content, re.DOTALL) - - if not match: - continue - - data = json.loads(match.group(1)) - lot = data.get('props', {}).get('pageProps', {}).get('lot', {}) - - if not lot: - continue - - lot_display_id = lot.get('displayId') - lot_uuid = lot.get('id') - - if lot_display_id and lot_uuid: - lot_id_to_uuid[lot_display_id] = lot_uuid - - except: - continue - - print(f"\n\nBuilt UUID mapping for {len(lot_id_to_uuid)} lots") - - # Fetch bid history for each lot - print("\nFetching bid history from API...") - - fetched = 0 - failed = 0 - no_uuid = 0 - - for lot_id, bid_count in lots_to_fetch: - lot_uuid = lot_id_to_uuid.get(lot_id) - - if not lot_uuid: - no_uuid += 1 - continue - - try: - print(f"\nFetching bid history for {lot_id} ({bid_count} bids)...") - bid_history = await fetch_bid_history(lot_uuid) - - if bid_history: - bid_data = parse_bid_history(bid_history, lot_id) - - # Update lots table with bid intelligence - cursor.execute(""" - UPDATE lots - SET first_bid_time = ?, - last_bid_time = ?, - bid_velocity = ? - WHERE lot_id = ? 
- """, ( - bid_data['first_bid_time'], - bid_data['last_bid_time'], - bid_data['bid_velocity'], - lot_id - )) - - # Save bid history records - cache.save_bid_history(lot_id, bid_data['bid_records']) - - fetched += 1 - print(f" Saved {len(bid_data['bid_records'])} bid records") - print(f" Bid velocity: {bid_data['bid_velocity']:.2f} bids/hour") - - # Commit every 10 lots - if fetched % 10 == 0: - conn.commit() - print(f"\nProgress: {fetched}/{len(lots_to_fetch)} lots processed...") - - # Rate limit to be respectful - await asyncio.sleep(0.5) - - else: - failed += 1 - - except Exception as e: - print(f" Error fetching bid history for {lot_id}: {e}") - failed += 1 - continue - - conn.commit() - - print(f"\n\nComplete!") - print(f"Total lots to process: {len(lots_to_fetch)}") - print(f"Successfully fetched: {fetched}") - print(f"Failed: {failed}") - print(f"No UUID found: {no_uuid}") - - # Verify fix - cursor.execute(""" - SELECT COUNT(DISTINCT lot_id) FROM bid_history - """) - lots_with_history = cursor.fetchone()[0] - - cursor.execute(""" - SELECT COUNT(*) FROM lots WHERE bid_count > 0 - """) - lots_with_bids = cursor.fetchone()[0] - - print(f"\nLots with bids: {lots_with_bids}") - print(f"Lots with bid history: {lots_with_history}") - print(f"Coverage: {lots_with_history/lots_with_bids*100:.1f}%") - - conn.close() - -if __name__ == "__main__": - asyncio.run(fetch_missing_bid_history()) diff --git a/find_api_endpoint.py b/find_api_endpoint.py deleted file mode 100644 index 30f8e9e..0000000 --- a/find_api_endpoint.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -"""Find the API endpoint by monitoring network requests""" -import asyncio -import json -from playwright.async_api import async_playwright - -async def main(): - async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) - page = await browser.new_page() - - requests = [] - responses = [] - - async def log_request(request): - if any(term in request.url for term in ['api', 'graphql', 'lot', 'auction', 'bid']): - requests.append({ - 'url': request.url, - 'method': request.method, - 'headers': dict(request.headers), - 'post_data': request.post_data - }) - - async def log_response(response): - if any(term in response.url for term in ['api', 'graphql', 'lot', 'auction', 'bid']): - try: - body = await response.text() - responses.append({ - 'url': response.url, - 'status': response.status, - 'body': body[:1000] - }) - except: - pass - - page.on('request', log_request) - page.on('response', log_response) - - print("Loading lot page...") - await page.goto("https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102", wait_until='networkidle') - - # Wait for dynamic content - await asyncio.sleep(3) - - print(f"\nFound {len(requests)} relevant requests") - print(f"Found {len(responses)} relevant responses\n") - - for req in requests[:10]: - print(f"REQUEST: {req['method']} {req['url']}") - if req['post_data']: - print(f" POST DATA: {req['post_data'][:200]}") - - print("\n" + "="*60 + "\n") - - for resp in responses[:10]: - print(f"RESPONSE: {resp['url']}") - print(f" Status: {resp['status']}") - print(f" Body: {resp['body'][:300]}") - print() - - await browser.close() - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/find_api_valid_lot.py b/find_api_valid_lot.py deleted file mode 100644 index 25574b5..0000000 --- a/find_api_valid_lot.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python3 -"""Find API endpoint using a valid lot from database""" -import asyncio -import 
sqlite3 -from playwright.async_api import async_playwright - -# Get a valid lot URL -conn = sqlite3.connect('/mnt/okcomputer/output/cache.db') -cursor = conn.execute("SELECT url FROM lots WHERE url LIKE '%/l/%' LIMIT 5") -lot_urls = [row[0] for row in cursor.fetchall()] -conn.close() - -async def main(): - async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) - page = await browser.new_page() - - api_calls = [] - - async def log_response(response): - url = response.url - # Look for API calls - if ('api' in url.lower() or 'graphql' in url.lower() or - '/v2/' in url or '/v3/' in url or '/v4/' in url or - 'query' in url.lower() or 'mutation' in url.lower()): - try: - body = await response.text() - api_calls.append({ - 'url': url, - 'status': response.status, - 'body': body - }) - print(f"\nAPI: {url}") - except: - pass - - page.on('response', log_response) - - for lot_url in lot_urls[:2]: - print(f"\n{'='*60}") - print(f"Loading: {lot_url}") - print(f"{'='*60}") - - try: - await page.goto(lot_url, wait_until='networkidle', timeout=30000) - await asyncio.sleep(2) - - # Check if page has bid info - content = await page.content() - if 'currentBid' in content or 'Current bid' in content or 'Huidig bod' in content: - print("[+] Page contains bid information") - break - except Exception as e: - print(f"[!] Error: {e}") - continue - - print(f"\n\n{'='*60}") - print(f"CAPTURED {len(api_calls)} API CALLS") - print(f"{'='*60}") - - for call in api_calls: - print(f"\n{call['url']}") - print(f"Status: {call['status']}") - if 'json' in call['body'][:100].lower() or call['body'].startswith('{'): - print(f"Body (first 500 chars): {call['body'][:500]}") - - await browser.close() - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/find_auction_with_lots.py b/find_auction_with_lots.py deleted file mode 100644 index 4bed970..0000000 --- a/find_auction_with_lots.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -"""Find an auction page with lots data""" -import sqlite3 -import zlib -import json -import re - -conn = sqlite3.connect('/mnt/okcomputer/output/cache.db') - -cursor = conn.execute(""" - SELECT url, content - FROM cache - WHERE url LIKE '%/a/%' -""") - -for row in cursor: - url, content_blob = row - content = zlib.decompress(content_blob).decode('utf-8') - - match = re.search(r']*id="__NEXT_DATA__"[^>]*>(.+?)', content, re.DOTALL) - if not match: - continue - - data = json.loads(match.group(1)) - page_props = data.get('props', {}).get('pageProps', {}) - - if 'auction' in page_props: - auction = page_props['auction'] - lots = auction.get('lots', []) - - if lots and len(lots) > 0: - print(f"Found auction with {len(lots)} lots: {url}\n") - - lot = lots[0] - print(f"SAMPLE LOT FROM AUCTION.LOTS[]:") - print(f" displayId: {lot.get('displayId')}") - print(f" title: {lot.get('title', '')[:50]}...") - print(f" urlSlug: {lot.get('urlSlug')}") - print(f"\nBIDDING FIELDS:") - for key in ['currentBid', 'highestBid', 'startingBid', 'minimumBidAmount', 'bidCount', 'numberOfBids']: - print(f" {key}: {lot.get(key)}") - print(f"\nTIMING FIELDS:") - for key in ['endDate', 'startDate', 'closingTime']: - print(f" {key}: {lot.get(key)}") - print(f"\nALL KEYS: {list(lot.keys())[:30]}...") - break - -conn.close() diff --git a/fix_auctions_table.py b/fix_auctions_table.py deleted file mode 100644 index 4ca5154..0000000 --- a/fix_auctions_table.py +++ /dev/null @@ -1,155 +0,0 @@ -""" -Fix auctions table by replacing with correct data from cached auction pages. 
-The auctions table currently has wrong auction_ids (numeric instead of displayId). -""" -import sys -import os -sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) - -from cache import CacheManager -import sqlite3 -import zlib -import json -import re -from datetime import datetime - -def fix_auctions_table(): - """Rebuild auctions table from cached auction pages""" - cache = CacheManager() - conn = sqlite3.connect(cache.db_path) - cursor = conn.cursor() - - # Clear existing auctions table - print("Clearing auctions table...") - cursor.execute("DELETE FROM auctions") - conn.commit() - - # Get all auction pages from cache - cursor.execute(""" - SELECT url, content - FROM cache - WHERE url LIKE '%/a/%' - """) - - auction_pages = cursor.fetchall() - print(f"Found {len(auction_pages)} auction pages in cache") - - total = 0 - inserted = 0 - errors = 0 - - print("Extracting auction data from cached pages...") - - for url, content_blob in auction_pages: - total += 1 - - if total % 10 == 0: - print(f"Processed {total}/{len(auction_pages)}...", end='\r') - - try: - # Decompress and parse __NEXT_DATA__ - content = zlib.decompress(content_blob).decode('utf-8') - match = re.search(r']*id="__NEXT_DATA__"[^>]*>(.+?)', content, re.DOTALL) - - if not match: - errors += 1 - continue - - data = json.loads(match.group(1)) - page_props = data.get('props', {}).get('pageProps', {}) - auction = page_props.get('auction', {}) - - if not auction: - errors += 1 - continue - - # Extract auction data - auction_id = auction.get('displayId') - if not auction_id: - errors += 1 - continue - - title = auction.get('name', '') - - # Get location - location = '' - viewing_days = auction.get('viewingDays', []) - if viewing_days and isinstance(viewing_days, list) and len(viewing_days) > 0: - loc = viewing_days[0] - city = loc.get('city', '') - country = loc.get('countryCode', '').upper() - location = f"{city}, {country}" if city and country else (city or country) - - lots_count = auction.get('lotCount', 0) - - # Get first lot closing time - first_lot_closing = '' - min_end_date = auction.get('minEndDate', '') - if min_end_date: - # Format timestamp - try: - dt = datetime.fromisoformat(min_end_date.replace('Z', '+00:00')) - first_lot_closing = dt.strftime('%Y-%m-%d %H:%M:%S') - except: - first_lot_closing = min_end_date - - scraped_at = datetime.now().isoformat() - - # Insert into auctions table - cursor.execute(""" - INSERT OR REPLACE INTO auctions - (auction_id, url, title, location, lots_count, first_lot_closing_time, scraped_at) - VALUES (?, ?, ?, ?, ?, ?, ?) - """, (auction_id, url, title, location, lots_count, first_lot_closing, scraped_at)) - - inserted += 1 - - except Exception as e: - errors += 1 - continue - - conn.commit() - - print(f"\n\nComplete!") - print(f"Total auction pages processed: {total}") - print(f"Auctions inserted: {inserted}") - print(f"Errors: {errors}") - - # Verify fix - cursor.execute("SELECT COUNT(*) FROM auctions") - total_auctions = cursor.fetchone()[0] - print(f"\nTotal auctions in table: {total_auctions}") - - cursor.execute(""" - SELECT COUNT(*) FROM lots - WHERE auction_id NOT IN (SELECT auction_id FROM auctions) - AND auction_id != '' - """) - orphaned = cursor.fetchone()[0] - - print(f"Orphaned lots remaining: {orphaned}") - - if orphaned == 0: - print("\nSUCCESS! 
All lots now have matching auctions!") - else: - # Show sample of remaining orphans - cursor.execute(""" - SELECT lot_id, auction_id FROM lots - WHERE auction_id NOT IN (SELECT auction_id FROM auctions) - AND auction_id != '' - LIMIT 5 - """) - print("\nSample remaining orphaned lots:") - for lot_id, auction_id in cursor.fetchall(): - print(f" {lot_id} -> auction_id: {auction_id}") - - # Show what auction_ids we do have - cursor.execute("SELECT auction_id FROM auctions LIMIT 10") - print("\nSample auction_ids in auctions table:") - for row in cursor.fetchall(): - print(f" {row[0]}") - - conn.close() - -if __name__ == "__main__": - fix_auctions_table() diff --git a/fix_orphaned_lots.py b/fix_orphaned_lots.py deleted file mode 100644 index c38b969..0000000 --- a/fix_orphaned_lots.py +++ /dev/null @@ -1,136 +0,0 @@ -""" -Fix orphaned lots by updating auction_id from UUID to displayId. -This migration reads cached lot pages and extracts the correct auction displayId. -""" -import sys -import os -sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) - -from cache import CacheManager -import sqlite3 -import zlib -import json -import re - -def fix_orphaned_lots(): - """Update lot auction_id from UUID to auction displayId""" - cache = CacheManager() - conn = sqlite3.connect(cache.db_path) - cursor = conn.cursor() - - # Get all lots that need fixing (have UUID auction_id) - cursor.execute(""" - SELECT l.lot_id, l.auction_id - FROM lots l - WHERE length(l.auction_id) > 20 -- UUID is longer than displayId like "A1-12345" - """) - - lots_to_fix = {lot_id: auction_uuid for lot_id, auction_uuid in cursor.fetchall()} - print(f"Found {len(lots_to_fix)} lots with UUID auction_id that need fixing") - - if not lots_to_fix: - print("No lots to fix!") - conn.close() - return - - # Build mapping from lot displayId to auction displayId from cached pages - print("Building lot displayId -> auction displayId mapping from cache...") - - cursor.execute(""" - SELECT url, content - FROM cache - WHERE url LIKE '%/l/%' - """) - - lot_to_auction_map = {} - total = 0 - errors = 0 - - for url, content_blob in cursor: - total += 1 - - if total % 100 == 0: - print(f"Processing cached pages... {total}", end='\r') - - try: - # Decompress and parse __NEXT_DATA__ - content = zlib.decompress(content_blob).decode('utf-8') - match = re.search(r']*id="__NEXT_DATA__"[^>]*>(.+?)', content, re.DOTALL) - - if not match: - continue - - data = json.loads(match.group(1)) - page_props = data.get('props', {}).get('pageProps', {}) - - lot = page_props.get('lot', {}) - auction = page_props.get('auction', {}) - - if not lot or not auction: - continue - - lot_display_id = lot.get('displayId') - auction_display_id = auction.get('displayId') - - if lot_display_id and auction_display_id: - lot_to_auction_map[lot_display_id] = auction_display_id - - except Exception as e: - errors += 1 - continue - - print(f"\n\nBuilt mapping for {len(lot_to_auction_map)} lots") - print(f"Errors while parsing: {errors}") - - # Now update the lots table - print("\nUpdating lots table...") - updated = 0 - not_found = 0 - - for lot_id, old_auction_uuid in lots_to_fix.items(): - if lot_id in lot_to_auction_map: - new_auction_id = lot_to_auction_map[lot_id] - cursor.execute(""" - UPDATE lots - SET auction_id = ? - WHERE lot_id = ? 
- """, (new_auction_id, lot_id)) - updated += 1 - else: - not_found += 1 - - if (updated + not_found) % 100 == 0: - print(f"Updated: {updated}, not found: {not_found}", end='\r') - - conn.commit() - - print(f"\n\nComplete!") - print(f"Total cached pages processed: {total}") - print(f"Lots updated with auction displayId: {updated}") - print(f"Lots not found in cache: {not_found}") - print(f"Parse errors: {errors}") - - # Verify fix - cursor.execute(""" - SELECT COUNT(*) FROM lots - WHERE auction_id NOT IN (SELECT auction_id FROM auctions) - """) - orphaned = cursor.fetchone()[0] - - print(f"\nOrphaned lots remaining: {orphaned}") - - if orphaned > 0: - # Show sample of remaining orphans - cursor.execute(""" - SELECT lot_id, auction_id FROM lots - WHERE auction_id NOT IN (SELECT auction_id FROM auctions) - LIMIT 5 - """) - print("\nSample remaining orphaned lots:") - for lot_id, auction_id in cursor.fetchall(): - print(f" {lot_id} -> auction_id: {auction_id}") - - conn.close() - -if __name__ == "__main__": - fix_orphaned_lots() diff --git a/inspect_cached_page.py b/inspect_cached_page.py deleted file mode 100644 index ac67672..0000000 --- a/inspect_cached_page.py +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env python3 -"""Extract and inspect __NEXT_DATA__ from a cached lot page""" -import sqlite3 -import zlib -import json -import re - -conn = sqlite3.connect('/mnt/okcomputer/output/cache.db') - -# Get a cached auction page -cursor = conn.execute(""" - SELECT url, content - FROM cache - WHERE url LIKE '%/a/%' - LIMIT 1 -""") - -row = cursor.fetchone() -if not row: - print("No cached lot pages found") - exit(1) - -url, content_blob = row -print(f"Inspecting: {url}\n") - -# Decompress -content = zlib.decompress(content_blob).decode('utf-8') - -# Extract __NEXT_DATA__ -match = re.search(r']*id="__NEXT_DATA__"[^>]*>(.+?)', content, re.DOTALL) -if not match: - print("No __NEXT_DATA__ found") - exit(1) - -data = json.loads(match.group(1)) -page_props = data.get('props', {}).get('pageProps', {}) - -if 'auction' in page_props: - auction = page_props['auction'] - print("AUCTION DATA STRUCTURE:") - print("=" * 60) - print(f"displayId: {auction.get('displayId')}") - print(f"name: {auction.get('name', '')[:50]}...") - print(f"lots count: {len(auction.get('lots', []))}") - - if auction.get('lots'): - lot = auction['lots'][0] - print(f"\nFIRST LOT STRUCTURE:") - print(f" displayId: {lot.get('displayId')}") - print(f" title: {lot.get('title', '')[:50]}...") - print(f"\n BIDDING:") - print(f" currentBid: {lot.get('currentBid')}") - print(f" highestBid: {lot.get('highestBid')}") - print(f" startingBid: {lot.get('startingBid')}") - print(f" minimumBidAmount: {lot.get('minimumBidAmount')}") - print(f" bidCount: {lot.get('bidCount')}") - print(f" numberOfBids: {lot.get('numberOfBids')}") - print(f" TIMING:") - print(f" endDate: {lot.get('endDate')}") - print(f" startDate: {lot.get('startDate')}") - print(f" closingTime: {lot.get('closingTime')}") - print(f" ALL KEYS: {list(lot.keys())}") - - print(f"\nAUCTION TIMING:") - print(f" minEndDate: {auction.get('minEndDate')}") - print(f" maxEndDate: {auction.get('maxEndDate')}") - print(f" ALL KEYS: {list(auction.keys())}") - -conn.close() diff --git a/inspect_lot_html.py b/inspect_lot_html.py deleted file mode 100644 index 3aa8f05..0000000 --- a/inspect_lot_html.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -"""Inspect a lot page HTML to find viewing_time and pickup_date""" -import asyncio -from playwright.async_api import async_playwright - -async def 
main(): - async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) - page = await browser.new_page() - - # Use the known lot - await page.goto("https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102", wait_until='networkidle') - content = await page.content() - - print("Searching for patterns...") - print("="*60) - - # Search for viewing time patterns - import re - patterns = { - 'Bezichtigingen': r'Bezichtigingen.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})', - 'viewing': r'(?i)viewing.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})', - 'Ophalen': r'Ophalen.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})', - 'pickup': r'(?i)pickup.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})', - 'Status': r'Status\s+([^<]+)', - } - - for name, pattern in patterns.items(): - matches = re.findall(pattern, content, re.DOTALL | re.MULTILINE) - if matches: - print(f"\n{name}:") - for match in matches[:3]: - print(f" {match[:200]}") - - # Also look for structured data - print("\n\nSearching for 'Bezichtigingen' section:") - bez_match = re.search(r'Bezichtigingen.*?<.*?>(.*?)', content, re.DOTALL) - if bez_match: - print(bez_match.group(0)[:500]) - - print("\n\nSearching for 'Ophalen' section:") - oph_match = re.search(r'Ophalen.*?<.*?>(.*?)', content, re.DOTALL) - if oph_match: - print(oph_match.group(0)[:500]) - - await browser.close() - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/intercept_api.py b/intercept_api.py deleted file mode 100644 index 43667e7..0000000 --- a/intercept_api.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -"""Intercept API calls to find where lot data comes from""" -import asyncio -import json -from playwright.async_api import async_playwright - -async def main(): - async with async_playwright() as p: - browser = await p.chromium.launch(headless=False) - page = await browser.new_page() - - # Track API calls - api_calls = [] - - async def handle_response(response): - if 'api' in response.url.lower() or 'graphql' in response.url.lower(): - try: - body = await response.json() - api_calls.append({ - 'url': response.url, - 'status': response.status, - 'body': body - }) - print(f"\nAPI CALL: {response.url}") - print(f"Status: {response.status}") - if 'lot' in response.url.lower() or 'auction' in response.url.lower(): - print(f"Body preview: {json.dumps(body, indent=2)[:500]}") - except: - pass - - page.on('response', handle_response) - - # Visit auction page - print("Loading auction page...") - await page.goto("https://www.troostwijkauctions.com/a/woonunits-generatoren-reinigingsmachines-en-zakelijke-goederen-A1-37889", wait_until='networkidle') - - # Wait a bit for lazy loading - await asyncio.sleep(5) - - print(f"\n\nCaptured {len(api_calls)} API calls") - - await browser.close() - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/migrate_existing_data.py b/migrate_existing_data.py deleted file mode 100644 index e390a24..0000000 --- a/migrate_existing_data.py +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/env python3 -""" -Migrate existing lot data to extract missing enriched fields -""" -import sqlite3 -import json -import re -from datetime import datetime -import sys -sys.path.insert(0, 'src') - -from graphql_client import extract_enriched_attributes, extract_attributes_from_lot_json - -DB_PATH = "/mnt/okcomputer/output/cache.db" - -def migrate_lot_attributes(): - """Extract attributes from cached lot pages""" - 
print("="*60) - print("MIGRATING EXISTING LOT DATA") - print("="*60) - - conn = sqlite3.connect(DB_PATH) - - # Get cached lot pages - cursor = conn.execute(""" - SELECT url, content, timestamp - FROM cache - WHERE url LIKE '%/l/%' - ORDER BY timestamp DESC - """) - - import zlib - updated_count = 0 - - for url, content_blob, timestamp in cursor: - try: - # Get lot_id from URL - lot_id_match = re.search(r'/l/.*?([A-Z]\d+-\d+-\d+)', url) - if not lot_id_match: - lot_id_match = re.search(r'([A-Z]\d+-\d+-\d+)', url) - if not lot_id_match: - continue - - lot_id = lot_id_match.group(1) - - # Check if lot exists in database - lot_cursor = conn.execute("SELECT lot_id, title, description FROM lots WHERE lot_id = ?", (lot_id,)) - lot_row = lot_cursor.fetchone() - if not lot_row: - continue - - _, title, description = lot_row - - # Decompress and parse __NEXT_DATA__ - content = zlib.decompress(content_blob).decode('utf-8') - match = re.search(r']*id="__NEXT_DATA__"[^>]*>(.+?)', content, re.DOTALL) - if not match: - continue - - data = json.loads(match.group(1)) - lot_json = data.get('props', {}).get('pageProps', {}).get('lot', {}) - if not lot_json: - continue - - # Extract basic attributes - attrs = extract_attributes_from_lot_json(lot_json) - - # Extract enriched attributes - page_data = {'title': title, 'description': description, 'brand': attrs.get('brand', '')} - enriched = extract_enriched_attributes(lot_json, page_data) - - # Merge - all_attrs = {**attrs, **enriched} - - # Update database - conn.execute(""" - UPDATE lots - SET brand = ?, - model = ?, - attributes_json = ?, - year_manufactured = ?, - condition_score = ?, - condition_description = ?, - serial_number = ?, - manufacturer = ?, - damage_description = ? - WHERE lot_id = ? - """, ( - all_attrs.get('brand', ''), - all_attrs.get('model', ''), - all_attrs.get('attributes_json', ''), - all_attrs.get('year_manufactured'), - all_attrs.get('condition_score'), - all_attrs.get('condition_description', ''), - all_attrs.get('serial_number', ''), - all_attrs.get('manufacturer', ''), - all_attrs.get('damage_description', ''), - lot_id - )) - - updated_count += 1 - if updated_count % 100 == 0: - print(f" Processed {updated_count} lots...") - conn.commit() - - except Exception as e: - print(f" Error processing {url}: {e}") - continue - - conn.commit() - print(f"\n✓ Updated {updated_count} lots with enriched attributes") - - # Show stats - cursor = conn.execute(""" - SELECT - COUNT(*) as total, - SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year, - SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition, - SUM(CASE WHEN manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer, - SUM(CASE WHEN brand != '' THEN 1 ELSE 0 END) as has_brand, - SUM(CASE WHEN model != '' THEN 1 ELSE 0 END) as has_model - FROM lots - """) - stats = cursor.fetchone() - - print(f"\nENRICHMENT STATISTICS:") - print(f" Total lots: {stats[0]:,}") - print(f" Has year: {stats[1]:,} ({100*stats[1]/stats[0]:.1f}%)") - print(f" Has condition: {stats[2]:,} ({100*stats[2]/stats[0]:.1f}%)") - print(f" Has manufacturer: {stats[3]:,} ({100*stats[3]/stats[0]:.1f}%)") - print(f" Has brand: {stats[4]:,} ({100*stats[4]/stats[0]:.1f}%)") - print(f" Has model: {stats[5]:,} ({100*stats[5]/stats[0]:.1f}%)") - - conn.close() - - -def main(): - print("\nStarting migration of existing data...") - print(f"Database: {DB_PATH}\n") - - migrate_lot_attributes() - - print(f"\n{'='*60}") - print("MIGRATION COMPLETE") - print(f"{'='*60}\n") - -if __name__ == 
"__main__": - main() diff --git a/scrape_fresh_auction.py b/scrape_fresh_auction.py deleted file mode 100644 index 61d6d22..0000000 --- a/scrape_fresh_auction.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -"""Scrape a fresh auction page to see the lots array structure""" -import asyncio -import json -import re -from playwright.async_api import async_playwright - -async def main(): - async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) - page = await browser.new_page() - - # Get first auction - await page.goto("https://www.troostwijkauctions.com/auctions", wait_until='networkidle') - content = await page.content() - - # Find first auction link - match = re.search(r'href="(/a/[^"]+)"', content) - if not match: - print("No auction found") - return - - auction_url = f"https://www.troostwijkauctions.com{match.group(1)}" - print(f"Scraping: {auction_url}\n") - - await page.goto(auction_url, wait_until='networkidle') - content = await page.content() - - # Extract __NEXT_DATA__ - match = re.search(r']*id="__NEXT_DATA__"[^>]*>(.+?)', content, re.DOTALL) - if not match: - print("No __NEXT_DATA__ found") - return - - data = json.loads(match.group(1)) - page_props = data.get('props', {}).get('pageProps', {}) - - if 'auction' in page_props: - auction = page_props['auction'] - print(f"Auction: {auction.get('name', '')[:50]}...") - print(f"Lots in array: {len(auction.get('lots', []))}") - - if auction.get('lots'): - lot = auction['lots'][0] - print(f"\nFIRST LOT:") - print(json.dumps(lot, indent=2)[:1500]) - - await browser.close() - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/search_cached_viewing.py b/search_cached_viewing.py deleted file mode 100644 index a5e2441..0000000 --- a/search_cached_viewing.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python3 -"""Search cached pages for viewing/pickup text""" -import sqlite3 -import zlib -import re - -conn = sqlite3.connect('/mnt/okcomputer/output/cache.db') - -cursor = conn.execute(""" - SELECT url, content - FROM cache - WHERE url LIKE '%/l/%' - ORDER BY timestamp DESC - LIMIT 20 -""") - -for url, content_blob in cursor: - try: - content = zlib.decompress(content_blob).decode('utf-8') - - # Look for viewing/pickup patterns - if 'bezichtig' in content.lower() or 'ophalen' in content.lower(): - print(f"\n{'='*60}") - print(f"URL: {url}") - print(f"{'='*60}") - - # Extract sections with context - patterns = [ - (r'(Bezichtigingen?.*?(?:\n.*?){0,5})', 'VIEWING'), - (r'(Ophalen.*?(?:\n.*?){0,5})', 'PICKUP'), - ] - - for pattern, label in patterns: - matches = re.findall(pattern, content, re.IGNORECASE | re.DOTALL) - if matches: - print(f"\n{label}:") - for match in matches[:1]: # First match - # Clean up HTML - clean = re.sub(r'<[^>]+>', ' ', match) - clean = re.sub(r'\s+', ' ', clean).strip() - print(f" {clean[:200]}") - - break # Found one, that's enough - except: - continue - -conn.close() diff --git a/setup_windows_task.ps1 b/setup_windows_task.ps1 deleted file mode 100644 index e30f4a5..0000000 --- a/setup_windows_task.ps1 +++ /dev/null @@ -1,47 +0,0 @@ -# PowerShell script to create Windows Task Scheduler job for Scaev Monitor -# Run as Administrator - -$TaskName = "ScaevAuctionMonitor" -$ScriptPath = "C:\vibe\scaev\src\monitor.py" -$PythonPath = "python3" # Adjust if needed -$WorkingDir = "C:\vibe\scaev" - -# Create the action (run Python script) -$Action = New-ScheduledTaskAction -Execute $PythonPath ` - -Argument "$ScriptPath 30" ` - -WorkingDirectory $WorkingDir - -# Trigger: On 
system startup -$TriggerStartup = New-ScheduledTaskTrigger -AtStartup - -# Settings -$Settings = New-ScheduledTaskSettingsSet ` - -AllowStartIfOnBatteries ` - -DontStopIfGoingOnBatteries ` - -StartWhenAvailable ` - -RestartCount 3 ` - -RestartInterval (New-TimeSpan -Minutes 5) - -# Principal: Run with highest privileges -$Principal = New-ScheduledTaskPrincipal -UserId "SYSTEM" -LogonType ServiceAccount -RunLevel Highest - -# Register the task -Register-ScheduledTask ` - -TaskName $TaskName ` - -Action $Action ` - -Trigger $TriggerStartup ` - -Settings $Settings ` - -Principal $Principal ` - -Description "Scaev auction monitor - polls for new auctions every 30 minutes" ` - -Force - -Write-Host "`nTask '$TaskName' created successfully!" -ForegroundColor Green -Write-Host "`nTo manage the task:" -Write-Host " 1. Open Task Scheduler (taskschd.msc)" -Write-Host " 2. Find 'ScaevAuctionMonitor' in Task Scheduler Library" -Write-Host " 3. Right-click to Run, Stop, or Disable" -Write-Host "`nOr use PowerShell commands:" -Write-Host " Start-ScheduledTask -TaskName '$TaskName'" -Write-Host " Stop-ScheduledTask -TaskName '$TaskName'" -Write-Host " Disable-ScheduledTask -TaskName '$TaskName'" -Write-Host " Get-ScheduledTask -TaskName '$TaskName' | Get-ScheduledTaskInfo" diff --git a/show_migration_stats.py b/show_migration_stats.py deleted file mode 100644 index a04b962..0000000 --- a/show_migration_stats.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -"""Show migration statistics""" -import sqlite3 - -conn = sqlite3.connect('/mnt/okcomputer/output/cache.db') - -cursor = conn.execute(""" - SELECT - COUNT(*) as total, - SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year, - SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition, - SUM(CASE WHEN manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer, - SUM(CASE WHEN brand != '' THEN 1 ELSE 0 END) as has_brand, - SUM(CASE WHEN model != '' THEN 1 ELSE 0 END) as has_model - FROM lots -""") - -stats = cursor.fetchone() - -print("="*60) -print("MIGRATION RESULTS") -print("="*60) -print(f"\nTotal lots: {stats[0]:,}") -print(f"Has year: {stats[1]:,} ({100*stats[1]/stats[0]:.1f}%)") -print(f"Has condition: {stats[2]:,} ({100*stats[2]/stats[0]:.1f}%)") -print(f"Has manufacturer: {stats[3]:,} ({100*stats[3]/stats[0]:.1f}%)") -print(f"Has brand: {stats[4]:,} ({100*stats[4]/stats[0]:.1f}%)") -print(f"Has model: {stats[5]:,} ({100*stats[5]/stats[0]:.1f}%)") - -# Show sample enriched data -print(f"\n{'='*60}") -print("SAMPLE ENRICHED LOTS") -print(f"{'='*60}") - -cursor = conn.execute(""" - SELECT lot_id, year_manufactured, manufacturer, model, condition_score - FROM lots - WHERE year_manufactured IS NOT NULL OR manufacturer != '' - LIMIT 5 -""") - -for row in cursor: - print(f"\n{row[0]}:") - print(f" Year: {row[1]}") - print(f" Manufacturer: {row[2]}") - print(f" Model: {row[3]}") - print(f" Condition: {row[4]}") - -conn.close() diff --git a/src/cache.py b/src/cache.py index 169fe74..8d182a0 100644 --- a/src/cache.py +++ b/src/cache.py @@ -19,8 +19,9 @@ class CacheManager: self._init_db() def _init_db(self): - """Initialize cache and data storage database""" + """Initialize cache and data storage database with consolidated schema""" with sqlite3.connect(self.db_path) as conn: + # Cache table conn.execute(""" CREATE TABLE IF NOT EXISTS cache ( url TEXT PRIMARY KEY, @@ -32,6 +33,8 @@ class CacheManager: conn.execute(""" CREATE INDEX IF NOT EXISTS idx_timestamp ON cache(timestamp) """) + + # Auctions table - 
consolidated schema conn.execute(""" CREATE TABLE IF NOT EXISTS auctions ( auction_id TEXT PRIMARY KEY, @@ -40,9 +43,18 @@ class CacheManager: location TEXT, lots_count INTEGER, first_lot_closing_time TEXT, - scraped_at TEXT + scraped_at TEXT, + city TEXT, + country TEXT, + type TEXT, + lot_count INTEGER DEFAULT 0, + closing_time TEXT, + discovered_at INTEGER ) """) + conn.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country)") + + # Lots table - consolidated schema with all fields from working database conn.execute(""" CREATE TABLE IF NOT EXISTS lots ( lot_id TEXT PRIMARY KEY, @@ -50,8 +62,6 @@ class CacheManager: url TEXT UNIQUE, title TEXT, current_bid TEXT, - starting_bid TEXT, - minimum_bid TEXT, bid_count INTEGER, closing_time TEXT, viewing_time TEXT, @@ -60,9 +70,54 @@ class CacheManager: description TEXT, category TEXT, scraped_at TEXT, + sale_id INTEGER, + manufacturer TEXT, + type TEXT, + year INTEGER, + currency TEXT DEFAULT 'EUR', + closing_notified INTEGER DEFAULT 0, + starting_bid TEXT, + minimum_bid TEXT, + status TEXT, + brand TEXT, + model TEXT, + attributes_json TEXT, + first_bid_time TEXT, + last_bid_time TEXT, + bid_velocity REAL, + bid_increment REAL, + year_manufactured INTEGER, + condition_score REAL, + condition_description TEXT, + serial_number TEXT, + damage_description TEXT, + followers_count INTEGER DEFAULT 0, + estimated_min_price REAL, + estimated_max_price REAL, + lot_condition TEXT, + appearance TEXT, + estimated_min REAL, + estimated_max REAL, + next_bid_step_cents INTEGER, + condition TEXT, + category_path TEXT, + city_location TEXT, + country_code TEXT, + bidding_status TEXT, + packaging TEXT, + quantity INTEGER, + vat REAL, + buyer_premium_percentage REAL, + remarks TEXT, + reserve_price REAL, + reserve_met INTEGER, + view_count INTEGER, FOREIGN KEY (auction_id) REFERENCES auctions(auction_id) ) """) + conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_sale_id ON lots(sale_id)") + + # Images table conn.execute(""" CREATE TABLE IF NOT EXISTS images ( id INTEGER PRIMARY KEY AUTOINCREMENT, @@ -70,86 +125,28 @@ class CacheManager: url TEXT, local_path TEXT, downloaded INTEGER DEFAULT 0, + labels TEXT, + processed_at INTEGER, FOREIGN KEY (lot_id) REFERENCES lots(lot_id) ) """) + conn.execute("CREATE INDEX IF NOT EXISTS idx_images_lot_id ON images(lot_id)") - # Add new columns to auctions table if they don't exist - cursor = conn.execute("PRAGMA table_info(auctions)") - auction_columns = {row[1] for row in cursor.fetchall()} + # Remove duplicates before creating unique index + conn.execute(""" + DELETE FROM images + WHERE id NOT IN ( + SELECT MIN(id) + FROM images + GROUP BY lot_id, url + ) + """) + conn.execute(""" + CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url + ON images(lot_id, url) + """) - if 'city' not in auction_columns: - conn.execute("ALTER TABLE auctions ADD COLUMN city TEXT") - if 'country' not in auction_columns: - conn.execute("ALTER TABLE auctions ADD COLUMN country TEXT") - if 'type' not in auction_columns: - conn.execute("ALTER TABLE auctions ADD COLUMN type TEXT") - if 'lot_count' not in auction_columns: - conn.execute("ALTER TABLE auctions ADD COLUMN lot_count INTEGER DEFAULT 0") - if 'closing_time' not in auction_columns: - conn.execute("ALTER TABLE auctions ADD COLUMN closing_time TEXT") - if 'discovered_at' not in auction_columns: - conn.execute("ALTER TABLE auctions ADD COLUMN discovered_at INTEGER") - - # Add index for country filtering - conn.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON 
auctions(country)") - - # Add new columns to lots table if they don't exist - cursor = conn.execute("PRAGMA table_info(lots)") - columns = {row[1] for row in cursor.fetchall()} - - if 'starting_bid' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN starting_bid TEXT") - if 'minimum_bid' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN minimum_bid TEXT") - if 'status' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN status TEXT") - if 'brand' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN brand TEXT") - if 'model' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN model TEXT") - if 'attributes_json' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN attributes_json TEXT") - - # Bidding intelligence fields - if 'first_bid_time' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN first_bid_time TEXT") - if 'last_bid_time' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN last_bid_time TEXT") - if 'bid_velocity' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN bid_velocity REAL") - if 'bid_increment' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN bid_increment REAL") - - # Valuation intelligence fields - if 'year_manufactured' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN year_manufactured INTEGER") - if 'condition_score' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN condition_score REAL") - if 'condition_description' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN condition_description TEXT") - if 'serial_number' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN serial_number TEXT") - if 'manufacturer' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN manufacturer TEXT") - if 'damage_description' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN damage_description TEXT") - - # NEW: High-value API fields - if 'followers_count' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN followers_count INTEGER DEFAULT 0") - if 'estimated_min_price' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN estimated_min_price REAL") - if 'estimated_max_price' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN estimated_max_price REAL") - if 'lot_condition' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN lot_condition TEXT") - if 'appearance' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN appearance TEXT") - if 'scraped_at_timestamp' not in columns: - conn.execute("ALTER TABLE lots ADD COLUMN scraped_at_timestamp INTEGER") - - # Create bid_history table + # Bid history table conn.execute(""" CREATE TABLE IF NOT EXISTS bid_history ( id INTEGER PRIMARY KEY AUTOINCREMENT, @@ -163,33 +160,15 @@ class CacheManager: FOREIGN KEY (lot_id) REFERENCES lots(lot_id) ) """) - conn.execute(""" CREATE INDEX IF NOT EXISTS idx_bid_history_lot_time ON bid_history(lot_id, bid_time) """) - conn.execute(""" CREATE INDEX IF NOT EXISTS idx_bid_history_bidder ON bid_history(bidder_id) """) - # Remove duplicates before creating unique index - # Keep the row with the smallest id (first occurrence) for each (lot_id, url) pair - conn.execute(""" - DELETE FROM images - WHERE id NOT IN ( - SELECT MIN(id) - FROM images - GROUP BY lot_id, url - ) - """) - - # Now create the unique index - conn.execute(""" - CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url - ON images(lot_id, url) - """) conn.commit() def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]: diff --git a/sync_updates.py b/sync_updates.py 
deleted file mode 100644 index 56a284f..0000000 --- a/sync_updates.py +++ /dev/null @@ -1,256 +0,0 @@ -#!/usr/bin/env python3 -""" -Sync local database updates to server-compatible format -Creates incremental exports with only NEW or UPDATED records -""" - -import sqlite3 -import json -import csv -from datetime import datetime -from pathlib import Path - -DB_PATH = "C:/mnt/okcomputer/output/cache.db" -OUTPUT_DIR = Path("C:/mnt/okcomputer/output") - -def fill_missing_auction_fields(): - """Fill in missing fields in auctions table from scraped data""" - conn = sqlite3.connect(DB_PATH) - cursor = conn.cursor() - - print("Filling missing auction fields...") - - # Update closing_time from first_lot_closing_time - cursor.execute(""" - UPDATE auctions - SET closing_time = first_lot_closing_time - WHERE closing_time IS NULL AND first_lot_closing_time IS NOT NULL - """) - updated = cursor.rowcount - print(f" ✓ Updated {updated} closing_time fields") - - # Parse location to extract city and country - cursor.execute(""" - SELECT auction_id, location - FROM auctions - WHERE location IS NOT NULL AND (city IS NULL OR country IS NULL) - """) - locations = cursor.fetchall() - - city_updates = 0 - for auction_id, location in locations: - if not location: - continue - - # Parse "City, COUNTRY" or "City, Region, COUNTRY" - parts = [p.strip() for p in location.split(',')] - if len(parts) >= 2: - city = parts[0] - country = parts[-1] - - cursor.execute(""" - UPDATE auctions - SET city = ?, country = ? - WHERE auction_id = ? - """, (city, country, auction_id)) - city_updates += 1 - - print(f" ✓ Updated {city_updates} city/country fields") - - # Set type to 'online' for all (Troostwijk is online platform) - cursor.execute(""" - UPDATE auctions - SET type = 'online' - WHERE type IS NULL - """) - type_updates = cursor.rowcount - print(f" ✓ Updated {type_updates} type fields") - - conn.commit() - conn.close() - - print(f"✓ Auction fields updated\n") - -def get_last_sync_timestamp(): - """Get timestamp of last successful sync""" - sync_file = OUTPUT_DIR / ".last_sync" - if sync_file.exists(): - return int(sync_file.read_text().strip()) - return 0 - -def save_sync_timestamp(timestamp: int): - """Save timestamp of successful sync""" - sync_file = OUTPUT_DIR / ".last_sync" - sync_file.write_text(str(timestamp)) - -def export_incremental(): - """Export only records that are new or updated since last sync""" - conn = sqlite3.connect(DB_PATH) - conn.row_factory = sqlite3.Row - cursor = conn.cursor() - - last_sync = get_last_sync_timestamp() - current_time = int(datetime.now().timestamp()) - - print(f"Last sync: {datetime.fromtimestamp(last_sync).strftime('%Y-%m-%d %H:%M:%S') if last_sync else 'Never'}") - print(f"Current time: {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}") - - # Get new/updated auctions - cursor.execute(""" - SELECT * FROM auctions - WHERE discovered_at IS NULL OR discovered_at > ? - ORDER BY auction_id - """, (last_sync,)) - new_auctions = [dict(row) for row in cursor.fetchall()] - - # Get new/updated lots - cursor.execute(""" - SELECT * FROM lots - WHERE scraped_at_timestamp IS NULL OR scraped_at_timestamp > ? 
- ORDER BY lot_id - """, (last_sync,)) - new_lots = [dict(row) for row in cursor.fetchall()] - - conn.close() - - # Export to timestamped files - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - - results = { - 'auctions': 0, - 'lots': 0, - 'files': {} - } - - # Export auctions if any new - if new_auctions: - auctions_csv = OUTPUT_DIR / f'auctions_update_{timestamp}.csv' - auctions_json = OUTPUT_DIR / f'auctions_update_{timestamp}.json' - - with open(auctions_csv, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=new_auctions[0].keys()) - writer.writeheader() - writer.writerows(new_auctions) - - with open(auctions_json, 'w', encoding='utf-8') as f: - json.dump(new_auctions, f, indent=2, ensure_ascii=False) - - results['auctions'] = len(new_auctions) - results['files']['auctions_csv'] = str(auctions_csv) - results['files']['auctions_json'] = str(auctions_json) - - print(f"\n✓ Exported {len(new_auctions)} new/updated auctions") - print(f" CSV: {auctions_csv}") - print(f" JSON: {auctions_json}") - - # Export lots if any new - if new_lots: - lots_csv = OUTPUT_DIR / f'lots_update_{timestamp}.csv' - lots_json = OUTPUT_DIR / f'lots_update_{timestamp}.json' - - with open(lots_csv, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=new_lots[0].keys()) - writer.writeheader() - writer.writerows(new_lots) - - with open(lots_json, 'w', encoding='utf-8') as f: - json.dump(new_lots, f, indent=2, ensure_ascii=False) - - results['lots'] = len(new_lots) - results['files']['lots_csv'] = str(lots_csv) - results['files']['lots_json'] = str(lots_json) - - print(f"\n✓ Exported {len(new_lots)} new/updated lots") - print(f" CSV: {lots_csv}") - print(f" JSON: {lots_json}") - - if not new_auctions and not new_lots: - print("\n✓ No new updates since last sync") - - return results - -def create_upsert_export(): - """Create SQL script for server to UPSERT (update or insert) data""" - conn = sqlite3.connect(DB_PATH) - conn.row_factory = sqlite3.Row - cursor = conn.cursor() - - last_sync = get_last_sync_timestamp() - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - - # Get new/updated auctions - cursor.execute(""" - SELECT * FROM auctions - WHERE discovered_at IS NULL OR discovered_at > ? 
- """, (last_sync,)) - new_auctions = [dict(row) for row in cursor.fetchall()] - - if new_auctions: - sql_file = OUTPUT_DIR / f'upsert_auctions_{timestamp}.sql' - - with open(sql_file, 'w', encoding='utf-8') as f: - f.write("-- UPSERT script for auctions (updates existing, inserts new)\n\n") - - for auction in new_auctions: - # Create INSERT OR REPLACE statement - columns = list(auction.keys()) - placeholders = [] - - for col, val in auction.items(): - if val is None: - placeholders.append("NULL") - elif isinstance(val, (int, float)): - placeholders.append(str(val)) - else: - # Escape single quotes - escaped = str(val).replace("'", "''") - placeholders.append(f"'{escaped}'") - - f.write(f"INSERT OR REPLACE INTO auctions ({', '.join(columns)})\n") - f.write(f"VALUES ({', '.join(placeholders)});\n\n") - - print(f"\n✓ Created UPSERT SQL script: {sql_file}") - print(f" Server can execute this to avoid constraint errors") - - conn.close() - -def main(): - """Main sync process""" - print("="*60) - print("DATABASE SYNC UTILITY") - print("="*60) - print(f"Database: {DB_PATH}") - print(f"Output: {OUTPUT_DIR}") - print("="*60) - - # Step 1: Fill missing fields - fill_missing_auction_fields() - - # Step 2: Export incremental updates - print("Exporting incremental updates...") - results = export_incremental() - - # Step 3: Create UPSERT SQL (prevents constraint errors on server) - if results['auctions'] > 0: - create_upsert_export() - - # Step 4: Save sync timestamp - current_time = int(datetime.now().timestamp()) - save_sync_timestamp(current_time) - - print("\n" + "="*60) - print("SYNC COMPLETE") - print("="*60) - print(f"New auctions: {results['auctions']}") - print(f"New lots: {results['lots']}") - - if results['files']: - print("\nFiles ready for server import:") - for key, path in results['files'].items(): - print(f" {key}: {path}") - - print("\nNext sync will only export records newer than:") - print(f" {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}") - -if __name__ == "__main__": - main() diff --git a/test_auction_fetch.py b/test_auction_fetch.py deleted file mode 100644 index 1888978..0000000 --- a/test_auction_fetch.py +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env python3 -"""Test auction data fetch""" -import asyncio -import json -import sys -sys.path.insert(0, 'src') - -from graphql_client import fetch_auction_data, format_auction_data - -async def main(): - auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa" - - print(f"Fetching auction: {auction_id}\n") - auction_data = await fetch_auction_data(auction_id) - - if auction_data: - print("Raw Auction Data:") - print(json.dumps(auction_data, indent=2)) - - print("\n\nFormatted:") - formatted = format_auction_data(auction_data) - print(f"Viewing: {formatted['viewing_time']}") - print(f"Pickup: {formatted['pickup_date']}") - else: - print("No auction data returned") - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/test_auction_query.py b/test_auction_query.py deleted file mode 100644 index bfc8b08..0000000 --- a/test_auction_query.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 -"""Test if the auction query works at all""" -import asyncio -import aiohttp -import json - -GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql" - -# Try a simpler query first -SIMPLE_QUERY = """ -query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) 
{ - auction(id: $auctionId, locale: $locale, platform: $platform) { - id - displayId - viewingDays { - startDate - endDate - city - countryCode - } - collectionDays { - startDate - endDate - city - countryCode - } - } -} -""" - -async def main(): - auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa" - - variables = { - "auctionId": auction_id, - "locale": "nl", - "platform": "TWK" - } - - payload = { - "query": SIMPLE_QUERY, - "variables": variables - } - - async with aiohttp.ClientSession() as session: - async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response: - print(f"Status: {response.status}") - text = await response.text() - print(f"Response: {text}") - - try: - data = await response.json() - print(f"\nParsed:") - print(json.dumps(data, indent=2)) - except: - pass - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/test_comprehensive.py b/test_comprehensive.py deleted file mode 100644 index 561f4da..0000000 --- a/test_comprehensive.py +++ /dev/null @@ -1,95 +0,0 @@ -#!/usr/bin/env python3 -"""Test comprehensive data enrichment""" -import asyncio -import sys -sys.path.insert(0, 'src') - -from scraper import TroostwijkScraper - -async def main(): - scraper = TroostwijkScraper() - - from playwright.async_api import async_playwright - - async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) - page = await browser.new_page( - viewport={'width': 1920, 'height': 1080}, - user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' - ) - - # Test with lot that has bids - lot_url = "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5" - - print(f"Testing comprehensive extraction\n") - result = await scraper.crawl_page(page, lot_url) - - if result: - print(f"\n{'='*60}") - print("COMPREHENSIVE DATA EXTRACTION:") - print(f"{'='*60}") - print(f"Lot ID: {result.get('lot_id')}") - print(f"Title: {result.get('title', '')[:50]}...") - print(f"\n[Bidding Intelligence]") - print(f" Status: {result.get('status')}") - print(f" Current Bid: {result.get('current_bid')}") - print(f" Starting Bid: {result.get('starting_bid')}") - print(f" Bid Increment: EUR {result.get('bid_increment', 0):.2f}") - print(f" Bid Count: {result.get('bid_count')}") - print(f" First Bid: {result.get('first_bid_time', 'N/A')}") - print(f" Last Bid: {result.get('last_bid_time', 'N/A')}") - print(f" Bid Velocity: {result.get('bid_velocity', 0)} bids/hour") - print(f"\n[Valuation Intelligence]") - print(f" Brand: {result.get('brand', 'N/A')}") - print(f" Model: {result.get('model', 'N/A')}") - print(f" Year: {result.get('year_manufactured', 'N/A')}") - print(f" Manufacturer: {result.get('manufacturer', 'N/A')}") - print(f" Condition Score: {result.get('condition_score', 'N/A')}") - print(f" Condition: {result.get('condition_description', 'N/A')}") - print(f" Serial#: {result.get('serial_number', 'N/A')}") - print(f" Damage: {result.get('damage_description', 'N/A')[:50] if result.get('damage_description') else 'N/A'}...") - - await browser.close() - - # Verify database - import sqlite3 - conn = sqlite3.connect('/mnt/okcomputer/output/cache.db') - - # Check lot data - cursor = conn.execute(""" - SELECT bid_velocity, first_bid_time, year_manufactured, condition_score - FROM lots - WHERE lot_id = ? 
- """, (result.get('lot_id'),)) - row = cursor.fetchone() - - if row: - print(f"\n{'='*60}") - print("DATABASE VERIFICATION (lots table):") - print(f"{'='*60}") - print(f" Bid Velocity: {row[0]}") - print(f" First Bid Time: {row[1]}") - print(f" Year: {row[2]}") - print(f" Condition Score: {row[3]}") - - # Check bid history - cursor = conn.execute(""" - SELECT COUNT(*), MIN(bid_time), MAX(bid_time), SUM(is_autobid) - FROM bid_history - WHERE lot_id = ? - """, (result.get('lot_id'),)) - row = cursor.fetchone() - - if row and row[0] > 0: - print(f"\n{'='*60}") - print("DATABASE VERIFICATION (bid_history table):") - print(f"{'='*60}") - print(f" Total Bids Stored: {row[0]}") - print(f" First Bid: {row[1]}") - print(f" Last Bid: {row[2]}") - print(f" Autobids: {row[3]}") - - conn.close() - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/test_concurrent_images.py b/test_concurrent_images.py deleted file mode 100644 index 8d24e15..0000000 --- a/test_concurrent_images.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -"""Test concurrent image downloads""" -import asyncio -import time -import sys -sys.path.insert(0, 'src') - -from scraper import TroostwijkScraper - -async def main(): - scraper = TroostwijkScraper() - - from playwright.async_api import async_playwright - - async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) - page = await browser.new_page( - viewport={'width': 1920, 'height': 1080}, - user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' - ) - - # Test with a lot that has multiple images - lot_url = "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5" - - print(f"Testing concurrent image downloads\n") - print(f"Lot: {lot_url}\n") - - start_time = time.time() - result = await scraper.crawl_page(page, lot_url) - elapsed = time.time() - start_time - - print(f"\n{'='*60}") - print(f"TIMING RESULTS:") - print(f"{'='*60}") - print(f"Total time: {elapsed:.2f}s") - - image_count = len(result.get('images', [])) - print(f"Images: {image_count}") - - if image_count > 1: - print(f"Time per image: {elapsed/image_count:.2f}s (if sequential)") - print(f"Actual time: {elapsed:.2f}s (concurrent!)") - speedup = (image_count * 0.5) / elapsed if elapsed > 0 else 1 - print(f"Speedup factor: {speedup:.1f}x") - - await browser.close() - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/test_full_scraper.py b/test_full_scraper.py deleted file mode 100644 index bb7cacd..0000000 --- a/test_full_scraper.py +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python3 -"""Test the full scraper with one lot""" -import asyncio -import sys -sys.path.insert(0, 'src') - -from scraper import TroostwijkScraper - -async def main(): - scraper = TroostwijkScraper() - - from playwright.async_api import async_playwright - - async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) - page = await browser.new_page( - viewport={'width': 1920, 'height': 1080}, - user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' - ) - - # Test with a known lot - lot_url = "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5" - - print(f"Testing with: {lot_url}\n") - result = await scraper.crawl_page(page, lot_url) - - if result: - print(f"\n{'='*60}") - print("FINAL RESULT:") - print(f"{'='*60}") - print(f"Lot ID: {result.get('lot_id')}") - print(f"Title: {result.get('title', '')[:50]}...") - print(f"Current Bid: {result.get('current_bid')}") - 
print(f"Starting Bid: {result.get('starting_bid')}") - print(f"Minimum Bid: {result.get('minimum_bid')}") - print(f"Bid Count: {result.get('bid_count')}") - print(f"Closing Time: {result.get('closing_time')}") - print(f"Viewing Time: {result.get('viewing_time', 'N/A')}") - print(f"Pickup Date: {result.get('pickup_date', 'N/A')}") - print(f"Location: {result.get('location')}") - - await browser.close() - - # Verify database - import sqlite3 - conn = sqlite3.connect('/mnt/okcomputer/output/cache.db') - cursor = conn.execute(""" - SELECT current_bid, starting_bid, minimum_bid, bid_count, closing_time - FROM lots - WHERE lot_id = 'A1-28505-5' - """) - row = cursor.fetchone() - conn.close() - - if row: - print(f"\n{'='*60}") - print("DATABASE VERIFICATION:") - print(f"{'='*60}") - print(f"Current Bid: {row[0]}") - print(f"Starting Bid: {row[1]}") - print(f"Minimum Bid: {row[2]}") - print(f"Bid Count: {row[3]}") - print(f"Closing Time: {row[4]}") - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/test_graphql_scraper.py b/test_graphql_scraper.py deleted file mode 100644 index 71eda86..0000000 --- a/test_graphql_scraper.py +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env python3 -"""Test the updated scraper with GraphQL integration""" -import asyncio -import sys -sys.path.insert(0, 'src') - -from graphql_client import fetch_lot_bidding_data, format_bid_data - -async def main(): - # Test with known lot ID - lot_id = "A1-28505-5" - - print(f"Testing GraphQL API with lot: {lot_id}\n") - - bidding_data = await fetch_lot_bidding_data(lot_id) - - if bidding_data: - print("Raw GraphQL Response:") - print("="*60) - import json - print(json.dumps(bidding_data, indent=2)) - - print("\n\nFormatted Data:") - print("="*60) - formatted = format_bid_data(bidding_data) - for key, value in formatted.items(): - print(f" {key}: {value}") - else: - print("Failed to fetch bidding data") - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/test_live_lot.py b/test_live_lot.py deleted file mode 100644 index 78096ee..0000000 --- a/test_live_lot.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python3 -"""Test scraping a single live lot page""" -import asyncio -import sys -sys.path.insert(0, 'src') - -from scraper import TroostwijkScraper - -async def main(): - scraper = TroostwijkScraper() - - from playwright.async_api import async_playwright - - async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) - page = await browser.new_page() - - # Get a lot URL from the database - import sqlite3 - conn = sqlite3.connect('/mnt/okcomputer/output/cache.db') - cursor = conn.execute("SELECT url FROM lots LIMIT 1") - row = cursor.fetchone() - conn.close() - - if not row: - print("No lots in database") - return - - lot_url = row[0] - print(f"Fetching: {lot_url}\n") - - result = await scraper.crawl_page(page, lot_url) - - if result: - print(f"\nExtracted Data:") - print(f" current_bid: {result.get('current_bid')}") - print(f" bid_count: {result.get('bid_count')}") - print(f" closing_time: {result.get('closing_time')}") - - await browser.close() - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/test_new_fields.py b/test_new_fields.py deleted file mode 100644 index 4f49b43..0000000 --- a/test_new_fields.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -"""Test the new fields extraction""" -import asyncio -import sys -sys.path.insert(0, 'src') - -from scraper import TroostwijkScraper - -async def main(): - scraper = TroostwijkScraper() - - from 
playwright.async_api import async_playwright - - async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) - page = await browser.new_page( - viewport={'width': 1920, 'height': 1080}, - user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' - ) - - # Test with lot that has attributes - lot_url = "https://www.troostwijkauctions.com/l/47-5kg-hexagon-dumbbell-%25282x%2529-A1-40668-34" - - print(f"Testing new fields with: {lot_url}\n") - result = await scraper.crawl_page(page, lot_url) - - if result: - print(f"\n{'='*60}") - print("EXTRACTED FIELDS:") - print(f"{'='*60}") - print(f"Lot ID: {result.get('lot_id')}") - print(f"Title: {result.get('title', '')[:50]}...") - print(f"Status: {result.get('status')}") - print(f"Brand: {result.get('brand')}") - print(f"Model: {result.get('model')}") - print(f"Viewing Time: {result.get('viewing_time', 'N/A')}") - print(f"Pickup Date: {result.get('pickup_date', 'N/A')}") - print(f"Attributes: {result.get('attributes_json', '')[:100]}...") - - await browser.close() - - # Verify database - import sqlite3 - conn = sqlite3.connect('/mnt/okcomputer/output/cache.db') - cursor = conn.execute(""" - SELECT status, brand, model, viewing_time, pickup_date - FROM lots - WHERE lot_id = ? - """, (result.get('lot_id'),)) - row = cursor.fetchone() - conn.close() - - if row: - print(f"\n{'='*60}") - print("DATABASE VERIFICATION:") - print(f"{'='*60}") - print(f"Status: {row[0]}") - print(f"Brand: {row[1]}") - print(f"Model: {row[2]}") - print(f"Viewing: {row[3][:100] if row[3] else 'N/A'}...") - print(f"Pickup: {row[4][:100] if row[4] else 'N/A'}...") - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/validate_data.py b/validate_data.py deleted file mode 100644 index 78e3f22..0000000 --- a/validate_data.py +++ /dev/null @@ -1,306 +0,0 @@ -""" -Validate data quality and completeness in the database. -Checks if scraped data matches expectations and API capabilities. 
-""" -import sys -import os -sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) - -import sqlite3 -from datetime import datetime -from typing import Dict, List, Tuple -from cache import CacheManager - -cache = CacheManager() -DB_PATH = cache.db_path - -def get_db_stats() -> Dict: - """Get comprehensive database statistics""" - conn = sqlite3.connect(DB_PATH) - cursor = conn.cursor() - - stats = {} - - # Total counts - stats['total_auctions'] = cursor.execute("SELECT COUNT(*) FROM auctions").fetchone()[0] - stats['total_lots'] = cursor.execute("SELECT COUNT(*) FROM lots").fetchone()[0] - stats['total_images'] = cursor.execute("SELECT COUNT(*) FROM images").fetchone()[0] - stats['total_bid_history'] = cursor.execute("SELECT COUNT(*) FROM bid_history").fetchone()[0] - - # Auctions completeness - cursor.execute(""" - SELECT - COUNT(*) as total, - SUM(CASE WHEN title IS NOT NULL AND title != '' THEN 1 ELSE 0 END) as has_title, - SUM(CASE WHEN lots_count IS NOT NULL THEN 1 ELSE 0 END) as has_lots_count, - SUM(CASE WHEN closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_closing_time, - SUM(CASE WHEN first_lot_closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_first_lot_closing - FROM auctions - """) - row = cursor.fetchone() - stats['auctions'] = { - 'total': row[0], - 'has_title': row[1], - 'has_lots_count': row[2], - 'has_closing_time': row[3], - 'has_first_lot_closing': row[4] - } - - # Lots completeness - Core fields - cursor.execute(""" - SELECT - COUNT(*) as total, - SUM(CASE WHEN title IS NOT NULL AND title != '' THEN 1 ELSE 0 END) as has_title, - SUM(CASE WHEN current_bid IS NOT NULL THEN 1 ELSE 0 END) as has_current_bid, - SUM(CASE WHEN starting_bid IS NOT NULL THEN 1 ELSE 0 END) as has_starting_bid, - SUM(CASE WHEN minimum_bid IS NOT NULL THEN 1 ELSE 0 END) as has_minimum_bid, - SUM(CASE WHEN bid_count IS NOT NULL AND bid_count > 0 THEN 1 ELSE 0 END) as has_bids, - SUM(CASE WHEN closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_closing_time, - SUM(CASE WHEN status IS NOT NULL AND status != '' THEN 1 ELSE 0 END) as has_status - FROM lots - """) - row = cursor.fetchone() - stats['lots_core'] = { - 'total': row[0], - 'has_title': row[1], - 'has_current_bid': row[2], - 'has_starting_bid': row[3], - 'has_minimum_bid': row[4], - 'has_bids': row[5], - 'has_closing_time': row[6], - 'has_status': row[7] - } - - # Lots completeness - Enriched fields - cursor.execute(""" - SELECT - COUNT(*) as total, - SUM(CASE WHEN brand IS NOT NULL AND brand != '' THEN 1 ELSE 0 END) as has_brand, - SUM(CASE WHEN model IS NOT NULL AND model != '' THEN 1 ELSE 0 END) as has_model, - SUM(CASE WHEN manufacturer IS NOT NULL AND manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer, - SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year, - SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition_score, - SUM(CASE WHEN condition_description IS NOT NULL AND condition_description != '' THEN 1 ELSE 0 END) as has_condition_desc, - SUM(CASE WHEN serial_number IS NOT NULL AND serial_number != '' THEN 1 ELSE 0 END) as has_serial, - SUM(CASE WHEN damage_description IS NOT NULL AND damage_description != '' THEN 1 ELSE 0 END) as has_damage - FROM lots - """) - row = cursor.fetchone() - stats['lots_enriched'] = { - 'total': row[0], - 'has_brand': row[1], - 'has_model': row[2], - 'has_manufacturer': row[3], - 'has_year': row[4], - 'has_condition_score': row[5], - 'has_condition_desc': row[6], - 'has_serial': row[7], - 'has_damage': row[8] - } - - # Lots completeness - 
Bid intelligence - cursor.execute(""" - SELECT - COUNT(*) as total, - SUM(CASE WHEN first_bid_time IS NOT NULL THEN 1 ELSE 0 END) as has_first_bid_time, - SUM(CASE WHEN last_bid_time IS NOT NULL THEN 1 ELSE 0 END) as has_last_bid_time, - SUM(CASE WHEN bid_velocity IS NOT NULL THEN 1 ELSE 0 END) as has_bid_velocity, - SUM(CASE WHEN bid_increment IS NOT NULL THEN 1 ELSE 0 END) as has_bid_increment - FROM lots - """) - row = cursor.fetchone() - stats['lots_bid_intelligence'] = { - 'total': row[0], - 'has_first_bid_time': row[1], - 'has_last_bid_time': row[2], - 'has_bid_velocity': row[3], - 'has_bid_increment': row[4] - } - - # Bid history stats - cursor.execute(""" - SELECT - COUNT(DISTINCT lot_id) as lots_with_history, - COUNT(*) as total_bids, - SUM(CASE WHEN is_autobid = 1 THEN 1 ELSE 0 END) as autobids, - SUM(CASE WHEN bidder_id IS NOT NULL THEN 1 ELSE 0 END) as has_bidder_id - FROM bid_history - """) - row = cursor.fetchone() - stats['bid_history'] = { - 'lots_with_history': row[0], - 'total_bids': row[1], - 'autobids': row[2], - 'has_bidder_id': row[3] - } - - # Image stats - cursor.execute(""" - SELECT - COUNT(DISTINCT lot_id) as lots_with_images, - COUNT(*) as total_images, - SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded_images, - SUM(CASE WHEN local_path IS NOT NULL THEN 1 ELSE 0 END) as has_local_path - FROM images - """) - row = cursor.fetchone() - stats['images'] = { - 'lots_with_images': row[0], - 'total_images': row[1], - 'downloaded_images': row[2], - 'has_local_path': row[3] - } - - conn.close() - return stats - -def check_data_quality() -> List[Tuple[str, str, str]]: - """Check for data quality issues""" - issues = [] - conn = sqlite3.connect(DB_PATH) - cursor = conn.cursor() - - # Check for lots without auction - cursor.execute(""" - SELECT COUNT(*) FROM lots - WHERE auction_id NOT IN (SELECT auction_id FROM auctions) - """) - orphaned_lots = cursor.fetchone()[0] - if orphaned_lots > 0: - issues.append(("ERROR", "Orphaned Lots", f"{orphaned_lots} lots without matching auction")) - - # Check for lots with bids but no bid history - cursor.execute(""" - SELECT COUNT(*) FROM lots - WHERE bid_count > 0 - AND lot_id NOT IN (SELECT DISTINCT lot_id FROM bid_history) - """) - missing_history = cursor.fetchone()[0] - if missing_history > 0: - issues.append(("WARNING", "Missing Bid History", f"{missing_history} lots have bids but no bid history records")) - - # Check for lots with closing time in past but still active - cursor.execute(""" - SELECT COUNT(*) FROM lots - WHERE closing_time IS NOT NULL - AND closing_time < datetime('now') - AND status NOT LIKE '%gesloten%' - """) - past_closing = cursor.fetchone()[0] - if past_closing > 0: - issues.append(("INFO", "Past Closing Time", f"{past_closing} lots have closing time in past")) - - # Check for duplicate lot_ids - cursor.execute(""" - SELECT lot_id, COUNT(*) FROM lots - GROUP BY lot_id - HAVING COUNT(*) > 1 - """) - duplicates = cursor.fetchall() - if duplicates: - issues.append(("ERROR", "Duplicate Lot IDs", f"{len(duplicates)} duplicate lot_id values found")) - - # Check for lots without images - cursor.execute(""" - SELECT COUNT(*) FROM lots - WHERE lot_id NOT IN (SELECT DISTINCT lot_id FROM images) - """) - no_images = cursor.fetchone()[0] - if no_images > 0: - issues.append(("WARNING", "No Images", f"{no_images} lots have no images")) - - conn.close() - return issues - -def print_validation_report(): - """Print comprehensive validation report""" - print("=" * 80) - print("DATABASE VALIDATION REPORT") - print("=" 
* 80) - print() - - stats = get_db_stats() - - # Overall counts - print("OVERALL COUNTS:") - print(f" Auctions: {stats['total_auctions']:,}") - print(f" Lots: {stats['total_lots']:,}") - print(f" Images: {stats['total_images']:,}") - print(f" Bid History Records: {stats['total_bid_history']:,}") - print() - - # Auctions completeness - print("AUCTIONS COMPLETENESS:") - a = stats['auctions'] - print(f" Title: {a['has_title']:,} / {a['total']:,} ({a['has_title']/a['total']*100:.1f}%)") - print(f" Lots Count: {a['has_lots_count']:,} / {a['total']:,} ({a['has_lots_count']/a['total']*100:.1f}%)") - print(f" Closing Time: {a['has_closing_time']:,} / {a['total']:,} ({a['has_closing_time']/a['total']*100:.1f}%)") - print(f" First Lot Closing: {a['has_first_lot_closing']:,} / {a['total']:,} ({a['has_first_lot_closing']/a['total']*100:.1f}%)") - print() - - # Lots core completeness - print("LOTS CORE FIELDS:") - l = stats['lots_core'] - print(f" Title: {l['has_title']:,} / {l['total']:,} ({l['has_title']/l['total']*100:.1f}%)") - print(f" Current Bid: {l['has_current_bid']:,} / {l['total']:,} ({l['has_current_bid']/l['total']*100:.1f}%)") - print(f" Starting Bid: {l['has_starting_bid']:,} / {l['total']:,} ({l['has_starting_bid']/l['total']*100:.1f}%)") - print(f" Minimum Bid: {l['has_minimum_bid']:,} / {l['total']:,} ({l['has_minimum_bid']/l['total']*100:.1f}%)") - print(f" Has Bids (>0): {l['has_bids']:,} / {l['total']:,} ({l['has_bids']/l['total']*100:.1f}%)") - print(f" Closing Time: {l['has_closing_time']:,} / {l['total']:,} ({l['has_closing_time']/l['total']*100:.1f}%)") - print(f" Status: {l['has_status']:,} / {l['total']:,} ({l['has_status']/l['total']*100:.1f}%)") - print() - - # Lots enriched fields - print("LOTS ENRICHED FIELDS:") - e = stats['lots_enriched'] - print(f" Brand: {e['has_brand']:,} / {e['total']:,} ({e['has_brand']/e['total']*100:.1f}%)") - print(f" Model: {e['has_model']:,} / {e['total']:,} ({e['has_model']/e['total']*100:.1f}%)") - print(f" Manufacturer: {e['has_manufacturer']:,} / {e['total']:,} ({e['has_manufacturer']/e['total']*100:.1f}%)") - print(f" Year: {e['has_year']:,} / {e['total']:,} ({e['has_year']/e['total']*100:.1f}%)") - print(f" Condition Score: {e['has_condition_score']:,} / {e['total']:,} ({e['has_condition_score']/e['total']*100:.1f}%)") - print(f" Condition Desc: {e['has_condition_desc']:,} / {e['total']:,} ({e['has_condition_desc']/e['total']*100:.1f}%)") - print(f" Serial Number: {e['has_serial']:,} / {e['total']:,} ({e['has_serial']/e['total']*100:.1f}%)") - print(f" Damage Desc: {e['has_damage']:,} / {e['total']:,} ({e['has_damage']/e['total']*100:.1f}%)") - print() - - # Bid intelligence - print("LOTS BID INTELLIGENCE:") - b = stats['lots_bid_intelligence'] - print(f" First Bid Time: {b['has_first_bid_time']:,} / {b['total']:,} ({b['has_first_bid_time']/b['total']*100:.1f}%)") - print(f" Last Bid Time: {b['has_last_bid_time']:,} / {b['total']:,} ({b['has_last_bid_time']/b['total']*100:.1f}%)") - print(f" Bid Velocity: {b['has_bid_velocity']:,} / {b['total']:,} ({b['has_bid_velocity']/b['total']*100:.1f}%)") - print(f" Bid Increment: {b['has_bid_increment']:,} / {b['total']:,} ({b['has_bid_increment']/b['total']*100:.1f}%)") - print() - - # Bid history - print("BID HISTORY:") - h = stats['bid_history'] - print(f" Lots with History: {h['lots_with_history']:,}") - print(f" Total Bid Records: {h['total_bids']:,}") - print(f" Autobids: {h['autobids']:,} ({h['autobids']/max(h['total_bids'],1)*100:.1f}%)") - print(f" Has Bidder ID: {h['has_bidder_id']:,} 
({h['has_bidder_id']/max(h['total_bids'],1)*100:.1f}%)") - print() - - # Images - print("IMAGES:") - i = stats['images'] - print(f" Lots with Images: {i['lots_with_images']:,}") - print(f" Total Images: {i['total_images']:,}") - print(f" Downloaded: {i['downloaded_images']:,} ({i['downloaded_images']/max(i['total_images'],1)*100:.1f}%)") - print(f" Has Local Path: {i['has_local_path']:,} ({i['has_local_path']/max(i['total_images'],1)*100:.1f}%)") - print() - - # Data quality issues - print("=" * 80) - print("DATA QUALITY ISSUES:") - print("=" * 80) - issues = check_data_quality() - if issues: - for severity, category, message in issues: - print(f" [{severity}] {category}: {message}") - else: - print(" No issues found!") - print() - -if __name__ == "__main__": - print_validation_report() diff --git a/verify_images.py b/verify_images.py deleted file mode 100644 index c93064b..0000000 --- a/verify_images.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python3 -""" -Verification script to check image download status and duplicates -Run this after deployment to verify the scraper is working correctly -""" -import sqlite3 -import sys -from pathlib import Path - -DB_PATH = "/mnt/okcomputer/output/cache.db" - -def verify_database(): - """Run verification queries on the database""" - - if not Path(DB_PATH).exists(): - print(f"❌ Database not found: {DB_PATH}") - sys.exit(1) - - conn = sqlite3.connect(DB_PATH) - - print("=" * 60) - print("IMAGE DOWNLOAD VERIFICATION") - print("=" * 60) - - # Check download success rate - print("\n[*] Download Success Rate:") - cursor = conn.execute(""" - SELECT - COUNT(*) as total_images, - SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded, - SUM(CASE WHEN downloaded = 0 THEN 1 ELSE 0 END) as failed, - ROUND(100.0 * SUM(downloaded) / COUNT(*), 2) as success_rate - FROM images - """) - row = cursor.fetchone() - print(f" Total images: {row[0]:,}") - print(f" Downloaded: {row[1]:,}") - print(f" Not downloaded: {row[2]:,}") - print(f" Success rate: {row[3]}%") - - # Check for duplicates - print("\n[*] Duplicate Check:") - cursor = conn.execute(""" - SELECT lot_id, url, COUNT(*) as dup_count - FROM images - GROUP BY lot_id, url - HAVING COUNT(*) > 1 - LIMIT 5 - """) - duplicates = cursor.fetchall() - - if duplicates: - print(f" [!] Found {len(duplicates)} duplicate entries!") - for lot_id, url, count in duplicates: - print(f" {lot_id}: {url[:50]}... (x{count})") - else: - print(" [+] No duplicates found!") - - # Verify file system - print("\n[*] File System Verification:") - cursor = conn.execute(""" - SELECT COUNT(*) - FROM images - WHERE downloaded = 1 - AND local_path IS NOT NULL - AND local_path != '' - """) - files_with_path = cursor.fetchone()[0] - print(f" Images with local_path: {files_with_path:,}") - - # Sample some downloaded images - print("\n[*] Sample Downloaded Images:") - cursor = conn.execute(""" - SELECT lot_id, local_path - FROM images - WHERE downloaded = 1 - AND local_path IS NOT NULL - LIMIT 5 - """) - samples = cursor.fetchall() - for lot_id, path in samples: - exists = "[+]" if Path(path).exists() else "[!]" - print(f" {exists} {lot_id}: {path}") - - conn.close() - - print("\n" + "=" * 60) - print("VERIFICATION COMPLETE") - print("=" * 60) - -if __name__ == "__main__": - verify_database()
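
Note on the src/cache.py change above: the consolidated schema relies on CREATE TABLE IF NOT EXISTS and removes the old per-column ALTER TABLE migration block, so a cache.db created by an earlier version of the code will not gain the newly declared lot columns automatically. Below is a minimal standalone sketch for checking an existing database against the new layout; the DB path mirrors the one used by the deleted helper scripts, and the column list is only an illustrative hand-picked subset of the new lots schema, not the authoritative definition.

#!/usr/bin/env python3
"""Report lot columns that the consolidated schema declares but an existing database lacks."""
import sqlite3

DB_PATH = "/mnt/okcomputer/output/cache.db"  # assumption: same path as the deleted helper scripts

# Illustrative subset of columns from the new consolidated lots table
EXPECTED_LOT_COLUMNS = {
    "lot_id", "auction_id", "current_bid", "bid_count", "closing_time",
    "brand", "model", "year_manufactured", "condition_score",
    "followers_count", "estimated_min_price", "estimated_max_price",
}

def main():
    conn = sqlite3.connect(DB_PATH)
    try:
        # PRAGMA table_info yields one row per column; index 1 is the column name
        existing = {row[1] for row in conn.execute("PRAGMA table_info(lots)")}
    finally:
        conn.close()

    missing = sorted(EXPECTED_LOT_COLUMNS - existing)
    if missing:
        print(f"Missing lot columns ({len(missing)}): {', '.join(missing)}")
    else:
        print("All expected lot columns are present.")

if __name__ == "__main__":
    main()

Running a check like this before deploying the new schema would show whether an existing database still needs a rebuild (or the old ALTER TABLE path) rather than failing later on writes that reference the new columns.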