enrich data
@@ -1,54 +0,0 @@
#!/usr/bin/env python3
"""Check for Apollo state or other embedded data"""
import asyncio
import json
import re
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        await page.goto("https://www.troostwijkauctions.com/a/woonunits-generatoren-reinigingsmachines-en-zakelijke-goederen-A1-37889", wait_until='networkidle')
        content = await page.content()

        # Look for embedded data structures
        patterns = [
            (r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', "NEXT_DATA"),
            (r'window\.__APOLLO_STATE__\s*=\s*({.+?});', "APOLLO_STATE"),
            (r'"lots"\s*:\s*\[(.+?)\]', "LOTS_ARRAY"),
        ]

        for pattern, name in patterns:
            match = re.search(pattern, content, re.DOTALL)
            if match:
                print(f"\n{'='*60}")
                print(f"FOUND: {name}")
                print(f"{'='*60}")
                try:
                    if name == "LOTS_ARRAY":
                        print(f"Preview: {match.group(1)[:500]}")
                    else:
                        data = json.loads(match.group(1))
                        print(json.dumps(data, indent=2)[:2000])
                except json.JSONDecodeError:
                    print(f"Preview: {match.group(1)[:1000]}")

        # Also check for any script tags with "lot" and "bid" and "end"
        print(f"\n{'='*60}")
        print("SEARCHING FOR LOT DATA IN ALL SCRIPTS")
        print(f"{'='*60}")

        scripts = re.findall(r'<script[^>]*>(.+?)</script>', content, re.DOTALL)
        for i, script in enumerate(scripts):
            if all(term in script.lower() for term in ['lot', 'bid', 'end']):
                print(f"\nScript #{i} (first 500 chars):")
                print(script[:500])
                if i > 3:  # Limit output
                    break

        await browser.close()

if __name__ == "__main__":
    asyncio.run(main())

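A note on the extraction above: matching raw HTML with a regex works, but it is sensitive to attribute order inside the script tag. If beautifulsoup4 is available (an assumption; it is not imported anywhere in this commit), the same __NEXT_DATA__ lookup can be done structurally. A minimal sketch:

# Sketch: structural __NEXT_DATA__ extraction; assumes beautifulsoup4 is installed.
import json
from bs4 import BeautifulSoup

def extract_next_data(html):
    """Return the parsed __NEXT_DATA__ payload, or None if the tag is absent."""
    tag = BeautifulSoup(html, 'html.parser').find('script', id='__NEXT_DATA__')
    if tag is None or not tag.string:
        return None
    return json.loads(tag.string)
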
@@ -1,54 +0,0 @@
#!/usr/bin/env python3
"""Check current data quality in cache.db"""
import sqlite3

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

print("=" * 60)
print("CURRENT DATA QUALITY CHECK")
print("=" * 60)

# Check lots table
print("\n[*] Sample Lot Data:")
cursor = conn.execute("""
    SELECT lot_id, current_bid, bid_count, closing_time
    FROM lots
    LIMIT 10
""")
for row in cursor:
    print(f"  Lot: {row[0]}")
    print(f"    Current Bid: {row[1]}")
    print(f"    Bid Count: {row[2]}")
    print(f"    Closing Time: {row[3]}")

# Check auctions table
print("\n[*] Sample Auction Data:")
cursor = conn.execute("""
    SELECT auction_id, title, closing_time, first_lot_closing_time
    FROM auctions
    LIMIT 5
""")
for row in cursor:
    print(f"  Auction: {row[0]}")
    print(f"    Title: {row[1][:50]}...")
    print(f"    Closing Time: {row[2] if len(row) > 2 else 'N/A'}")
    print(f"    First Lot Closing: {row[3]}")

# Data completeness stats
print("\n[*] Data Completeness:")
cursor = conn.execute("""
    SELECT
        COUNT(*) as total,
        SUM(CASE WHEN current_bid IS NULL OR current_bid = '' THEN 1 ELSE 0 END) as missing_current_bid,
        SUM(CASE WHEN closing_time IS NULL OR closing_time = '' THEN 1 ELSE 0 END) as missing_closing_time,
        SUM(CASE WHEN bid_count IS NULL OR bid_count = 0 THEN 1 ELSE 0 END) as zero_bid_count
    FROM lots
""")
row = cursor.fetchone()
print(f"  Total lots: {row[0]:,}")
print(f"  Missing current_bid: {row[1]:,} ({100*row[1]/row[0]:.1f}%)")
print(f"  Missing closing_time: {row[2]:,} ({100*row[2]/row[0]:.1f}%)")
print(f"  Zero bid_count: {row[3]:,} ({100*row[3]/row[0]:.1f}%)")

conn.close()
print("\n" + "=" * 60)

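One robustness note on the checks above: positional row[n] indexing silently breaks if a SELECT's column order changes. sqlite3's built-in Row factory gives name-based access against the same database; a minimal sketch:

# Sketch: name-based column access with sqlite3.Row (same cache.db as above).
import sqlite3

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
conn.row_factory = sqlite3.Row
for row in conn.execute("SELECT lot_id, current_bid, bid_count FROM lots LIMIT 3"):
    print(row['lot_id'], row['current_bid'], row['bid_count'])
conn.close()
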
@@ -1,67 +0,0 @@
#!/usr/bin/env python3
"""Check if GraphQL has viewing/pickup data"""
import asyncio
import json
import sys
sys.path.insert(0, 'src')

from graphql_client import GRAPHQL_ENDPOINT
import aiohttp

# Expanded query to check for all available fields
EXTENDED_QUERY = """
query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
  lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
    lot {
      id
      displayId
      auctionId
      currentBidAmount { cents currency }
      initialAmount { cents currency }
      nextMinimalBid { cents currency }
      bidsCount
      startDate
      endDate

      # Try to find viewing/pickup fields
      viewingDays { startDate endDate city countryCode }
      collectionDays { startDate endDate city countryCode }
      pickupDays { startDate endDate city countryCode }
    }
    auction {
      id
      displayId
      viewingDays { startDate endDate city countryCode }
      collectionDays { startDate endDate city countryCode }
    }
  }
}
"""

async def main():
    variables = {
        "lotDisplayId": "A1-28505-5",
        "locale": "nl",
        "platform": "TWK"
    }

    payload = {
        "query": EXTENDED_QUERY,
        "variables": variables
    }

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
                if response.status == 200:
                    data = await response.json()
                    print("Full GraphQL Response:")
                    print(json.dumps(data, indent=2))
                else:
                    print(f"Error: {response.status}")
                    print(await response.text())
    except Exception as e:
        print(f"Exception: {e}")

if __name__ == "__main__":
    asyncio.run(main())

@@ -1,72 +0,0 @@
"""Check how lots link to auctions"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from cache import CacheManager
import sqlite3
import zlib
import json
import re

cache = CacheManager()
conn = sqlite3.connect(cache.db_path)
cursor = conn.cursor()

# Get a lot page from cache
cursor.execute("SELECT url, content FROM cache WHERE url LIKE '%/l/%' LIMIT 1")
url, content_blob = cursor.fetchone()
content = zlib.decompress(content_blob).decode('utf-8')

# Extract __NEXT_DATA__
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
data = json.loads(match.group(1))

props = data.get('props', {}).get('pageProps', {})
print("PageProps keys:", list(props.keys()))

lot = props.get('lot', {})
print("\nLot data:")
print(f"  displayId: {lot.get('displayId')}")
print(f"  auctionId (UUID): {lot.get('auctionId')}")

# Check if auction data is also included
auction = props.get('auction')
if auction:
    print("\nAuction data IS included in lot page!")
    print(f"  Auction displayId: {auction.get('displayId')}")
    print(f"  Auction id (UUID): {auction.get('id')}")
    print(f"  Auction name: {auction.get('name', '')[:60]}")
else:
    print("\nAuction data NOT included in lot page")
    print("Need to look up auction by UUID")

# Check if we can find the auction by UUID
lot_auction_uuid = lot.get('auctionId')
if lot_auction_uuid:
    # Try to find auction page with this UUID
    cursor.execute("""
        SELECT url, content FROM cache
        WHERE url LIKE '%/a/%'
        LIMIT 10
    """)

    found_match = False
    for auction_url, auction_content_blob in cursor.fetchall():
        auction_content = zlib.decompress(auction_content_blob).decode('utf-8')
        match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', auction_content, re.DOTALL)
        if match:
            auction_data = json.loads(match.group(1))
            auction_obj = auction_data.get('props', {}).get('pageProps', {}).get('auction', {})
            if auction_obj.get('id') == lot_auction_uuid:
                print(f"\n✓ Found matching auction!")
                print(f"  Auction displayId: {auction_obj.get('displayId')}")
                print(f"  Auction UUID: {auction_obj.get('id')}")
                print(f"  Auction URL: {auction_url}")
                found_match = True
                break

    if not found_match:
        print(f"\n✗ Could not find auction with UUID {lot_auction_uuid} in first 10 cached auctions")

conn.close()

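For orientation: CacheManager is imported from src/cache.py, which is not part of this diff. From the call sites in these scripts it only needs to expose a db_path attribute and, in the bid-history script further down, a save_bid_history method. A hypothetical minimal shape, inferred from usage rather than copied from the real module:

# Hypothetical sketch of the CacheManager surface these scripts rely on;
# the real src/cache.py is not in this diff, so names are inferred from call sites.
import sqlite3

class CacheManager:
    def __init__(self, db_path='/mnt/okcomputer/output/cache.db'):
        self.db_path = db_path  # scripts open their own connections via this

    def save_bid_history(self, lot_id, bid_records):
        # Assumed schema: one bid_history row per bid record.
        with sqlite3.connect(self.db_path) as conn:
            conn.executemany(
                "INSERT INTO bid_history (lot_id, bid_amount, bid_time) VALUES (?, ?, ?)",
                [(lot_id, b.get('bid_amount'), b.get('bid_time')) for b in bid_records],
            )
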
@@ -1,36 +0,0 @@
#!/usr/bin/env python3
"""Check viewing time data"""
import sqlite3

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

# Check if viewing_time has data
cursor = conn.execute("""
    SELECT viewing_time, pickup_date
    FROM lots
    WHERE viewing_time IS NOT NULL AND viewing_time != ''
    LIMIT 5
""")

rows = cursor.fetchall()
print("Existing viewing_time data:")
for r in rows:
    print(f"  Viewing: {r[0]}")
    print(f"  Pickup: {r[1]}")
    print()

# Check overall completeness
cursor = conn.execute("""
    SELECT
        COUNT(*) as total,
        SUM(CASE WHEN viewing_time IS NOT NULL AND viewing_time != '' THEN 1 ELSE 0 END) as has_viewing,
        SUM(CASE WHEN pickup_date IS NOT NULL AND pickup_date != '' THEN 1 ELSE 0 END) as has_pickup
    FROM lots
""")
row = cursor.fetchone()
print(f"Completeness:")
print(f"  Total lots: {row[0]}")
print(f"  Has viewing_time: {row[1]} ({100*row[1]/row[0]:.1f}%)")
print(f"  Has pickup_date: {row[2]} ({100*row[2]/row[0]:.1f}%)")

conn.close()

@@ -1,35 +0,0 @@
#!/usr/bin/env python3
"""Check if viewing time is in the GraphQL response"""
import asyncio
import json
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        responses = []

        async def capture_response(response):
            try:
                if 'graphql' in response.url and 'LotBiddingData' in await response.text():
                    body = await response.json()
                    responses.append(body)
            except Exception:
                # Some responses have no readable body; skip them.
                pass

        page.on('response', capture_response)

        await page.goto("https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5", wait_until='networkidle')
        await asyncio.sleep(2)

        if responses:
            print("Full LotBiddingData Response:")
            print("="*60)
            print(json.dumps(responses[0], indent=2))

        await browser.close()

if __name__ == "__main__":
    asyncio.run(main())

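Playwright's page.expect_response can replace the listener-plus-fixed-sleep pattern above; it resolves as soon as a matching response arrives. A sketch against the same lot URL (the predicate can only inspect the URL and status, so the LotBiddingData body check still happens afterwards, mirroring the listener's logic):

# Sketch: same capture with page.expect_response instead of a listener + sleep.
async with page.expect_response(
    lambda r: 'graphql' in r.url and r.status == 200
) as response_info:
    await page.goto(
        "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5",
        wait_until='networkidle',
    )
response = await response_info.value
body = await response.text()
if 'LotBiddingData' in body:
    print(await response.json())
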
@@ -1,69 +0,0 @@
#!/usr/bin/env python3
"""Debug lot data structure from cached page"""
import sqlite3
import zlib
import json
import re
import sys
sys.path.insert(0, 'src')

from parse import DataParser

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

# Get a recent lot page
cursor = conn.execute("""
    SELECT url, content
    FROM cache
    WHERE url LIKE '%/l/%'
    ORDER BY timestamp DESC
    LIMIT 1
""")

row = cursor.fetchone()
if not row:
    print("No lot pages found")
    sys.exit(1)

url, content_blob = row
content = zlib.decompress(content_blob).decode('utf-8')

parser = DataParser()
result = parser.parse_page(content, url)

if result:
    print(f"URL: {url}")
    print(f"\nParsed Data:")
    print(f"  type: {result.get('type')}")
    print(f"  lot_id: {result.get('lot_id')}")
    print(f"  title: {result.get('title', '')[:50]}...")
    print(f"  current_bid: {result.get('current_bid')}")
    print(f"  bid_count: {result.get('bid_count')}")
    print(f"  closing_time: {result.get('closing_time')}")
    print(f"  location: {result.get('location')}")

# Also dump the raw JSON
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
if match:
    data = json.loads(match.group(1))
    page_props = data.get('props', {}).get('pageProps', {})

    if 'lot' in page_props:
        lot = page_props['lot']
        print(f"\nRAW __NEXT_DATA__.lot keys: {list(lot.keys())}")
        print(f"\nSearching for bid/timing fields...")

        # Deep search for these fields
        def deep_search(obj, prefix=""):
            if isinstance(obj, dict):
                for k, v in obj.items():
                    if any(term in k.lower() for term in ['bid', 'end', 'close', 'date', 'time']):
                        print(f"  {prefix}{k}: {v}")
                    if isinstance(v, (dict, list)):
                        deep_search(v, prefix + k + ".")
            elif isinstance(obj, list) and len(obj) > 0:
                deep_search(obj[0], prefix + "[0].")

        deep_search(lot)

conn.close()

@@ -1,65 +0,0 @@
#!/usr/bin/env python3
"""Deep inspect lot JSON for viewing/pickup data"""
import sqlite3
import zlib
import json
import re

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

cursor = conn.execute("""
    SELECT url, content
    FROM cache
    WHERE url LIKE '%/l/%'
    ORDER BY timestamp DESC
    LIMIT 1
""")

row = cursor.fetchone()
url, content_blob = row
content = zlib.decompress(content_blob).decode('utf-8')

match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
data = json.loads(match.group(1))
lot = data.get('props', {}).get('pageProps', {}).get('lot', {})

print(f"Inspecting: {url}\n")

# Check onboarding
if 'onboarding' in lot:
    print("ONBOARDING:")
    print(json.dumps(lot['onboarding'], indent=2))
    print()

# Check attributes
if 'attributes' in lot:
    print("ATTRIBUTES:")
    attrs = lot['attributes']
    print(json.dumps(attrs[:3] if isinstance(attrs, list) else attrs, indent=2))
    print()

# Check condition
if 'condition' in lot:
    print("CONDITION:")
    print(json.dumps(lot['condition'], indent=2))
    print()

# Check appearance
if 'appearance' in lot:
    print("APPEARANCE:")
    print(json.dumps(lot['appearance'], indent=2))
    print()

# Check location
if 'location' in lot:
    print("LOCATION:")
    print(json.dumps(lot['location'], indent=2))
    print()

# Check for any field with "view", "pick", "collect", "date", "time"
print("\nFIELDS WITH VIEWING/PICKUP/TIME:")
for key in lot.keys():
    if any(term in key.lower() for term in ['view', 'pick', 'collect', 'date', 'time', 'day']):
        print(f"  {key}: {lot[key]}")

conn.close()

@@ -1,120 +0,0 @@
"""
Enrich existing lots with new intelligence fields:
- followers_count
- estimated_min_price / estimated_max_price
- lot_condition
- appearance

Fetches the data per lot from the GraphQL API.
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

import asyncio
from cache import CacheManager
import sqlite3
import zlib
import json
import re
from graphql_client import fetch_lot_bidding_data, format_bid_data

async def enrich_existing_lots():
    """Enrich existing lots with new fields from GraphQL API"""
    cache = CacheManager()
    conn = sqlite3.connect(cache.db_path)
    cursor = conn.cursor()

    # Get all lot IDs
    cursor.execute("SELECT lot_id FROM lots")
    lot_ids = [r[0] for r in cursor.fetchall()]

    print(f"Found {len(lot_ids)} lots to enrich")
    print("Fetching enrichment data from GraphQL API...")
    print("This will take ~{:.1f} minutes (0.5s rate limit)".format(len(lot_ids) * 0.5 / 60))

    enriched = 0
    failed = 0
    no_data = 0

    for i, lot_id in enumerate(lot_ids):
        if (i + 1) % 10 == 0:
            print(f"Progress: {i+1}/{len(lot_ids)} ({enriched} enriched, {no_data} no data, {failed} failed)", end='\r')

        try:
            # Fetch from GraphQL API
            bidding_data = await fetch_lot_bidding_data(lot_id)

            if bidding_data:
                formatted_data = format_bid_data(bidding_data)

                # Update lot with new fields
                cursor.execute("""
                    UPDATE lots
                    SET followers_count = ?,
                        estimated_min_price = ?,
                        estimated_max_price = ?,
                        lot_condition = ?,
                        appearance = ?
                    WHERE lot_id = ?
                """, (
                    formatted_data.get('followers_count', 0),
                    formatted_data.get('estimated_min_price'),
                    formatted_data.get('estimated_max_price'),
                    formatted_data.get('lot_condition', ''),
                    formatted_data.get('appearance', ''),
                    lot_id
                ))

                enriched += 1

                # Commit every 50 lots
                if enriched % 50 == 0:
                    conn.commit()

            else:
                no_data += 1

            # Rate limit
            await asyncio.sleep(0.5)

        except Exception:
            failed += 1
            continue

    conn.commit()

    print(f"\n\nComplete!")
    print(f"Total lots: {len(lot_ids)}")
    print(f"Enriched: {enriched}")
    print(f"No data: {no_data}")
    print(f"Failed: {failed}")

    # Show statistics
    cursor.execute("SELECT COUNT(*) FROM lots WHERE followers_count > 0")
    with_followers = cursor.fetchone()[0]

    cursor.execute("SELECT COUNT(*) FROM lots WHERE estimated_min_price IS NOT NULL")
    with_estimates = cursor.fetchone()[0]

    cursor.execute("SELECT COUNT(*) FROM lots WHERE lot_condition IS NOT NULL AND lot_condition != ''")
    with_condition = cursor.fetchone()[0]

    print(f"\nEnrichment statistics:")
    print(f"  Lots with followers_count: {with_followers} ({with_followers/len(lot_ids)*100:.1f}%)")
    print(f"  Lots with estimated prices: {with_estimates} ({with_estimates/len(lot_ids)*100:.1f}%)")
    print(f"  Lots with condition: {with_condition} ({with_condition/len(lot_ids)*100:.1f}%)")

    conn.close()

if __name__ == "__main__":
    print("WARNING: This will make ~16,800 API calls at 0.5s intervals (~2.3 hours)")
    print("Press Ctrl+C to cancel, or wait 5 seconds to continue...")
    import time
    try:
        time.sleep(5)
    except KeyboardInterrupt:
        print("\nCancelled")
        sys.exit(0)

    asyncio.run(enrich_existing_lots())

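fetch_lot_bidding_data and format_bid_data come from src/graphql_client.py, which is not in this diff. From the call sites and the columns the UPDATE writes back, a hypothetical shape; the GraphQL field names marked below are guesses, not confirmed by the source:

# Hypothetical sketch of the graphql_client helpers called above; the real
# module is not in this diff, so signatures are inferred from usage.
import aiohttp

GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"  # same endpoint used by the schema-exploration script below

async def fetch_lot_bidding_data(lot_display_id):
    """POST the LotBiddingData query; return the lotDetails payload or None."""
    payload = {
        "query": "...",  # the LotBiddingData query captured by the extraction script below
        "variables": {"lotDisplayId": lot_display_id, "locale": "nl", "platform": "TWK"},
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(GRAPHQL_ENDPOINT, json=payload,
                                timeout=aiohttp.ClientTimeout(total=30)) as resp:
            if resp.status != 200:
                return None
            data = await resp.json()
            return (data.get('data') or {}).get('lotDetails')

def format_bid_data(bidding_data):
    """Flatten the payload into the columns the UPDATE above writes (keys assumed)."""
    lot = (bidding_data or {}).get('lot', {})
    return {
        'followers_count': lot.get('followersCount', 0),      # assumed field name
        'estimated_min_price': lot.get('estimatedMinPrice'),  # assumed field name
        'estimated_max_price': lot.get('estimatedMaxPrice'),  # assumed field name
        'lot_condition': lot.get('condition', ''),
        'appearance': lot.get('appearance', ''),
    }
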
@@ -1,370 +0,0 @@
"""
Explore API responses to identify additional fields available for intelligence.
Tests GraphQL and REST API responses for field coverage.
"""
import asyncio
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

import json
import aiohttp
from graphql_client import fetch_lot_bidding_data, GRAPHQL_ENDPOINT
from bid_history_client import fetch_bid_history, BID_HISTORY_ENDPOINT

async def explore_graphql_schema():
    """Query GraphQL schema to see all available fields"""
    print("=" * 80)
    print("GRAPHQL SCHEMA EXPLORATION")
    print("=" * 80)

    # Introspection query for LotDetails type
    introspection_query = """
    query IntrospectionQuery {
      __type(name: "LotDetails") {
        name
        fields {
          name
          type {
            name
            kind
            ofType {
              name
              kind
            }
          }
        }
      }
    }
    """

    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(
                GRAPHQL_ENDPOINT,
                json={
                    "query": introspection_query,
                    "variables": {}
                },
                headers={"Content-Type": "application/json"}
            ) as response:
                if response.status == 200:
                    data = await response.json()
                    lot_type = data.get('data', {}).get('__type')
                    if lot_type:
                        print("\nLotDetails available fields:")
                        for field in lot_type.get('fields', []):
                            field_name = field['name']
                            field_type = field['type'].get('name') or field['type'].get('ofType', {}).get('name', 'Complex')
                            print(f"  - {field_name}: {field_type}")
                        print()
                else:
                    print(f"Failed with status {response.status}")
        except Exception as e:
            print(f"Error: {e}")

    # Also try Lot type
    introspection_query_lot = """
    query IntrospectionQuery {
      __type(name: "Lot") {
        name
        fields {
          name
          type {
            name
            kind
            ofType {
              name
              kind
            }
          }
        }
      }
    }
    """

    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(
                GRAPHQL_ENDPOINT,
                json={
                    "query": introspection_query_lot,
                    "variables": {}
                },
                headers={"Content-Type": "application/json"}
            ) as response:
                if response.status == 200:
                    data = await response.json()
                    lot_type = data.get('data', {}).get('__type')
                    if lot_type:
                        print("\nLot type available fields:")
                        for field in lot_type.get('fields', []):
                            field_name = field['name']
                            field_type = field['type'].get('name') or field['type'].get('ofType', {}).get('name', 'Complex')
                            print(f"  - {field_name}: {field_type}")
                        print()
        except Exception as e:
            print(f"Error: {e}")

async def test_graphql_full_query():
    """Test a comprehensive GraphQL query to see all returned data"""
    print("=" * 80)
    print("GRAPHQL FULL QUERY TEST")
    print("=" * 80)

    # Test with a real lot ID
    lot_id = "A1-34731-107"  # Example from database

    comprehensive_query = """
    query ComprehensiveLotQuery($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
      lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
        lot {
          id
          displayId
          title
          description
          currentBidAmount { cents currency }
          initialAmount { cents currency }
          nextMinimalBid { cents currency }
          bidsCount
          startDate
          endDate
          minimumBidAmountMet
          lotNumber
          auctionId
          lotState
          location {
            city
            countryCode
          }
          viewingDays {
            city
            countryCode
            addressLine1
            addressLine2
            endDate
            startDate
          }
          collectionDays {
            city
            countryCode
            addressLine1
            addressLine2
            endDate
            startDate
          }
          images {
            url
            thumbnailUrl
          }
          attributes {
            name
            value
          }
        }
      }
    }
    """

    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(
                GRAPHQL_ENDPOINT,
                json={
                    "query": comprehensive_query,
                    "variables": {
                        "lotDisplayId": lot_id,
                        "locale": "nl_NL",
                        "platform": "WEB"
                    }
                },
                headers={"Content-Type": "application/json"}
            ) as response:
                if response.status == 200:
                    data = await response.json()
                    print(f"\nFull GraphQL response for {lot_id}:")
                    print(json.dumps(data, indent=2))
                    print()
                else:
                    print(f"Failed with status {response.status}")
                    print(await response.text())
        except Exception as e:
            print(f"Error: {e}")

async def test_bid_history_response():
    """Test bid history API to see all returned fields"""
    print("=" * 80)
    print("BID HISTORY API TEST")
    print("=" * 80)

    # Get a lot with bids from database
    import sqlite3
    from cache import CacheManager

    cache = CacheManager()
    conn = sqlite3.connect(cache.db_path)
    cursor = conn.cursor()

    # Find a lot with bids
    cursor.execute("""
        SELECT lot_id, url FROM lots
        WHERE bid_count > 0
        ORDER BY bid_count DESC
        LIMIT 1
    """)
    result = cursor.fetchone()

    if result:
        lot_id, url = result
        # The lot UUID is not in the URL itself, so pull it from the cached
        # page's embedded JSON instead.
        import re
        import zlib
        cursor.execute("SELECT content FROM cache WHERE url = ?", (url,))
        page_result = cursor.fetchone()

        if page_result:
            content = zlib.decompress(page_result[0]).decode('utf-8')
            match = re.search(r'"lot":\s*\{[^}]*"id":\s*"([^"]+)"', content)
            if match:
                lot_uuid = match.group(1)
                print(f"\nTesting with lot {lot_id} (UUID: {lot_uuid})")

                # Fetch bid history
                bid_history = await fetch_bid_history(lot_uuid)
                if bid_history:
                    print(f"\nBid history sample (first 3 records):")
                    for i, bid in enumerate(bid_history[:3]):
                        print(f"\nBid {i+1}:")
                        print(json.dumps(bid, indent=2))

                    print(f"\n\nAll available fields in bid records:")
                    all_keys = set()
                    for bid in bid_history:
                        all_keys.update(bid.keys())
                    for key in sorted(all_keys):
                        print(f"  - {key}")
                else:
                    print("No bid history found")

    conn.close()

async def check_auction_api():
    """Check if there's an auction details API"""
    print("=" * 80)
    print("AUCTION API EXPLORATION")
    print("=" * 80)

    auction_query = """
    query AuctionDetails($auctionId: String!, $locale: String!, $platform: Platform!) {
      auctionDetails(auctionId: $auctionId, locale: $locale, platform: $platform) {
        auction {
          id
          title
          description
          startDate
          endDate
          firstLotEndDate
          location {
            city
            countryCode
          }
          viewingDays {
            city
            countryCode
            startDate
            endDate
            addressLine1
            addressLine2
          }
          collectionDays {
            city
            countryCode
            startDate
            endDate
            addressLine1
            addressLine2
          }
        }
      }
    }
    """

    # Get an auction ID from database
    import sqlite3
    from cache import CacheManager

    cache = CacheManager()
    conn = sqlite3.connect(cache.db_path)
    cursor = conn.cursor()

    # Get auction ID from a lot
    cursor.execute("SELECT DISTINCT auction_id FROM lots WHERE auction_id IS NOT NULL LIMIT 1")
    result = cursor.fetchone()

    if result:
        auction_id = result[0]
        print(f"\nTesting with auction {auction_id}")

        async with aiohttp.ClientSession() as session:
            try:
                async with session.post(
                    GRAPHQL_ENDPOINT,
                    json={
                        "query": auction_query,
                        "variables": {
                            "auctionId": auction_id,
                            "locale": "nl_NL",
                            "platform": "WEB"
                        }
                    },
                    headers={"Content-Type": "application/json"}
                ) as response:
                    if response.status == 200:
                        data = await response.json()
                        print("\nAuction API response:")
                        print(json.dumps(data, indent=2))
                    else:
                        print(f"Failed with status {response.status}")
                        print(await response.text())
            except Exception as e:
                print(f"Error: {e}")

    conn.close()

async def main():
    """Run all API explorations"""
    await explore_graphql_schema()
    await test_graphql_full_query()
    await test_bid_history_response()
    await check_auction_api()

    print("\n" + "=" * 80)
    print("SUMMARY: AVAILABLE DATA FIELDS")
    print("=" * 80)
    print("""
CURRENTLY CAPTURED:
- Lot bidding data: current_bid, starting_bid, minimum_bid, bid_count, closing_time
- Lot attributes: brand, model, manufacturer, year, condition, serial_number
- Bid history: bid_amount, bid_time, bidder_id, is_autobid
- Bid intelligence: first_bid_time, last_bid_time, bid_velocity, bid_increment
- Images: URLs and local paths

POTENTIALLY AVAILABLE (TO CHECK):
- Viewing/collection times with full address and date ranges
- Lot location details (city, country)
- Lot state/status
- Image thumbnails
- More detailed attributes

NOT AVAILABLE:
- Watch count (not exposed in API)
- Reserve price (not exposed in API)
- Estimated min/max value (not exposed in API)
- Bidder identities (anonymized)
""")

if __name__ == "__main__":
    asyncio.run(main())

@@ -1,93 +0,0 @@
#!/usr/bin/env python3
"""Explore the actual auction schema"""
import asyncio
import aiohttp
import json

GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"

# Try different field structures
QUERIES = {
    "viewingDays_simple": """
    query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
      auction(id: $auctionId, locale: $locale, platform: $platform) {
        viewingDays {
          city
          countryCode
        }
      }
    }
    """,
    "viewingDays_with_times": """
    query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
      auction(id: $auctionId, locale: $locale, platform: $platform) {
        viewingDays {
          from
          to
          city
        }
      }
    }
    """,
    "full_auction": """
    query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
      auction(id: $auctionId, locale: $locale, platform: $platform) {
        id
        displayId
        biddingStatus
        buyersPremium
        viewingDays {
          city
          countryCode
          from
          to
        }
        collectionDays {
          city
          countryCode
          from
          to
        }
      }
    }
    """
}

async def test_query(name, query, auction_id):
    variables = {
        "auctionId": auction_id,
        "locale": "nl",
        "platform": "TWK"
    }

    payload = {
        "query": query,
        "variables": variables
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
            data = await response.json()

    print(f"\n{'='*60}")
    print(f"QUERY: {name}")
    print(f"{'='*60}")

    if 'errors' in data:
        print("ERRORS:")
        for error in data['errors']:
            print(f"  {error}")
    else:
        print("SUCCESS:")
        print(json.dumps(data, indent=2))

async def main():
    # Test with the auction we know exists
    auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa"

    for name, query in QUERIES.items():
        await test_query(name, query, auction_id)
        await asyncio.sleep(0.5)

if __name__ == "__main__":
    asyncio.run(main())

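When a guessed field name comes back as a GraphQL error (as the failing variants above will), full schema introspection enumerates what actually exists, provided the endpoint leaves introspection enabled. A sketch against the same endpoint:

# Sketch: list every schema type when a guessed name misses.
# Works only if the endpoint leaves introspection enabled.
import asyncio
import aiohttp

GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"
LIST_TYPES_QUERY = "query { __schema { types { name kind } } }"

async def list_types():
    async with aiohttp.ClientSession() as session:
        async with session.post(GRAPHQL_ENDPOINT, json={"query": LIST_TYPES_QUERY}) as resp:
            data = await resp.json()
    schema = (data.get('data') or {}).get('__schema') or {}
    for t in schema.get('types', []):
        if not t['name'].startswith('__'):  # skip introspection meta-types
            print(t['kind'], t['name'])

if __name__ == "__main__":
    asyncio.run(list_types())
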
@@ -1,141 +0,0 @@
#!/usr/bin/env python3
"""
Export only NEW auctions/lots that haven't been sent to the server yet.
Prevents UNIQUE constraint errors on server import.
"""

import sqlite3
import json
import csv
from datetime import datetime
from pathlib import Path

DB_PATH = "C:/mnt/okcomputer/output/cache.db"
OUTPUT_DIR = Path("C:/mnt/okcomputer/output")
SYNC_STATE_FILE = OUTPUT_DIR / ".server_sync_state"

def get_last_export_timestamp():
    """Get timestamp of last successful export to server"""
    if SYNC_STATE_FILE.exists():
        return int(SYNC_STATE_FILE.read_text().strip())
    return 0

def save_export_timestamp(timestamp: int):
    """Save timestamp of successful export"""
    SYNC_STATE_FILE.write_text(str(timestamp))

def export_new_data():
    """Export only records that are NEW since the last server import"""
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()

    last_export = get_last_export_timestamp()
    current_time = int(datetime.now().timestamp())

    print("="*60)
    print("INCREMENTAL EXPORT FOR SERVER")
    print("="*60)
    print(f"Last export: {datetime.fromtimestamp(last_export).strftime('%Y-%m-%d %H:%M:%S') if last_export else 'Never (will export ALL)'}")
    print(f"Current time: {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")
    print()

    # Get new auctions (discovered_at > last_export)
    if last_export == 0:
        # First run: export all
        cursor.execute("SELECT * FROM auctions ORDER BY auction_id")
    else:
        # Subsequent runs: only new ones
        cursor.execute("""
            SELECT * FROM auctions
            WHERE discovered_at > ?
            ORDER BY auction_id
        """, (last_export,))

    new_auctions = [dict(row) for row in cursor.fetchall()]

    # Get new lots (scraped_at_timestamp > last_export)
    if last_export == 0:
        cursor.execute("SELECT * FROM lots ORDER BY lot_id")
    else:
        cursor.execute("""
            SELECT * FROM lots
            WHERE scraped_at_timestamp > ?
            ORDER BY lot_id
        """, (last_export,))

    new_lots = [dict(row) for row in cursor.fetchall()]

    conn.close()

    # Export to server-ready files
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    files_created = []

    # Export auctions
    if new_auctions:
        auctions_csv = OUTPUT_DIR / f'auctions_{timestamp}.csv'
        auctions_json = OUTPUT_DIR / f'auctions_{timestamp}.json'

        with open(auctions_csv, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=new_auctions[0].keys())
            writer.writeheader()
            writer.writerows(new_auctions)

        with open(auctions_json, 'w', encoding='utf-8') as f:
            json.dump(new_auctions, f, indent=2, ensure_ascii=False)

        files_created.extend([auctions_csv, auctions_json])
        print(f"✓ Exported {len(new_auctions)} auctions")
        print(f"  CSV: {auctions_csv}")
        print(f"  JSON: {auctions_json}")
    else:
        print("✓ No new auctions to export")

    # Export lots
    if new_lots:
        lots_csv = OUTPUT_DIR / f'lots_{timestamp}.csv'
        lots_json = OUTPUT_DIR / f'lots_{timestamp}.json'

        with open(lots_csv, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=new_lots[0].keys())
            writer.writeheader()
            writer.writerows(new_lots)

        with open(lots_json, 'w', encoding='utf-8') as f:
            json.dump(new_lots, f, indent=2, ensure_ascii=False)

        files_created.extend([lots_csv, lots_json])
        print(f"✓ Exported {len(new_lots)} lots")
        print(f"  CSV: {lots_csv}")
        print(f"  JSON: {lots_json}")
    else:
        print("✓ No new lots to export")

    # Save sync state
    if new_auctions or new_lots:
        save_export_timestamp(current_time)
        print()
        print("="*60)
        print("EXPORT COMPLETE")
        print("="*60)
        print(f"New auctions: {len(new_auctions)}")
        print(f"New lots: {len(new_lots)}")
        print()
        print("Next export will only include records newer than:")
        print(f"  {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")
    else:
        print()
        print("="*60)
        print("NOTHING TO EXPORT")
        print("="*60)
        print("All data already exported to server")

    return {
        'auctions': len(new_auctions),
        'lots': len(new_lots),
        'files': [str(f) for f in files_created]
    }

if __name__ == "__main__":
    export_new_data()

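The sync-state file handles deduplication on the client side; the same UNIQUE-constraint failures can also be absorbed at import time. A hedged sketch, assuming the server is likewise SQLite and its table columns mirror the exported JSON keys:

# Sketch: duplicate-tolerant server-side import (assumes a SQLite server DB
# whose lots table columns mirror the exported JSON keys).
import json
import sqlite3

def import_lots(db_path, lots_json_path):
    with open(lots_json_path, encoding='utf-8') as f:
        lots = json.load(f)
    if not lots:
        return
    cols = list(lots[0].keys())
    placeholders = ','.join('?' for _ in cols)
    sql = f"INSERT OR IGNORE INTO lots ({','.join(cols)}) VALUES ({placeholders})"
    with sqlite3.connect(db_path) as conn:
        # INSERT OR IGNORE skips rows that would violate the UNIQUE constraint.
        conn.executemany(sql, [tuple(lot.get(c) for c in cols) for lot in lots])
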
@@ -1,53 +0,0 @@
#!/usr/bin/env python3
"""Extract the GraphQL query being used"""
import asyncio
import json
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        graphql_requests = []

        async def capture_request(request):
            if 'graphql' in request.url:
                graphql_requests.append({
                    'url': request.url,
                    'method': request.method,
                    'post_data': request.post_data,
                    'headers': dict(request.headers)
                })

        page.on('request', capture_request)

        await page.goto("https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5", wait_until='networkidle')
        await asyncio.sleep(2)

        print(f"Captured {len(graphql_requests)} GraphQL requests\n")

        for i, req in enumerate(graphql_requests):
            print(f"{'='*60}")
            print(f"REQUEST #{i+1}")
            print(f"{'='*60}")
            print(f"URL: {req['url']}")
            print(f"Method: {req['method']}")

            if req['post_data']:
                try:
                    data = json.loads(req['post_data'])
                    print(f"\nQuery Name: {data.get('operationName', 'N/A')}")
                    print(f"\nVariables:")
                    print(json.dumps(data.get('variables', {}), indent=2))
                    print(f"\nQuery:")
                    print(data.get('query', '')[:1000])
                except json.JSONDecodeError:
                    print(f"\nPOST Data: {req['post_data'][:500]}")

            print()

        await browser.close()

if __name__ == "__main__":
    asyncio.run(main())

@@ -1,45 +0,0 @@
#!/usr/bin/env python3
"""Find viewing/pickup in actual HTML"""
import asyncio
from playwright.async_api import async_playwright
import re

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        # Try a lot that should have viewing times
        await page.goto("https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102", wait_until='networkidle')

        # Get text content
        text_content = await page.evaluate("document.body.innerText")

        print("Searching for viewing/pickup patterns...\n")

        # Look for the "Bezichtigingen" (Dutch: viewings) section
        lines = text_content.split('\n')
        for i, line in enumerate(lines):
            if 'bezichtig' in line.lower() or 'viewing' in line.lower():
                # Print surrounding context
                context = lines[max(0, i-1):min(len(lines), i+5)]
                print("FOUND Bezichtigingen:")
                for c in context:
                    print(f"  {c}")
                print()
                break

        # Look for the "Ophalen" (Dutch: pickup) section
        for i, line in enumerate(lines):
            if 'ophalen' in line.lower() or 'collection' in line.lower() or 'pickup' in line.lower():
                context = lines[max(0, i-1):min(len(lines), i+5)]
                print("FOUND Ophalen:")
                for c in context:
                    print(f"  {c}")
                print()
                break

        await browser.close()

if __name__ == "__main__":
    asyncio.run(main())

@@ -1,166 +0,0 @@
"""
Fetch bid history for existing lots that have bids but no bid history records.
Reads cached lot pages to get lot UUIDs, then calls the bid history API.
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

import asyncio
from cache import CacheManager
import sqlite3
import zlib
import json
import re
from bid_history_client import fetch_bid_history, parse_bid_history

async def fetch_missing_bid_history():
    """Fetch bid history for lots that have bids but no history records"""
    cache = CacheManager()
    conn = sqlite3.connect(cache.db_path)
    cursor = conn.cursor()

    # Get lots with bids but no bid history
    cursor.execute("""
        SELECT l.lot_id, l.bid_count
        FROM lots l
        WHERE l.bid_count > 0
          AND l.lot_id NOT IN (SELECT DISTINCT lot_id FROM bid_history)
        ORDER BY l.bid_count DESC
    """)

    lots_to_fetch = cursor.fetchall()
    print(f"Found {len(lots_to_fetch)} lots with bids but no bid history")

    if not lots_to_fetch:
        print("No lots to process!")
        conn.close()
        return

    # Build mapping from lot_id to lot UUID from cached pages
    print("Building lot_id -> UUID mapping from cache...")

    cursor.execute("""
        SELECT url, content
        FROM cache
        WHERE url LIKE '%/l/%'
    """)

    lot_id_to_uuid = {}
    total_cached = 0

    for url, content_blob in cursor:
        total_cached += 1

        if total_cached % 100 == 0:
            print(f"Processed {total_cached} cached pages...", end='\r')

        try:
            content = zlib.decompress(content_blob).decode('utf-8')
            match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)

            if not match:
                continue

            data = json.loads(match.group(1))
            lot = data.get('props', {}).get('pageProps', {}).get('lot', {})

            if not lot:
                continue

            lot_display_id = lot.get('displayId')
            lot_uuid = lot.get('id')

            if lot_display_id and lot_uuid:
                lot_id_to_uuid[lot_display_id] = lot_uuid

        except Exception:
            # Skip pages that fail to decompress or parse.
            continue

    print(f"\n\nBuilt UUID mapping for {len(lot_id_to_uuid)} lots")

    # Fetch bid history for each lot
    print("\nFetching bid history from API...")

    fetched = 0
    failed = 0
    no_uuid = 0

    for lot_id, bid_count in lots_to_fetch:
        lot_uuid = lot_id_to_uuid.get(lot_id)

        if not lot_uuid:
            no_uuid += 1
            continue

        try:
            print(f"\nFetching bid history for {lot_id} ({bid_count} bids)...")
            bid_history = await fetch_bid_history(lot_uuid)

            if bid_history:
                bid_data = parse_bid_history(bid_history, lot_id)

                # Update lots table with bid intelligence
                cursor.execute("""
                    UPDATE lots
                    SET first_bid_time = ?,
                        last_bid_time = ?,
                        bid_velocity = ?
                    WHERE lot_id = ?
                """, (
                    bid_data['first_bid_time'],
                    bid_data['last_bid_time'],
                    bid_data['bid_velocity'],
                    lot_id
                ))

                # Save bid history records
                cache.save_bid_history(lot_id, bid_data['bid_records'])

                fetched += 1
                print(f"  Saved {len(bid_data['bid_records'])} bid records")
                print(f"  Bid velocity: {bid_data['bid_velocity']:.2f} bids/hour")

                # Commit every 10 lots
                if fetched % 10 == 0:
                    conn.commit()
                    print(f"\nProgress: {fetched}/{len(lots_to_fetch)} lots processed...")

                # Rate limit to be respectful
                await asyncio.sleep(0.5)

            else:
                failed += 1

        except Exception as e:
            print(f"  Error fetching bid history for {lot_id}: {e}")
            failed += 1
            continue

    conn.commit()

    print(f"\n\nComplete!")
    print(f"Total lots to process: {len(lots_to_fetch)}")
    print(f"Successfully fetched: {fetched}")
    print(f"Failed: {failed}")
    print(f"No UUID found: {no_uuid}")

    # Verify fix
    cursor.execute("SELECT COUNT(DISTINCT lot_id) FROM bid_history")
    lots_with_history = cursor.fetchone()[0]

    cursor.execute("SELECT COUNT(*) FROM lots WHERE bid_count > 0")
    lots_with_bids = cursor.fetchone()[0]

    print(f"\nLots with bids: {lots_with_bids}")
    print(f"Lots with bid history: {lots_with_history}")
    print(f"Coverage: {lots_with_history/lots_with_bids*100:.1f}%")

    conn.close()

if __name__ == "__main__":
    asyncio.run(fetch_missing_bid_history())

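fetch_bid_history and parse_bid_history live in src/bid_history_client.py, also outside this diff. From the call sites (a UUID in, a dict of intelligence fields and raw records out), a hypothetical shape; the record keys follow the summary earlier in this commit (bid_amount, bid_time, ...), and the timestamp format is assumed to be ISO 8601:

# Hypothetical sketch of the bid_history_client parsing helper called above;
# the real module is not in this diff, so the record layout is an assumption.
from datetime import datetime

def parse_bid_history(bid_history, lot_id):
    """Derive the intelligence fields the UPDATE above writes back."""
    times = sorted(datetime.fromisoformat(b['bid_time']) for b in bid_history)
    span_hours = max((times[-1] - times[0]).total_seconds() / 3600, 1e-9)
    return {
        'first_bid_time': times[0].isoformat(),
        'last_bid_time': times[-1].isoformat(),
        'bid_velocity': len(bid_history) / span_hours,  # bids/hour, as printed above
        'bid_records': bid_history,
    }
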
@@ -1,64 +0,0 @@
#!/usr/bin/env python3
"""Find the API endpoint by monitoring network requests"""
import asyncio
import json
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        requests = []
        responses = []

        async def log_request(request):
            if any(term in request.url for term in ['api', 'graphql', 'lot', 'auction', 'bid']):
                requests.append({
                    'url': request.url,
                    'method': request.method,
                    'headers': dict(request.headers),
                    'post_data': request.post_data
                })

        async def log_response(response):
            if any(term in response.url for term in ['api', 'graphql', 'lot', 'auction', 'bid']):
                try:
                    body = await response.text()
                    responses.append({
                        'url': response.url,
                        'status': response.status,
                        'body': body[:1000]
                    })
                except Exception:
                    # Some responses (e.g. redirects) have no readable body.
                    pass

        page.on('request', log_request)
        page.on('response', log_response)

        print("Loading lot page...")
        await page.goto("https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102", wait_until='networkidle')

        # Wait for dynamic content
        await asyncio.sleep(3)

        print(f"\nFound {len(requests)} relevant requests")
        print(f"Found {len(responses)} relevant responses\n")

        for req in requests[:10]:
            print(f"REQUEST: {req['method']} {req['url']}")
            if req['post_data']:
                print(f"  POST DATA: {req['post_data'][:200]}")

        print("\n" + "="*60 + "\n")

        for resp in responses[:10]:
            print(f"RESPONSE: {resp['url']}")
            print(f"  Status: {resp['status']}")
            print(f"  Body: {resp['body'][:300]}")
            print()

        await browser.close()

if __name__ == "__main__":
    asyncio.run(main())

@@ -1,70 +0,0 @@
#!/usr/bin/env python3
"""Find API endpoint using a valid lot from database"""
import asyncio
import sqlite3
from playwright.async_api import async_playwright

# Get a valid lot URL
conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
cursor = conn.execute("SELECT url FROM lots WHERE url LIKE '%/l/%' LIMIT 5")
lot_urls = [row[0] for row in cursor.fetchall()]
conn.close()

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        api_calls = []

        async def log_response(response):
            url = response.url
            # Look for API calls
            if ('api' in url.lower() or 'graphql' in url.lower() or
                    '/v2/' in url or '/v3/' in url or '/v4/' in url or
                    'query' in url.lower() or 'mutation' in url.lower()):
                try:
                    body = await response.text()
                    api_calls.append({
                        'url': url,
                        'status': response.status,
                        'body': body
                    })
                    print(f"\nAPI: {url}")
                except Exception:
                    pass

        page.on('response', log_response)

        for lot_url in lot_urls[:2]:
            print(f"\n{'='*60}")
            print(f"Loading: {lot_url}")
            print(f"{'='*60}")

            try:
                await page.goto(lot_url, wait_until='networkidle', timeout=30000)
                await asyncio.sleep(2)

                # Check if page has bid info ('Huidig bod' is Dutch for 'current bid')
                content = await page.content()
                if 'currentBid' in content or 'Current bid' in content or 'Huidig bod' in content:
                    print("[+] Page contains bid information")
                    break
            except Exception as e:
                print(f"[!] Error: {e}")
                continue

        print(f"\n\n{'='*60}")
        print(f"CAPTURED {len(api_calls)} API CALLS")
        print(f"{'='*60}")

        for call in api_calls:
            print(f"\n{call['url']}")
            print(f"Status: {call['status']}")
            if 'json' in call['body'][:100].lower() or call['body'].startswith('{'):
                print(f"Body (first 500 chars): {call['body'][:500]}")

        await browser.close()

if __name__ == "__main__":
    asyncio.run(main())

@@ -1,48 +0,0 @@
#!/usr/bin/env python3
"""Find an auction page with lots data"""
import sqlite3
import zlib
import json
import re

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

cursor = conn.execute("""
    SELECT url, content
    FROM cache
    WHERE url LIKE '%/a/%'
""")

for row in cursor:
    url, content_blob = row
    content = zlib.decompress(content_blob).decode('utf-8')

    match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
    if not match:
        continue

    data = json.loads(match.group(1))
    page_props = data.get('props', {}).get('pageProps', {})

    if 'auction' in page_props:
        auction = page_props['auction']
        lots = auction.get('lots', [])

        if lots and len(lots) > 0:
            print(f"Found auction with {len(lots)} lots: {url}\n")

            lot = lots[0]
            print("SAMPLE LOT FROM AUCTION.LOTS[]:")
            print(f"  displayId: {lot.get('displayId')}")
            print(f"  title: {lot.get('title', '')[:50]}...")
            print(f"  urlSlug: {lot.get('urlSlug')}")
            print("\nBIDDING FIELDS:")
            for key in ['currentBid', 'highestBid', 'startingBid', 'minimumBidAmount', 'bidCount', 'numberOfBids']:
                print(f"  {key}: {lot.get(key)}")
            print("\nTIMING FIELDS:")
            for key in ['endDate', 'startDate', 'closingTime']:
                print(f"  {key}: {lot.get(key)}")
            print(f"\nALL KEYS: {list(lot.keys())[:30]}...")
            break

conn.close()
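The __NEXT_DATA__ regex above reappears in most of the scripts that follow. A small shared helper would keep the pattern in one place — a sketch; the function name is illustrative:

    import json
    import re
    from typing import Optional

    NEXT_DATA_RE = re.compile(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', re.DOTALL)

    def extract_next_data(html: str) -> Optional[dict]:
        """Return the parsed __NEXT_DATA__ payload from a Next.js page, or None."""
        match = NEXT_DATA_RE.search(html)
        if not match:
            return None
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError:
            return None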
@@ -1,155 +0,0 @@
"""
Fix auctions table by replacing with correct data from cached auction pages.
The auctions table currently has wrong auction_ids (numeric instead of displayId).
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from cache import CacheManager
import sqlite3
import zlib
import json
import re
from datetime import datetime

def fix_auctions_table():
    """Rebuild auctions table from cached auction pages"""
    cache = CacheManager()
    conn = sqlite3.connect(cache.db_path)
    cursor = conn.cursor()

    # Clear existing auctions table
    print("Clearing auctions table...")
    cursor.execute("DELETE FROM auctions")
    conn.commit()

    # Get all auction pages from cache
    cursor.execute("""
        SELECT url, content
        FROM cache
        WHERE url LIKE '%/a/%'
    """)

    auction_pages = cursor.fetchall()
    print(f"Found {len(auction_pages)} auction pages in cache")

    total = 0
    inserted = 0
    errors = 0

    print("Extracting auction data from cached pages...")

    for url, content_blob in auction_pages:
        total += 1

        if total % 10 == 0:
            print(f"Processed {total}/{len(auction_pages)}...", end='\r')

        try:
            # Decompress and parse __NEXT_DATA__
            content = zlib.decompress(content_blob).decode('utf-8')
            match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)

            if not match:
                errors += 1
                continue

            data = json.loads(match.group(1))
            page_props = data.get('props', {}).get('pageProps', {})
            auction = page_props.get('auction', {})

            if not auction:
                errors += 1
                continue

            # Extract auction data
            auction_id = auction.get('displayId')
            if not auction_id:
                errors += 1
                continue

            title = auction.get('name', '')

            # Get location
            location = ''
            viewing_days = auction.get('viewingDays', [])
            if viewing_days and isinstance(viewing_days, list) and len(viewing_days) > 0:
                loc = viewing_days[0]
                city = loc.get('city', '')
                country = loc.get('countryCode', '').upper()
                location = f"{city}, {country}" if city and country else (city or country)

            lots_count = auction.get('lotCount', 0)

            # Get first lot closing time
            first_lot_closing = ''
            min_end_date = auction.get('minEndDate', '')
            if min_end_date:
                # Format timestamp; fall back to the raw value if it doesn't parse
                try:
                    dt = datetime.fromisoformat(min_end_date.replace('Z', '+00:00'))
                    first_lot_closing = dt.strftime('%Y-%m-%d %H:%M:%S')
                except ValueError:
                    first_lot_closing = min_end_date

            scraped_at = datetime.now().isoformat()

            # Insert into auctions table
            cursor.execute("""
                INSERT OR REPLACE INTO auctions
                (auction_id, url, title, location, lots_count, first_lot_closing_time, scraped_at)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            """, (auction_id, url, title, location, lots_count, first_lot_closing, scraped_at))

            inserted += 1

        except Exception:
            errors += 1
            continue

    conn.commit()

    print("\n\nComplete!")
    print(f"Total auction pages processed: {total}")
    print(f"Auctions inserted: {inserted}")
    print(f"Errors: {errors}")

    # Verify fix
    cursor.execute("SELECT COUNT(*) FROM auctions")
    total_auctions = cursor.fetchone()[0]
    print(f"\nTotal auctions in table: {total_auctions}")

    cursor.execute("""
        SELECT COUNT(*) FROM lots
        WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
        AND auction_id != ''
    """)
    orphaned = cursor.fetchone()[0]

    print(f"Orphaned lots remaining: {orphaned}")

    if orphaned == 0:
        print("\nSUCCESS! All lots now have matching auctions!")
    else:
        # Show sample of remaining orphans
        cursor.execute("""
            SELECT lot_id, auction_id FROM lots
            WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
            AND auction_id != ''
            LIMIT 5
        """)
        print("\nSample remaining orphaned lots:")
        for lot_id, auction_id in cursor.fetchall():
            print(f"  {lot_id} -> auction_id: {auction_id}")

        # Show what auction_ids we do have
        cursor.execute("SELECT auction_id FROM auctions LIMIT 10")
        print("\nSample auction_ids in auctions table:")
        for row in cursor.fetchall():
            print(f"  {row[0]}")

    conn.close()

if __name__ == "__main__":
    fix_auctions_table()
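Before and after running the rebuild, it helps to count how many auction_ids still look like UUIDs rather than displayIds. A hedged sketch; the displayId pattern generalizes the "A1-37889" shape seen in these pages and is an assumption:

    import re
    import sqlite3

    DISPLAY_ID_RE = re.compile(r'^[A-Z]\d*-\d+$')  # e.g. "A1-37889" (assumed shape)

    conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
    rows = conn.execute("SELECT auction_id FROM lots").fetchall()
    uuid_like = sum(1 for (aid,) in rows if aid and len(aid) > 20)
    display_like = sum(1 for (aid,) in rows if aid and DISPLAY_ID_RE.match(aid))
    print(f"UUID-like auction_ids: {uuid_like}, displayId-like: {display_like}")
    conn.close()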
@@ -1,136 +0,0 @@
"""
Fix orphaned lots by updating auction_id from UUID to displayId.
This migration reads cached lot pages and extracts the correct auction displayId.
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from cache import CacheManager
import sqlite3
import zlib
import json
import re

def fix_orphaned_lots():
    """Update lot auction_id from UUID to auction displayId"""
    cache = CacheManager()
    conn = sqlite3.connect(cache.db_path)
    cursor = conn.cursor()

    # Get all lots that need fixing (have UUID auction_id)
    cursor.execute("""
        SELECT l.lot_id, l.auction_id
        FROM lots l
        WHERE length(l.auction_id) > 20  -- UUID is longer than displayId like "A1-12345"
    """)

    lots_to_fix = {lot_id: auction_uuid for lot_id, auction_uuid in cursor.fetchall()}
    print(f"Found {len(lots_to_fix)} lots with UUID auction_id that need fixing")

    if not lots_to_fix:
        print("No lots to fix!")
        conn.close()
        return

    # Build mapping from lot displayId to auction displayId from cached pages
    print("Building lot displayId -> auction displayId mapping from cache...")

    cursor.execute("""
        SELECT url, content
        FROM cache
        WHERE url LIKE '%/l/%'
    """)

    lot_to_auction_map = {}
    total = 0
    errors = 0

    for url, content_blob in cursor:
        total += 1

        if total % 100 == 0:
            print(f"Processing cached pages... {total}", end='\r')

        try:
            # Decompress and parse __NEXT_DATA__
            content = zlib.decompress(content_blob).decode('utf-8')
            match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)

            if not match:
                continue

            data = json.loads(match.group(1))
            page_props = data.get('props', {}).get('pageProps', {})

            lot = page_props.get('lot', {})
            auction = page_props.get('auction', {})

            if not lot or not auction:
                continue

            lot_display_id = lot.get('displayId')
            auction_display_id = auction.get('displayId')

            if lot_display_id and auction_display_id:
                lot_to_auction_map[lot_display_id] = auction_display_id

        except Exception:
            errors += 1
            continue

    print(f"\n\nBuilt mapping for {len(lot_to_auction_map)} lots")
    print(f"Errors while parsing: {errors}")

    # Now update the lots table
    print("\nUpdating lots table...")
    updated = 0
    not_found = 0

    for lot_id, old_auction_uuid in lots_to_fix.items():
        if lot_id in lot_to_auction_map:
            new_auction_id = lot_to_auction_map[lot_id]
            cursor.execute("""
                UPDATE lots
                SET auction_id = ?
                WHERE lot_id = ?
            """, (new_auction_id, lot_id))
            updated += 1
        else:
            not_found += 1

        if (updated + not_found) % 100 == 0:
            print(f"Updated: {updated}, not found: {not_found}", end='\r')

    conn.commit()

    print("\n\nComplete!")
    print(f"Total cached pages processed: {total}")
    print(f"Lots updated with auction displayId: {updated}")
    print(f"Lots not found in cache: {not_found}")
    print(f"Parse errors: {errors}")

    # Verify fix
    cursor.execute("""
        SELECT COUNT(*) FROM lots
        WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
    """)
    orphaned = cursor.fetchone()[0]

    print(f"\nOrphaned lots remaining: {orphaned}")

    if orphaned > 0:
        # Show sample of remaining orphans
        cursor.execute("""
            SELECT lot_id, auction_id FROM lots
            WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
            LIMIT 5
        """)
        print("\nSample remaining orphaned lots:")
        for lot_id, auction_id in cursor.fetchall():
            print(f"  {lot_id} -> auction_id: {auction_id}")

    conn.close()

if __name__ == "__main__":
    fix_orphaned_lots()
@@ -1,69 +0,0 @@
#!/usr/bin/env python3
"""Extract and inspect __NEXT_DATA__ from a cached auction page"""
import sqlite3
import zlib
import json
import re

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

# Get a cached auction page
cursor = conn.execute("""
    SELECT url, content
    FROM cache
    WHERE url LIKE '%/a/%'
    LIMIT 1
""")

row = cursor.fetchone()
if not row:
    print("No cached auction pages found")
    exit(1)

url, content_blob = row
print(f"Inspecting: {url}\n")

# Decompress
content = zlib.decompress(content_blob).decode('utf-8')

# Extract __NEXT_DATA__
match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
if not match:
    print("No __NEXT_DATA__ found")
    exit(1)

data = json.loads(match.group(1))
page_props = data.get('props', {}).get('pageProps', {})

if 'auction' in page_props:
    auction = page_props['auction']
    print("AUCTION DATA STRUCTURE:")
    print("=" * 60)
    print(f"displayId: {auction.get('displayId')}")
    print(f"name: {auction.get('name', '')[:50]}...")
    print(f"lots count: {len(auction.get('lots', []))}")

    if auction.get('lots'):
        lot = auction['lots'][0]
        print("\nFIRST LOT STRUCTURE:")
        print(f"  displayId: {lot.get('displayId')}")
        print(f"  title: {lot.get('title', '')[:50]}...")
        print("\n  BIDDING:")
        print(f"    currentBid: {lot.get('currentBid')}")
        print(f"    highestBid: {lot.get('highestBid')}")
        print(f"    startingBid: {lot.get('startingBid')}")
        print(f"    minimumBidAmount: {lot.get('minimumBidAmount')}")
        print(f"    bidCount: {lot.get('bidCount')}")
        print(f"    numberOfBids: {lot.get('numberOfBids')}")
        print("  TIMING:")
        print(f"    endDate: {lot.get('endDate')}")
        print(f"    startDate: {lot.get('startDate')}")
        print(f"    closingTime: {lot.get('closingTime')}")
        print(f"  ALL KEYS: {list(lot.keys())}")

    print("\nAUCTION TIMING:")
    print(f"  minEndDate: {auction.get('minEndDate')}")
    print(f"  maxEndDate: {auction.get('maxEndDate')}")
    print(f"  ALL KEYS: {list(auction.keys())}")

conn.close()
@@ -1,49 +0,0 @@
#!/usr/bin/env python3
"""Inspect a lot page HTML to find viewing_time and pickup_date"""
import asyncio
import re
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        # Use the known lot
        await page.goto("https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102", wait_until='networkidle')
        content = await page.content()

        print("Searching for patterns...")
        print("="*60)

        # Search for viewing time patterns
        patterns = {
            'Bezichtigingen': r'Bezichtigingen.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
            'viewing': r'(?i)viewing.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
            'Ophalen': r'Ophalen.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
            'pickup': r'(?i)pickup.*?(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})',
            'Status': r'Status\s+([^<]+)',
        }

        for name, pattern in patterns.items():
            matches = re.findall(pattern, content, re.DOTALL | re.MULTILINE)
            if matches:
                print(f"\n{name}:")
                for match in matches[:3]:
                    print(f"  {match[:200]}")

        # Also look for structured data
        print("\n\nSearching for 'Bezichtigingen' section:")
        bez_match = re.search(r'Bezichtigingen.*?<.*?>(.*?)</.*?>', content, re.DOTALL)
        if bez_match:
            print(bez_match.group(0)[:500])

        print("\n\nSearching for 'Ophalen' section:")
        oph_match = re.search(r'Ophalen.*?<.*?>(.*?)</.*?>', content, re.DOTALL)
        if oph_match:
            print(oph_match.group(0)[:500])

        await browser.close()

if __name__ == "__main__":
    asyncio.run(main())
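The strings these patterns capture look like "02 jan 2025 van 09:00 tot 16:00" ("van ... tot" is Dutch for "from ... to"). A sketch of parsing one into structured values; the Dutch month abbreviations are an assumption about the site's locale:

    import re
    from datetime import datetime

    # Dutch three-letter month abbreviations (assumed; extend as needed)
    NL_MONTHS = {'jan': 1, 'feb': 2, 'mrt': 3, 'apr': 4, 'mei': 5, 'jun': 6,
                 'jul': 7, 'aug': 8, 'sep': 9, 'okt': 10, 'nov': 11, 'dec': 12}

    RANGE_RE = re.compile(
        r'(?P<day>\d{2})\s+(?P<mon>\w{3})\s+(?P<year>\d{4})\s+'
        r'van\s+(?P<start>\d{2}:\d{2})\s+tot\s+(?P<end>\d{2}:\d{2})')

    def parse_range(text: str):
        """Return (date, start_time, end_time) or None for one viewing/pickup string."""
        m = RANGE_RE.search(text)
        if not m or m.group('mon').lower() not in NL_MONTHS:
            return None
        day, year = int(m.group('day')), int(m.group('year'))
        month = NL_MONTHS[m.group('mon').lower()]
        start = datetime.strptime(m.group('start'), '%H:%M').time()
        end = datetime.strptime(m.group('end'), '%H:%M').time()
        return datetime(year, month, day), start, end

    print(parse_range("02 jan 2025 van 09:00 tot 16:00"))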
@@ -1,45 +0,0 @@
#!/usr/bin/env python3
"""Intercept API calls to find where lot data comes from"""
import asyncio
import json
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        # Track API calls
        api_calls = []

        async def handle_response(response):
            if 'api' in response.url.lower() or 'graphql' in response.url.lower():
                try:
                    body = await response.json()
                    api_calls.append({
                        'url': response.url,
                        'status': response.status,
                        'body': body
                    })
                    print(f"\nAPI CALL: {response.url}")
                    print(f"Status: {response.status}")
                    if 'lot' in response.url.lower() or 'auction' in response.url.lower():
                        print(f"Body preview: {json.dumps(body, indent=2)[:500]}")
                except Exception:
                    # Non-JSON responses are not interesting here
                    pass

        page.on('response', handle_response)

        # Visit auction page
        print("Loading auction page...")
        await page.goto("https://www.troostwijkauctions.com/a/woonunits-generatoren-reinigingsmachines-en-zakelijke-goederen-A1-37889", wait_until='networkidle')

        # Wait a bit for lazy loading
        await asyncio.sleep(5)

        print(f"\n\nCaptured {len(api_calls)} API calls")

        await browser.close()

if __name__ == "__main__":
    asyncio.run(main())
@@ -1,148 +0,0 @@
#!/usr/bin/env python3
"""
Migrate existing lot data to extract missing enriched fields
"""
import sqlite3
import json
import re
import zlib
from datetime import datetime
import sys
sys.path.insert(0, 'src')

from graphql_client import extract_enriched_attributes, extract_attributes_from_lot_json

DB_PATH = "/mnt/okcomputer/output/cache.db"

def migrate_lot_attributes():
    """Extract attributes from cached lot pages"""
    print("="*60)
    print("MIGRATING EXISTING LOT DATA")
    print("="*60)

    conn = sqlite3.connect(DB_PATH)

    # Get cached lot pages
    cursor = conn.execute("""
        SELECT url, content, timestamp
        FROM cache
        WHERE url LIKE '%/l/%'
        ORDER BY timestamp DESC
    """)

    updated_count = 0

    for url, content_blob, timestamp in cursor:
        try:
            # Get lot_id from URL
            lot_id_match = re.search(r'/l/.*?([A-Z]\d+-\d+-\d+)', url)
            if not lot_id_match:
                lot_id_match = re.search(r'([A-Z]\d+-\d+-\d+)', url)
            if not lot_id_match:
                continue

            lot_id = lot_id_match.group(1)

            # Check if lot exists in database
            lot_cursor = conn.execute("SELECT lot_id, title, description FROM lots WHERE lot_id = ?", (lot_id,))
            lot_row = lot_cursor.fetchone()
            if not lot_row:
                continue

            _, title, description = lot_row

            # Decompress and parse __NEXT_DATA__
            content = zlib.decompress(content_blob).decode('utf-8')
            match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
            if not match:
                continue

            data = json.loads(match.group(1))
            lot_json = data.get('props', {}).get('pageProps', {}).get('lot', {})
            if not lot_json:
                continue

            # Extract basic attributes
            attrs = extract_attributes_from_lot_json(lot_json)

            # Extract enriched attributes
            page_data = {'title': title, 'description': description, 'brand': attrs.get('brand', '')}
            enriched = extract_enriched_attributes(lot_json, page_data)

            # Merge
            all_attrs = {**attrs, **enriched}

            # Update database
            conn.execute("""
                UPDATE lots
                SET brand = ?,
                    model = ?,
                    attributes_json = ?,
                    year_manufactured = ?,
                    condition_score = ?,
                    condition_description = ?,
                    serial_number = ?,
                    manufacturer = ?,
                    damage_description = ?
                WHERE lot_id = ?
            """, (
                all_attrs.get('brand', ''),
                all_attrs.get('model', ''),
                all_attrs.get('attributes_json', ''),
                all_attrs.get('year_manufactured'),
                all_attrs.get('condition_score'),
                all_attrs.get('condition_description', ''),
                all_attrs.get('serial_number', ''),
                all_attrs.get('manufacturer', ''),
                all_attrs.get('damage_description', ''),
                lot_id
            ))

            updated_count += 1
            if updated_count % 100 == 0:
                print(f"  Processed {updated_count} lots...")
                conn.commit()

        except Exception as e:
            print(f"  Error processing {url}: {e}")
            continue

    conn.commit()
    print(f"\n✓ Updated {updated_count} lots with enriched attributes")

    # Show stats
    cursor = conn.execute("""
        SELECT
            COUNT(*) as total,
            SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year,
            SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition,
            SUM(CASE WHEN manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer,
            SUM(CASE WHEN brand != '' THEN 1 ELSE 0 END) as has_brand,
            SUM(CASE WHEN model != '' THEN 1 ELSE 0 END) as has_model
        FROM lots
    """)
    stats = cursor.fetchone()

    print("\nENRICHMENT STATISTICS:")
    print(f"  Total lots: {stats[0]:,}")
    print(f"  Has year: {stats[1]:,} ({100*stats[1]/stats[0]:.1f}%)")
    print(f"  Has condition: {stats[2]:,} ({100*stats[2]/stats[0]:.1f}%)")
    print(f"  Has manufacturer: {stats[3]:,} ({100*stats[3]/stats[0]:.1f}%)")
    print(f"  Has brand: {stats[4]:,} ({100*stats[4]/stats[0]:.1f}%)")
    print(f"  Has model: {stats[5]:,} ({100*stats[5]/stats[0]:.1f}%)")

    conn.close()

def main():
    print("\nStarting migration of existing data...")
    print(f"Database: {DB_PATH}\n")

    migrate_lot_attributes()

    print(f"\n{'='*60}")
    print("MIGRATION COMPLETE")
    print(f"{'='*60}\n")

if __name__ == "__main__":
    main()
@@ -1,51 +0,0 @@
#!/usr/bin/env python3
"""Scrape a fresh auction page to see the lots array structure"""
import asyncio
import json
import re
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        # Get first auction
        await page.goto("https://www.troostwijkauctions.com/auctions", wait_until='networkidle')
        content = await page.content()

        # Find first auction link
        match = re.search(r'href="(/a/[^"]+)"', content)
        if not match:
            print("No auction found")
            return

        auction_url = f"https://www.troostwijkauctions.com{match.group(1)}"
        print(f"Scraping: {auction_url}\n")

        await page.goto(auction_url, wait_until='networkidle')
        content = await page.content()

        # Extract __NEXT_DATA__
        match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
        if not match:
            print("No __NEXT_DATA__ found")
            return

        data = json.loads(match.group(1))
        page_props = data.get('props', {}).get('pageProps', {})

        if 'auction' in page_props:
            auction = page_props['auction']
            print(f"Auction: {auction.get('name', '')[:50]}...")
            print(f"Lots in array: {len(auction.get('lots', []))}")

            if auction.get('lots'):
                lot = auction['lots'][0]
                print("\nFIRST LOT:")
                print(json.dumps(lot, indent=2)[:1500])

        await browser.close()

if __name__ == "__main__":
    asyncio.run(main())
@@ -1,47 +0,0 @@
#!/usr/bin/env python3
"""Search cached pages for viewing/pickup text"""
import sqlite3
import zlib
import re

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

cursor = conn.execute("""
    SELECT url, content
    FROM cache
    WHERE url LIKE '%/l/%'
    ORDER BY timestamp DESC
    LIMIT 20
""")

for url, content_blob in cursor:
    try:
        content = zlib.decompress(content_blob).decode('utf-8')

        # Look for viewing/pickup patterns
        if 'bezichtig' in content.lower() or 'ophalen' in content.lower():
            print(f"\n{'='*60}")
            print(f"URL: {url}")
            print(f"{'='*60}")

            # Extract sections with context
            patterns = [
                (r'(Bezichtigingen?.*?(?:\n.*?){0,5})', 'VIEWING'),
                (r'(Ophalen.*?(?:\n.*?){0,5})', 'PICKUP'),
            ]

            for pattern, label in patterns:
                matches = re.findall(pattern, content, re.IGNORECASE | re.DOTALL)
                if matches:
                    print(f"\n{label}:")
                    for match in matches[:1]:  # First match
                        # Clean up HTML
                        clean = re.sub(r'<[^>]+>', ' ', match)
                        clean = re.sub(r'\s+', ' ', clean).strip()
                        print(f"  {clean[:200]}")

            break  # Found one, that's enough
    except Exception:
        continue

conn.close()
@@ -1,47 +0,0 @@
# PowerShell script to create Windows Task Scheduler job for Scaev Monitor
# Run as Administrator

$TaskName = "ScaevAuctionMonitor"
$ScriptPath = "C:\vibe\scaev\src\monitor.py"
$PythonPath = "python3"  # Adjust if needed
$WorkingDir = "C:\vibe\scaev"

# Create the action (run Python script)
$Action = New-ScheduledTaskAction -Execute $PythonPath `
    -Argument "$ScriptPath 30" `
    -WorkingDirectory $WorkingDir

# Trigger: On system startup
$TriggerStartup = New-ScheduledTaskTrigger -AtStartup

# Settings
$Settings = New-ScheduledTaskSettingsSet `
    -AllowStartIfOnBatteries `
    -DontStopIfGoingOnBatteries `
    -StartWhenAvailable `
    -RestartCount 3 `
    -RestartInterval (New-TimeSpan -Minutes 5)

# Principal: Run with highest privileges
$Principal = New-ScheduledTaskPrincipal -UserId "SYSTEM" -LogonType ServiceAccount -RunLevel Highest

# Register the task
Register-ScheduledTask `
    -TaskName $TaskName `
    -Action $Action `
    -Trigger $TriggerStartup `
    -Settings $Settings `
    -Principal $Principal `
    -Description "Scaev auction monitor - polls for new auctions every 30 minutes" `
    -Force

Write-Host "`nTask '$TaskName' created successfully!" -ForegroundColor Green
Write-Host "`nTo manage the task:"
Write-Host "  1. Open Task Scheduler (taskschd.msc)"
Write-Host "  2. Find 'ScaevAuctionMonitor' in Task Scheduler Library"
Write-Host "  3. Right-click to Run, Stop, or Disable"
Write-Host "`nOr use PowerShell commands:"
Write-Host "  Start-ScheduledTask -TaskName '$TaskName'"
Write-Host "  Stop-ScheduledTask -TaskName '$TaskName'"
Write-Host "  Disable-ScheduledTask -TaskName '$TaskName'"
Write-Host "  Get-ScheduledTask -TaskName '$TaskName' | Get-ScheduledTaskInfo"
@@ -1,49 +0,0 @@
#!/usr/bin/env python3
"""Show migration statistics"""
import sqlite3

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

cursor = conn.execute("""
    SELECT
        COUNT(*) as total,
        SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year,
        SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition,
        SUM(CASE WHEN manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer,
        SUM(CASE WHEN brand != '' THEN 1 ELSE 0 END) as has_brand,
        SUM(CASE WHEN model != '' THEN 1 ELSE 0 END) as has_model
    FROM lots
""")

stats = cursor.fetchone()

print("="*60)
print("MIGRATION RESULTS")
print("="*60)
print(f"\nTotal lots: {stats[0]:,}")
print(f"Has year: {stats[1]:,} ({100*stats[1]/stats[0]:.1f}%)")
print(f"Has condition: {stats[2]:,} ({100*stats[2]/stats[0]:.1f}%)")
print(f"Has manufacturer: {stats[3]:,} ({100*stats[3]/stats[0]:.1f}%)")
print(f"Has brand: {stats[4]:,} ({100*stats[4]/stats[0]:.1f}%)")
print(f"Has model: {stats[5]:,} ({100*stats[5]/stats[0]:.1f}%)")

# Show sample enriched data
print(f"\n{'='*60}")
print("SAMPLE ENRICHED LOTS")
print(f"{'='*60}")

cursor = conn.execute("""
    SELECT lot_id, year_manufactured, manufacturer, model, condition_score
    FROM lots
    WHERE year_manufactured IS NOT NULL OR manufacturer != ''
    LIMIT 5
""")

for row in cursor:
    print(f"\n{row[0]}:")
    print(f"  Year: {row[1]}")
    print(f"  Manufacturer: {row[2]}")
    print(f"  Model: {row[3]}")
    print(f"  Condition: {row[4]}")

conn.close()
173  src/cache.py
@@ -19,8 +19,9 @@ class CacheManager:
         self._init_db()
 
     def _init_db(self):
-        """Initialize cache and data storage database"""
+        """Initialize cache and data storage database with consolidated schema"""
         with sqlite3.connect(self.db_path) as conn:
+            # Cache table
             conn.execute("""
                 CREATE TABLE IF NOT EXISTS cache (
                     url TEXT PRIMARY KEY,
@@ -32,6 +33,8 @@ class CacheManager:
             conn.execute("""
                 CREATE INDEX IF NOT EXISTS idx_timestamp ON cache(timestamp)
             """)
 
+            # Auctions table - consolidated schema
             conn.execute("""
                 CREATE TABLE IF NOT EXISTS auctions (
                     auction_id TEXT PRIMARY KEY,
@@ -40,9 +43,18 @@ class CacheManager:
                     location TEXT,
                     lots_count INTEGER,
                     first_lot_closing_time TEXT,
-                    scraped_at TEXT
+                    scraped_at TEXT,
+                    city TEXT,
+                    country TEXT,
+                    type TEXT,
+                    lot_count INTEGER DEFAULT 0,
+                    closing_time TEXT,
+                    discovered_at INTEGER
                 )
             """)
+            conn.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country)")
 
+            # Lots table - consolidated schema with all fields from working database
             conn.execute("""
                 CREATE TABLE IF NOT EXISTS lots (
                     lot_id TEXT PRIMARY KEY,
@@ -50,8 +62,6 @@ class CacheManager:
                     url TEXT UNIQUE,
                     title TEXT,
                     current_bid TEXT,
-                    starting_bid TEXT,
-                    minimum_bid TEXT,
                     bid_count INTEGER,
                     closing_time TEXT,
                     viewing_time TEXT,
@@ -60,9 +70,54 @@ class CacheManager:
                     description TEXT,
                     category TEXT,
                     scraped_at TEXT,
+                    sale_id INTEGER,
+                    manufacturer TEXT,
+                    type TEXT,
+                    year INTEGER,
+                    currency TEXT DEFAULT 'EUR',
+                    closing_notified INTEGER DEFAULT 0,
+                    starting_bid TEXT,
+                    minimum_bid TEXT,
+                    status TEXT,
+                    brand TEXT,
+                    model TEXT,
+                    attributes_json TEXT,
+                    first_bid_time TEXT,
+                    last_bid_time TEXT,
+                    bid_velocity REAL,
+                    bid_increment REAL,
+                    year_manufactured INTEGER,
+                    condition_score REAL,
+                    condition_description TEXT,
+                    serial_number TEXT,
+                    damage_description TEXT,
+                    followers_count INTEGER DEFAULT 0,
+                    estimated_min_price REAL,
+                    estimated_max_price REAL,
+                    lot_condition TEXT,
+                    appearance TEXT,
+                    estimated_min REAL,
+                    estimated_max REAL,
+                    next_bid_step_cents INTEGER,
+                    condition TEXT,
+                    category_path TEXT,
+                    city_location TEXT,
+                    country_code TEXT,
+                    bidding_status TEXT,
+                    packaging TEXT,
+                    quantity INTEGER,
+                    vat REAL,
+                    buyer_premium_percentage REAL,
+                    remarks TEXT,
+                    reserve_price REAL,
+                    reserve_met INTEGER,
+                    view_count INTEGER,
                     FOREIGN KEY (auction_id) REFERENCES auctions(auction_id)
                 )
             """)
+            conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_sale_id ON lots(sale_id)")
 
+            # Images table
             conn.execute("""
                 CREATE TABLE IF NOT EXISTS images (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -70,86 +125,28 @@ class CacheManager:
                     url TEXT,
                     local_path TEXT,
                     downloaded INTEGER DEFAULT 0,
+                    labels TEXT,
+                    processed_at INTEGER,
                     FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
                 )
             """)
+            conn.execute("CREATE INDEX IF NOT EXISTS idx_images_lot_id ON images(lot_id)")
 
-            # Add new columns to auctions table if they don't exist
-            cursor = conn.execute("PRAGMA table_info(auctions)")
-            auction_columns = {row[1] for row in cursor.fetchall()}
+            # Remove duplicates before creating unique index
+            conn.execute("""
+                DELETE FROM images
+                WHERE id NOT IN (
+                    SELECT MIN(id)
+                    FROM images
+                    GROUP BY lot_id, url
+                )
+            """)
+            conn.execute("""
+                CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url
+                ON images(lot_id, url)
+            """)
 
-            if 'city' not in auction_columns:
-                conn.execute("ALTER TABLE auctions ADD COLUMN city TEXT")
-            if 'country' not in auction_columns:
-                conn.execute("ALTER TABLE auctions ADD COLUMN country TEXT")
-            if 'type' not in auction_columns:
-                conn.execute("ALTER TABLE auctions ADD COLUMN type TEXT")
-            if 'lot_count' not in auction_columns:
-                conn.execute("ALTER TABLE auctions ADD COLUMN lot_count INTEGER DEFAULT 0")
-            if 'closing_time' not in auction_columns:
-                conn.execute("ALTER TABLE auctions ADD COLUMN closing_time TEXT")
-            if 'discovered_at' not in auction_columns:
-                conn.execute("ALTER TABLE auctions ADD COLUMN discovered_at INTEGER")
-
-            # Add index for country filtering
-            conn.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country)")
-
-            # Add new columns to lots table if they don't exist
-            cursor = conn.execute("PRAGMA table_info(lots)")
-            columns = {row[1] for row in cursor.fetchall()}
-
-            if 'starting_bid' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN starting_bid TEXT")
-            if 'minimum_bid' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN minimum_bid TEXT")
-            if 'status' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN status TEXT")
-            if 'brand' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN brand TEXT")
-            if 'model' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN model TEXT")
-            if 'attributes_json' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN attributes_json TEXT")
-
-            # Bidding intelligence fields
-            if 'first_bid_time' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN first_bid_time TEXT")
-            if 'last_bid_time' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN last_bid_time TEXT")
-            if 'bid_velocity' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN bid_velocity REAL")
-            if 'bid_increment' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN bid_increment REAL")
-
-            # Valuation intelligence fields
-            if 'year_manufactured' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN year_manufactured INTEGER")
-            if 'condition_score' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN condition_score REAL")
-            if 'condition_description' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN condition_description TEXT")
-            if 'serial_number' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN serial_number TEXT")
-            if 'manufacturer' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN manufacturer TEXT")
-            if 'damage_description' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN damage_description TEXT")
-
-            # NEW: High-value API fields
-            if 'followers_count' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN followers_count INTEGER DEFAULT 0")
-            if 'estimated_min_price' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN estimated_min_price REAL")
-            if 'estimated_max_price' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN estimated_max_price REAL")
-            if 'lot_condition' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN lot_condition TEXT")
-            if 'appearance' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN appearance TEXT")
-            if 'scraped_at_timestamp' not in columns:
-                conn.execute("ALTER TABLE lots ADD COLUMN scraped_at_timestamp INTEGER")
-
-            # Create bid_history table
+            # Bid history table
             conn.execute("""
                 CREATE TABLE IF NOT EXISTS bid_history (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -163,33 +160,15 @@ class CacheManager:
                     FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
                 )
             """)
 
             conn.execute("""
                 CREATE INDEX IF NOT EXISTS idx_bid_history_lot_time
                 ON bid_history(lot_id, bid_time)
             """)
 
             conn.execute("""
                 CREATE INDEX IF NOT EXISTS idx_bid_history_bidder
                 ON bid_history(bidder_id)
             """)
 
-            # Remove duplicates before creating unique index
-            # Keep the row with the smallest id (first occurrence) for each (lot_id, url) pair
-            conn.execute("""
-                DELETE FROM images
-                WHERE id NOT IN (
-                    SELECT MIN(id)
-                    FROM images
-                    GROUP BY lot_id, url
-                )
-            """)
-
-            # Now create the unique index
-            conn.execute("""
-                CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url
-                ON images(lot_id, url)
-            """)
             conn.commit()
 
     def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
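The new _init_db deletes duplicate (lot_id, url) rows before building idx_unique_lot_url because SQLite refuses to create a unique index over existing duplicates. The same pattern in isolation:

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute("CREATE TABLE images (id INTEGER PRIMARY KEY, lot_id TEXT, url TEXT)")
    conn.executemany("INSERT INTO images (lot_id, url) VALUES (?, ?)",
                     [('L1', 'a.jpg'), ('L1', 'a.jpg'), ('L1', 'b.jpg')])
    # Keep the lowest id per (lot_id, url); only then can the unique index be built
    conn.execute("""
        DELETE FROM images
        WHERE id NOT IN (SELECT MIN(id) FROM images GROUP BY lot_id, url)
    """)
    conn.execute("CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url ON images(lot_id, url)")
    print(conn.execute("SELECT COUNT(*) FROM images").fetchone()[0])  # 2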
256  sync_updates.py
@@ -1,256 +0,0 @@
#!/usr/bin/env python3
"""
Sync local database updates to server-compatible format
Creates incremental exports with only NEW or UPDATED records
"""

import sqlite3
import json
import csv
from datetime import datetime
from pathlib import Path

DB_PATH = "C:/mnt/okcomputer/output/cache.db"
OUTPUT_DIR = Path("C:/mnt/okcomputer/output")

def fill_missing_auction_fields():
    """Fill in missing fields in auctions table from scraped data"""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    print("Filling missing auction fields...")

    # Update closing_time from first_lot_closing_time
    cursor.execute("""
        UPDATE auctions
        SET closing_time = first_lot_closing_time
        WHERE closing_time IS NULL AND first_lot_closing_time IS NOT NULL
    """)
    updated = cursor.rowcount
    print(f"  ✓ Updated {updated} closing_time fields")

    # Parse location to extract city and country
    cursor.execute("""
        SELECT auction_id, location
        FROM auctions
        WHERE location IS NOT NULL AND (city IS NULL OR country IS NULL)
    """)
    locations = cursor.fetchall()

    city_updates = 0
    for auction_id, location in locations:
        if not location:
            continue

        # Parse "City, COUNTRY" or "City, Region, COUNTRY"
        parts = [p.strip() for p in location.split(',')]
        if len(parts) >= 2:
            city = parts[0]
            country = parts[-1]

            cursor.execute("""
                UPDATE auctions
                SET city = ?, country = ?
                WHERE auction_id = ?
            """, (city, country, auction_id))
            city_updates += 1

    print(f"  ✓ Updated {city_updates} city/country fields")

    # Set type to 'online' for all (Troostwijk is online platform)
    cursor.execute("""
        UPDATE auctions
        SET type = 'online'
        WHERE type IS NULL
    """)
    type_updates = cursor.rowcount
    print(f"  ✓ Updated {type_updates} type fields")

    conn.commit()
    conn.close()

    print("✓ Auction fields updated\n")

def get_last_sync_timestamp():
    """Get timestamp of last successful sync"""
    sync_file = OUTPUT_DIR / ".last_sync"
    if sync_file.exists():
        return int(sync_file.read_text().strip())
    return 0

def save_sync_timestamp(timestamp: int):
    """Save timestamp of successful sync"""
    sync_file = OUTPUT_DIR / ".last_sync"
    sync_file.write_text(str(timestamp))

def export_incremental():
    """Export only records that are new or updated since last sync"""
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()

    last_sync = get_last_sync_timestamp()
    current_time = int(datetime.now().timestamp())

    print(f"Last sync: {datetime.fromtimestamp(last_sync).strftime('%Y-%m-%d %H:%M:%S') if last_sync else 'Never'}")
    print(f"Current time: {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")

    # Get new/updated auctions
    cursor.execute("""
        SELECT * FROM auctions
        WHERE discovered_at IS NULL OR discovered_at > ?
        ORDER BY auction_id
    """, (last_sync,))
    new_auctions = [dict(row) for row in cursor.fetchall()]

    # Get new/updated lots
    cursor.execute("""
        SELECT * FROM lots
        WHERE scraped_at_timestamp IS NULL OR scraped_at_timestamp > ?
        ORDER BY lot_id
    """, (last_sync,))
    new_lots = [dict(row) for row in cursor.fetchall()]

    conn.close()

    # Export to timestamped files
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    results = {
        'auctions': 0,
        'lots': 0,
        'files': {}
    }

    # Export auctions if any new
    if new_auctions:
        auctions_csv = OUTPUT_DIR / f'auctions_update_{timestamp}.csv'
        auctions_json = OUTPUT_DIR / f'auctions_update_{timestamp}.json'

        with open(auctions_csv, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=new_auctions[0].keys())
            writer.writeheader()
            writer.writerows(new_auctions)

        with open(auctions_json, 'w', encoding='utf-8') as f:
            json.dump(new_auctions, f, indent=2, ensure_ascii=False)

        results['auctions'] = len(new_auctions)
        results['files']['auctions_csv'] = str(auctions_csv)
        results['files']['auctions_json'] = str(auctions_json)

        print(f"\n✓ Exported {len(new_auctions)} new/updated auctions")
        print(f"  CSV: {auctions_csv}")
        print(f"  JSON: {auctions_json}")

    # Export lots if any new
    if new_lots:
        lots_csv = OUTPUT_DIR / f'lots_update_{timestamp}.csv'
        lots_json = OUTPUT_DIR / f'lots_update_{timestamp}.json'

        with open(lots_csv, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=new_lots[0].keys())
            writer.writeheader()
            writer.writerows(new_lots)

        with open(lots_json, 'w', encoding='utf-8') as f:
            json.dump(new_lots, f, indent=2, ensure_ascii=False)

        results['lots'] = len(new_lots)
        results['files']['lots_csv'] = str(lots_csv)
        results['files']['lots_json'] = str(lots_json)

        print(f"\n✓ Exported {len(new_lots)} new/updated lots")
        print(f"  CSV: {lots_csv}")
        print(f"  JSON: {lots_json}")

    if not new_auctions and not new_lots:
        print("\n✓ No new updates since last sync")

    return results

def create_upsert_export():
    """Create SQL script for server to UPSERT (update or insert) data"""
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()

    last_sync = get_last_sync_timestamp()
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Get new/updated auctions
    cursor.execute("""
        SELECT * FROM auctions
        WHERE discovered_at IS NULL OR discovered_at > ?
    """, (last_sync,))
    new_auctions = [dict(row) for row in cursor.fetchall()]

    if new_auctions:
        sql_file = OUTPUT_DIR / f'upsert_auctions_{timestamp}.sql'

        with open(sql_file, 'w', encoding='utf-8') as f:
            f.write("-- UPSERT script for auctions (updates existing, inserts new)\n\n")

            for auction in new_auctions:
                # Create INSERT OR REPLACE statement
                columns = list(auction.keys())
                placeholders = []

                for col, val in auction.items():
                    if val is None:
                        placeholders.append("NULL")
                    elif isinstance(val, (int, float)):
                        placeholders.append(str(val))
                    else:
                        # Escape single quotes
                        escaped = str(val).replace("'", "''")
                        placeholders.append(f"'{escaped}'")

                f.write(f"INSERT OR REPLACE INTO auctions ({', '.join(columns)})\n")
                f.write(f"VALUES ({', '.join(placeholders)});\n\n")

        print(f"\n✓ Created UPSERT SQL script: {sql_file}")
        print("  Server can execute this to avoid constraint errors")

    conn.close()

def main():
    """Main sync process"""
    print("="*60)
    print("DATABASE SYNC UTILITY")
    print("="*60)
    print(f"Database: {DB_PATH}")
    print(f"Output: {OUTPUT_DIR}")
    print("="*60)

    # Step 1: Fill missing fields
    fill_missing_auction_fields()

    # Step 2: Export incremental updates
    print("Exporting incremental updates...")
    results = export_incremental()

    # Step 3: Create UPSERT SQL (prevents constraint errors on server)
    if results['auctions'] > 0:
        create_upsert_export()

    # Step 4: Save sync timestamp
    current_time = int(datetime.now().timestamp())
    save_sync_timestamp(current_time)

    print("\n" + "="*60)
    print("SYNC COMPLETE")
    print("="*60)
    print(f"New auctions: {results['auctions']}")
    print(f"New lots: {results['lots']}")

    if results['files']:
        print("\nFiles ready for server import:")
        for key, path in results['files'].items():
            print(f"  {key}: {path}")

    print("\nNext sync will only export records newer than:")
    print(f"  {datetime.fromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')}")

if __name__ == "__main__":
    main()
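Writing the UPSERT file by string concatenation relies on hand-rolled quote escaping. If the import side can run Python rather than raw SQL, a parameterized executemany avoids escaping entirely — a sketch; upsert_auctions is a hypothetical server-side helper:

    import sqlite3

    def upsert_auctions(conn: sqlite3.Connection, rows: list[dict]) -> None:
        """Hypothetical server-side import: parameterized INSERT OR REPLACE."""
        if not rows:
            return
        columns = list(rows[0].keys())
        sql = (f"INSERT OR REPLACE INTO auctions ({', '.join(columns)}) "
               f"VALUES ({', '.join('?' for _ in columns)})")
        conn.executemany(sql, [tuple(r[c] for c in columns) for r in rows])
        conn.commit()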
@@ -1,28 +0,0 @@
#!/usr/bin/env python3
"""Test auction data fetch"""
import asyncio
import json
import sys
sys.path.insert(0, 'src')

from graphql_client import fetch_auction_data, format_auction_data

async def main():
    auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa"

    print(f"Fetching auction: {auction_id}\n")
    auction_data = await fetch_auction_data(auction_id)

    if auction_data:
        print("Raw Auction Data:")
        print(json.dumps(auction_data, indent=2))

        print("\n\nFormatted:")
        formatted = format_auction_data(auction_data)
        print(f"Viewing: {formatted['viewing_time']}")
        print(f"Pickup: {formatted['pickup_date']}")
    else:
        print("No auction data returned")

if __name__ == "__main__":
    asyncio.run(main())
@@ -1,59 +0,0 @@
#!/usr/bin/env python3
"""Test if the auction query works at all"""
import asyncio
import aiohttp
import json

GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"

# Try a simpler query first
SIMPLE_QUERY = """
query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!) {
  auction(id: $auctionId, locale: $locale, platform: $platform) {
    id
    displayId
    viewingDays {
      startDate
      endDate
      city
      countryCode
    }
    collectionDays {
      startDate
      endDate
      city
      countryCode
    }
  }
}
"""

async def main():
    auction_id = "9d5d9d6b-94de-4147-b523-dfa512d85dfa"

    variables = {
        "auctionId": auction_id,
        "locale": "nl",
        "platform": "TWK"
    }

    payload = {
        "query": SIMPLE_QUERY,
        "variables": variables
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
            print(f"Status: {response.status}")
            text = await response.text()
            print(f"Response: {text}")

            try:
                data = await response.json()
                print("\nParsed:")
                print(json.dumps(data, indent=2))
            except Exception:
                # Body was not valid JSON; the raw text printed above is all we get
                pass

if __name__ == "__main__":
    asyncio.run(main())
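GraphQL endpoints usually answer HTTP 200 even for failed queries and report failures under a top-level 'errors' key, so the status check above is not enough on its own. A small sketch of splitting the two cases:

    def check_graphql_result(data: dict):
        """Split a GraphQL response dict into (payload, errors)."""
        errors = data.get('errors') or []
        for err in errors:
            print(f"GraphQL error: {err.get('message')}")
        return data.get('data'), errors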
@@ -1,95 +0,0 @@
#!/usr/bin/env python3
"""Test comprehensive data enrichment"""
import asyncio
import sys
sys.path.insert(0, 'src')

from scraper import TroostwijkScraper


async def main():
    scraper = TroostwijkScraper()

    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        )

        # Test with a lot that has bids
        lot_url = "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5"

        print("Testing comprehensive extraction\n")
        result = await scraper.crawl_page(page, lot_url)

        if result:
            print(f"\n{'='*60}")
            print("COMPREHENSIVE DATA EXTRACTION:")
            print(f"{'='*60}")
            print(f"Lot ID: {result.get('lot_id')}")
            print(f"Title: {result.get('title', '')[:50]}...")
            print("\n[Bidding Intelligence]")
            print(f"  Status: {result.get('status')}")
            print(f"  Current Bid: {result.get('current_bid')}")
            print(f"  Starting Bid: {result.get('starting_bid')}")
            print(f"  Bid Increment: EUR {result.get('bid_increment', 0):.2f}")
            print(f"  Bid Count: {result.get('bid_count')}")
            print(f"  First Bid: {result.get('first_bid_time', 'N/A')}")
            print(f"  Last Bid: {result.get('last_bid_time', 'N/A')}")
            print(f"  Bid Velocity: {result.get('bid_velocity', 0)} bids/hour")
            print("\n[Valuation Intelligence]")
            print(f"  Brand: {result.get('brand', 'N/A')}")
            print(f"  Model: {result.get('model', 'N/A')}")
            print(f"  Year: {result.get('year_manufactured', 'N/A')}")
            print(f"  Manufacturer: {result.get('manufacturer', 'N/A')}")
            print(f"  Condition Score: {result.get('condition_score', 'N/A')}")
            print(f"  Condition: {result.get('condition_description', 'N/A')}")
            print(f"  Serial#: {result.get('serial_number', 'N/A')}")
            print(f"  Damage: {(result.get('damage_description') or 'N/A')[:50]}...")

        await browser.close()

    # Nothing to verify in the database if extraction failed
    if not result:
        return

    # Verify database
    import sqlite3
    conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')

    # Check lot data
    cursor = conn.execute("""
        SELECT bid_velocity, first_bid_time, year_manufactured, condition_score
        FROM lots
        WHERE lot_id = ?
    """, (result.get('lot_id'),))
    row = cursor.fetchone()

    if row:
        print(f"\n{'='*60}")
        print("DATABASE VERIFICATION (lots table):")
        print(f"{'='*60}")
        print(f"  Bid Velocity: {row[0]}")
        print(f"  First Bid Time: {row[1]}")
        print(f"  Year: {row[2]}")
        print(f"  Condition Score: {row[3]}")

    # Check bid history
    cursor = conn.execute("""
        SELECT COUNT(*), MIN(bid_time), MAX(bid_time), SUM(is_autobid)
        FROM bid_history
        WHERE lot_id = ?
    """, (result.get('lot_id'),))
    row = cursor.fetchone()

    if row and row[0] > 0:
        print(f"\n{'='*60}")
        print("DATABASE VERIFICATION (bid_history table):")
        print(f"{'='*60}")
        print(f"  Total Bids Stored: {row[0]}")
        print(f"  First Bid: {row[1]}")
        print(f"  Last Bid: {row[2]}")
        print(f"  Autobids: {row[3]}")

    conn.close()


if __name__ == "__main__":
    asyncio.run(main())
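The bid_velocity figure printed above is bids per hour. A minimal sketch of how it could be derived from stored bid timestamps; the scraper's actual computation may differ.

from datetime import datetime

def bids_per_hour(bid_times):
    """Rough bids/hour between the first and last ISO-format timestamps."""
    if len(bid_times) < 2:
        return 0.0
    first = datetime.fromisoformat(min(bid_times))
    last = datetime.fromisoformat(max(bid_times))
    hours = (last - first).total_seconds() / 3600
    return len(bid_times) / hours if hours > 0 else 0.0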
@@ -1,49 +0,0 @@
#!/usr/bin/env python3
"""Test concurrent image downloads"""
import asyncio
import time
import sys
sys.path.insert(0, 'src')

from scraper import TroostwijkScraper


async def main():
    scraper = TroostwijkScraper()

    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        )

        # Test with a lot that has multiple images
        lot_url = "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5"

        print("Testing concurrent image downloads\n")
        print(f"Lot: {lot_url}\n")

        start_time = time.time()
        result = await scraper.crawl_page(page, lot_url)
        elapsed = time.time() - start_time

        print(f"\n{'='*60}")
        print("TIMING RESULTS:")
        print(f"{'='*60}")
        print(f"Total time: {elapsed:.2f}s")

        image_count = len(result.get('images', [])) if result else 0
        print(f"Images: {image_count}")

        if image_count > 1:
            print(f"Time per image: {elapsed/image_count:.2f}s (if sequential)")
            print(f"Actual time: {elapsed:.2f}s (concurrent!)")
            # Speedup estimate assumes roughly 0.5s per image when sequential
            speedup = (image_count * 0.5) / elapsed if elapsed > 0 else 1
            print(f"Speedup factor: {speedup:.1f}x")

        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
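The pattern being timed here is plain asyncio fan-out. A minimal sketch of concurrent downloads with a semaphore cap follows; download_image, MAX_PARALLEL and the destination file names are illustrative, not the scraper's real internals.

import asyncio
import aiohttp

MAX_PARALLEL = 8  # hypothetical cap so the host is not flooded

async def download_image(session, sem, url, dest):
    async with sem:
        async with session.get(url, timeout=30) as resp:
            resp.raise_for_status()
            data = await resp.read()
    with open(dest, 'wb') as f:
        f.write(data)

async def download_all(urls):
    sem = asyncio.Semaphore(MAX_PARALLEL)
    async with aiohttp.ClientSession() as session:
        # gather runs all downloads concurrently instead of one by one
        await asyncio.gather(*(
            download_image(session, sem, url, f"img_{i}.jpg")
            for i, url in enumerate(urls)
        ))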
@@ -1,66 +0,0 @@
#!/usr/bin/env python3
"""Test the full scraper with one lot"""
import asyncio
import sqlite3
import sys
sys.path.insert(0, 'src')

from scraper import TroostwijkScraper


async def main():
    scraper = TroostwijkScraper()

    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        )

        # Test with a known lot
        lot_url = "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5"

        print(f"Testing with: {lot_url}\n")
        result = await scraper.crawl_page(page, lot_url)

        if result:
            print(f"\n{'='*60}")
            print("FINAL RESULT:")
            print(f"{'='*60}")
            print(f"Lot ID: {result.get('lot_id')}")
            print(f"Title: {result.get('title', '')[:50]}...")
            print(f"Current Bid: {result.get('current_bid')}")
            print(f"Starting Bid: {result.get('starting_bid')}")
            print(f"Minimum Bid: {result.get('minimum_bid')}")
            print(f"Bid Count: {result.get('bid_count')}")
            print(f"Closing Time: {result.get('closing_time')}")
            print(f"Viewing Time: {result.get('viewing_time', 'N/A')}")
            print(f"Pickup Date: {result.get('pickup_date', 'N/A')}")
            print(f"Location: {result.get('location')}")

        await browser.close()

    # Verify database
    conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
    cursor = conn.execute("""
        SELECT current_bid, starting_bid, minimum_bid, bid_count, closing_time
        FROM lots
        WHERE lot_id = 'A1-28505-5'
    """)
    row = cursor.fetchone()
    conn.close()

    if row:
        print(f"\n{'='*60}")
        print("DATABASE VERIFICATION:")
        print(f"{'='*60}")
        print(f"Current Bid: {row[0]}")
        print(f"Starting Bid: {row[1]}")
        print(f"Minimum Bid: {row[2]}")
        print(f"Bid Count: {row[3]}")
        print(f"Closing Time: {row[4]}")


if __name__ == "__main__":
    asyncio.run(main())
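The database check hard-codes 'A1-28505-5', which is the trailing slug of the lot URL above. A hedged sketch of deriving it; the A<digits>-<digits>-<digits> pattern is inferred from the examples in these tests, not from the scraper's code.

import re

def lot_id_from_url(url):
    match = re.search(r'(A\d+-\d+-\d+)/?$', url)
    return match.group(1) if match else None

# lot_id_from_url(".../duo-bureau-160x168-cm-A1-28505-5") -> "A1-28505-5"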
@@ -1,32 +0,0 @@
#!/usr/bin/env python3
"""Test the updated scraper with GraphQL integration"""
import asyncio
import json
import sys
sys.path.insert(0, 'src')

from graphql_client import fetch_lot_bidding_data, format_bid_data


async def main():
    # Test with a known lot ID
    lot_id = "A1-28505-5"

    print(f"Testing GraphQL API with lot: {lot_id}\n")

    bidding_data = await fetch_lot_bidding_data(lot_id)

    if bidding_data:
        print("Raw GraphQL Response:")
        print("=" * 60)
        print(json.dumps(bidding_data, indent=2))

        print("\n\nFormatted Data:")
        print("=" * 60)
        formatted = format_bid_data(bidding_data)
        for key, value in formatted.items():
            print(f"  {key}: {value}")
    else:
        print("Failed to fetch bidding data")


if __name__ == "__main__":
    asyncio.run(main())
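A small extension sketch for checking several lots in one run; it assumes only what the test above shows, namely that fetch_lot_bidding_data takes a lot ID and returns a dict or None.

import asyncio
from graphql_client import fetch_lot_bidding_data

async def fetch_many(lot_ids):
    results = await asyncio.gather(*(fetch_lot_bidding_data(i) for i in lot_ids))
    # keep only the lots that returned data
    return {lot_id: data for lot_id, data in zip(lot_ids, results) if data}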
@@ -1,43 +0,0 @@
#!/usr/bin/env python3
"""Test scraping a single live lot page"""
import asyncio
import sqlite3
import sys
sys.path.insert(0, 'src')

from scraper import TroostwijkScraper


async def main():
    scraper = TroostwijkScraper()

    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        # Get a lot URL from the database
        conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
        cursor = conn.execute("SELECT url FROM lots LIMIT 1")
        row = cursor.fetchone()
        conn.close()

        if not row:
            print("No lots in database")
            return

        lot_url = row[0]
        print(f"Fetching: {lot_url}\n")

        result = await scraper.crawl_page(page, lot_url)

        if result:
            print("\nExtracted Data:")
            print(f"  current_bid: {result.get('current_bid')}")
            print(f"  bid_count: {result.get('bid_count')}")
            print(f"  closing_time: {result.get('closing_time')}")

        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
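A hedged variant for broader spot-checks: pull a random sample of URLs instead of the first row, then feed each one through crawl_page as above.

import sqlite3

def sample_urls(n=5):
    conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
    rows = conn.execute(
        "SELECT url FROM lots ORDER BY RANDOM() LIMIT ?", (n,)
    ).fetchall()
    conn.close()
    return [r[0] for r in rows]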
@@ -1,64 +0,0 @@
#!/usr/bin/env python3
"""Test the new fields extraction"""
import asyncio
import sqlite3
import sys
sys.path.insert(0, 'src')

from scraper import TroostwijkScraper


async def main():
    scraper = TroostwijkScraper()

    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        )

        # Test with a lot that has attributes
        lot_url = "https://www.troostwijkauctions.com/l/47-5kg-hexagon-dumbbell-%25282x%2529-A1-40668-34"

        print(f"Testing new fields with: {lot_url}\n")
        result = await scraper.crawl_page(page, lot_url)

        if result:
            print(f"\n{'='*60}")
            print("EXTRACTED FIELDS:")
            print(f"{'='*60}")
            print(f"Lot ID: {result.get('lot_id')}")
            print(f"Title: {result.get('title', '')[:50]}...")
            print(f"Status: {result.get('status')}")
            print(f"Brand: {result.get('brand')}")
            print(f"Model: {result.get('model')}")
            print(f"Viewing Time: {result.get('viewing_time', 'N/A')}")
            print(f"Pickup Date: {result.get('pickup_date', 'N/A')}")
            print(f"Attributes: {result.get('attributes_json', '')[:100]}...")

        await browser.close()

    # Nothing to verify in the database if extraction failed
    if not result:
        return

    # Verify database
    conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
    cursor = conn.execute("""
        SELECT status, brand, model, viewing_time, pickup_date
        FROM lots
        WHERE lot_id = ?
    """, (result.get('lot_id'),))
    row = cursor.fetchone()
    conn.close()

    if row:
        print(f"\n{'='*60}")
        print("DATABASE VERIFICATION:")
        print(f"{'='*60}")
        print(f"Status: {row[0]}")
        print(f"Brand: {row[1]}")
        print(f"Model: {row[2]}")
        print(f"Viewing: {(row[3] or 'N/A')[:100]}...")
        print(f"Pickup: {(row[4] or 'N/A')[:100]}...")


if __name__ == "__main__":
    asyncio.run(main())
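Since attributes_json is stored as a JSON string, reading it back is a plain json.loads. A sketch; the key names inside the JSON are whatever the scraper stored and are not confirmed by this test.

import json
import sqlite3

conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
row = conn.execute(
    "SELECT attributes_json FROM lots WHERE attributes_json IS NOT NULL LIMIT 1"
).fetchone()
conn.close()

if row and row[0]:
    for key, value in json.loads(row[0]).items():
        print(f"{key}: {value}")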
validate_data.py
@@ -1,306 +0,0 @@
"""
Validate data quality and completeness in the database.
Checks if scraped data matches expectations and API capabilities.
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

import sqlite3
from typing import Dict, List, Tuple
from cache import CacheManager

cache = CacheManager()
DB_PATH = cache.db_path


def pct(part, whole) -> str:
    """Format 'part / whole (percent)', guarding against empty tables."""
    part = part or 0
    return f"{part:,} / {whole:,} ({part / max(whole, 1) * 100:.1f}%)"


def get_db_stats() -> Dict:
    """Get comprehensive database statistics"""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    stats = {}

    # Total counts
    stats['total_auctions'] = cursor.execute("SELECT COUNT(*) FROM auctions").fetchone()[0]
    stats['total_lots'] = cursor.execute("SELECT COUNT(*) FROM lots").fetchone()[0]
    stats['total_images'] = cursor.execute("SELECT COUNT(*) FROM images").fetchone()[0]
    stats['total_bid_history'] = cursor.execute("SELECT COUNT(*) FROM bid_history").fetchone()[0]

    # Auctions completeness
    cursor.execute("""
        SELECT
            COUNT(*) as total,
            SUM(CASE WHEN title IS NOT NULL AND title != '' THEN 1 ELSE 0 END) as has_title,
            SUM(CASE WHEN lots_count IS NOT NULL THEN 1 ELSE 0 END) as has_lots_count,
            SUM(CASE WHEN closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_closing_time,
            SUM(CASE WHEN first_lot_closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_first_lot_closing
        FROM auctions
    """)
    row = cursor.fetchone()
    stats['auctions'] = {
        'total': row[0],
        'has_title': row[1],
        'has_lots_count': row[2],
        'has_closing_time': row[3],
        'has_first_lot_closing': row[4]
    }

    # Lots completeness - Core fields
    cursor.execute("""
        SELECT
            COUNT(*) as total,
            SUM(CASE WHEN title IS NOT NULL AND title != '' THEN 1 ELSE 0 END) as has_title,
            SUM(CASE WHEN current_bid IS NOT NULL THEN 1 ELSE 0 END) as has_current_bid,
            SUM(CASE WHEN starting_bid IS NOT NULL THEN 1 ELSE 0 END) as has_starting_bid,
            SUM(CASE WHEN minimum_bid IS NOT NULL THEN 1 ELSE 0 END) as has_minimum_bid,
            SUM(CASE WHEN bid_count IS NOT NULL AND bid_count > 0 THEN 1 ELSE 0 END) as has_bids,
            SUM(CASE WHEN closing_time IS NOT NULL THEN 1 ELSE 0 END) as has_closing_time,
            SUM(CASE WHEN status IS NOT NULL AND status != '' THEN 1 ELSE 0 END) as has_status
        FROM lots
    """)
    row = cursor.fetchone()
    stats['lots_core'] = {
        'total': row[0],
        'has_title': row[1],
        'has_current_bid': row[2],
        'has_starting_bid': row[3],
        'has_minimum_bid': row[4],
        'has_bids': row[5],
        'has_closing_time': row[6],
        'has_status': row[7]
    }

    # Lots completeness - Enriched fields
    cursor.execute("""
        SELECT
            COUNT(*) as total,
            SUM(CASE WHEN brand IS NOT NULL AND brand != '' THEN 1 ELSE 0 END) as has_brand,
            SUM(CASE WHEN model IS NOT NULL AND model != '' THEN 1 ELSE 0 END) as has_model,
            SUM(CASE WHEN manufacturer IS NOT NULL AND manufacturer != '' THEN 1 ELSE 0 END) as has_manufacturer,
            SUM(CASE WHEN year_manufactured IS NOT NULL THEN 1 ELSE 0 END) as has_year,
            SUM(CASE WHEN condition_score IS NOT NULL THEN 1 ELSE 0 END) as has_condition_score,
            SUM(CASE WHEN condition_description IS NOT NULL AND condition_description != '' THEN 1 ELSE 0 END) as has_condition_desc,
            SUM(CASE WHEN serial_number IS NOT NULL AND serial_number != '' THEN 1 ELSE 0 END) as has_serial,
            SUM(CASE WHEN damage_description IS NOT NULL AND damage_description != '' THEN 1 ELSE 0 END) as has_damage
        FROM lots
    """)
    row = cursor.fetchone()
    stats['lots_enriched'] = {
        'total': row[0],
        'has_brand': row[1],
        'has_model': row[2],
        'has_manufacturer': row[3],
        'has_year': row[4],
        'has_condition_score': row[5],
        'has_condition_desc': row[6],
        'has_serial': row[7],
        'has_damage': row[8]
    }

    # Lots completeness - Bid intelligence
    cursor.execute("""
        SELECT
            COUNT(*) as total,
            SUM(CASE WHEN first_bid_time IS NOT NULL THEN 1 ELSE 0 END) as has_first_bid_time,
            SUM(CASE WHEN last_bid_time IS NOT NULL THEN 1 ELSE 0 END) as has_last_bid_time,
            SUM(CASE WHEN bid_velocity IS NOT NULL THEN 1 ELSE 0 END) as has_bid_velocity,
            SUM(CASE WHEN bid_increment IS NOT NULL THEN 1 ELSE 0 END) as has_bid_increment
        FROM lots
    """)
    row = cursor.fetchone()
    stats['lots_bid_intelligence'] = {
        'total': row[0],
        'has_first_bid_time': row[1],
        'has_last_bid_time': row[2],
        'has_bid_velocity': row[3],
        'has_bid_increment': row[4]
    }

    # Bid history stats
    cursor.execute("""
        SELECT
            COUNT(DISTINCT lot_id) as lots_with_history,
            COUNT(*) as total_bids,
            SUM(CASE WHEN is_autobid = 1 THEN 1 ELSE 0 END) as autobids,
            SUM(CASE WHEN bidder_id IS NOT NULL THEN 1 ELSE 0 END) as has_bidder_id
        FROM bid_history
    """)
    row = cursor.fetchone()
    stats['bid_history'] = {
        'lots_with_history': row[0],
        'total_bids': row[1],
        'autobids': row[2],
        'has_bidder_id': row[3]
    }

    # Image stats
    cursor.execute("""
        SELECT
            COUNT(DISTINCT lot_id) as lots_with_images,
            COUNT(*) as total_images,
            SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded_images,
            SUM(CASE WHEN local_path IS NOT NULL THEN 1 ELSE 0 END) as has_local_path
        FROM images
    """)
    row = cursor.fetchone()
    stats['images'] = {
        'lots_with_images': row[0],
        'total_images': row[1],
        'downloaded_images': row[2],
        'has_local_path': row[3]
    }

    conn.close()
    return stats


def check_data_quality() -> List[Tuple[str, str, str]]:
    """Check for data quality issues"""
    issues = []
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    # Check for lots without a matching auction
    cursor.execute("""
        SELECT COUNT(*) FROM lots
        WHERE auction_id NOT IN (SELECT auction_id FROM auctions)
    """)
    orphaned_lots = cursor.fetchone()[0]
    if orphaned_lots > 0:
        issues.append(("ERROR", "Orphaned Lots", f"{orphaned_lots} lots without matching auction"))

    # Check for lots with bids but no bid history
    cursor.execute("""
        SELECT COUNT(*) FROM lots
        WHERE bid_count > 0
        AND lot_id NOT IN (SELECT DISTINCT lot_id FROM bid_history)
    """)
    missing_history = cursor.fetchone()[0]
    if missing_history > 0:
        issues.append(("WARNING", "Missing Bid History", f"{missing_history} lots have bids but no bid history records"))

    # Check for lots whose closing time is in the past but are not marked closed
    cursor.execute("""
        SELECT COUNT(*) FROM lots
        WHERE closing_time IS NOT NULL
        AND closing_time < datetime('now')
        AND status NOT LIKE '%gesloten%'
    """)
    past_closing = cursor.fetchone()[0]
    if past_closing > 0:
        issues.append(("INFO", "Past Closing Time", f"{past_closing} lots have a closing time in the past"))

    # Check for duplicate lot_ids
    cursor.execute("""
        SELECT lot_id, COUNT(*) FROM lots
        GROUP BY lot_id
        HAVING COUNT(*) > 1
    """)
    duplicates = cursor.fetchall()
    if duplicates:
        issues.append(("ERROR", "Duplicate Lot IDs", f"{len(duplicates)} duplicate lot_id values found"))

    # Check for lots without images
    cursor.execute("""
        SELECT COUNT(*) FROM lots
        WHERE lot_id NOT IN (SELECT DISTINCT lot_id FROM images)
    """)
    no_images = cursor.fetchone()[0]
    if no_images > 0:
        issues.append(("WARNING", "No Images", f"{no_images} lots have no images"))

    conn.close()
    return issues


def print_validation_report():
    """Print comprehensive validation report"""
    print("=" * 80)
    print("DATABASE VALIDATION REPORT")
    print("=" * 80)
    print()

    stats = get_db_stats()

    # Overall counts
    print("OVERALL COUNTS:")
    print(f"  Auctions: {stats['total_auctions']:,}")
    print(f"  Lots: {stats['total_lots']:,}")
    print(f"  Images: {stats['total_images']:,}")
    print(f"  Bid History Records: {stats['total_bid_history']:,}")
    print()

    # Auctions completeness
    print("AUCTIONS COMPLETENESS:")
    a = stats['auctions']
    print(f"  Title: {pct(a['has_title'], a['total'])}")
    print(f"  Lots Count: {pct(a['has_lots_count'], a['total'])}")
    print(f"  Closing Time: {pct(a['has_closing_time'], a['total'])}")
    print(f"  First Lot Closing: {pct(a['has_first_lot_closing'], a['total'])}")
    print()

    # Lots core completeness
    print("LOTS CORE FIELDS:")
    l = stats['lots_core']
    print(f"  Title: {pct(l['has_title'], l['total'])}")
    print(f"  Current Bid: {pct(l['has_current_bid'], l['total'])}")
    print(f"  Starting Bid: {pct(l['has_starting_bid'], l['total'])}")
    print(f"  Minimum Bid: {pct(l['has_minimum_bid'], l['total'])}")
    print(f"  Has Bids (>0): {pct(l['has_bids'], l['total'])}")
    print(f"  Closing Time: {pct(l['has_closing_time'], l['total'])}")
    print(f"  Status: {pct(l['has_status'], l['total'])}")
    print()

    # Lots enriched fields
    print("LOTS ENRICHED FIELDS:")
    e = stats['lots_enriched']
    print(f"  Brand: {pct(e['has_brand'], e['total'])}")
    print(f"  Model: {pct(e['has_model'], e['total'])}")
    print(f"  Manufacturer: {pct(e['has_manufacturer'], e['total'])}")
    print(f"  Year: {pct(e['has_year'], e['total'])}")
    print(f"  Condition Score: {pct(e['has_condition_score'], e['total'])}")
    print(f"  Condition Desc: {pct(e['has_condition_desc'], e['total'])}")
    print(f"  Serial Number: {pct(e['has_serial'], e['total'])}")
    print(f"  Damage Desc: {pct(e['has_damage'], e['total'])}")
    print()

    # Bid intelligence
    print("LOTS BID INTELLIGENCE:")
    b = stats['lots_bid_intelligence']
    print(f"  First Bid Time: {pct(b['has_first_bid_time'], b['total'])}")
    print(f"  Last Bid Time: {pct(b['has_last_bid_time'], b['total'])}")
    print(f"  Bid Velocity: {pct(b['has_bid_velocity'], b['total'])}")
    print(f"  Bid Increment: {pct(b['has_bid_increment'], b['total'])}")
    print()

    # Bid history
    print("BID HISTORY:")
    h = stats['bid_history']
    print(f"  Lots with History: {h['lots_with_history']:,}")
    print(f"  Total Bid Records: {h['total_bids']:,}")
    print(f"  Autobids: {pct(h['autobids'], h['total_bids'])}")
    print(f"  Has Bidder ID: {pct(h['has_bidder_id'], h['total_bids'])}")
    print()

    # Images
    print("IMAGES:")
    i = stats['images']
    print(f"  Lots with Images: {i['lots_with_images']:,}")
    print(f"  Total Images: {i['total_images']:,}")
    print(f"  Downloaded: {pct(i['downloaded_images'], i['total_images'])}")
    print(f"  Has Local Path: {pct(i['has_local_path'], i['total_images'])}")
    print()

    # Data quality issues
    print("=" * 80)
    print("DATA QUALITY ISSUES:")
    print("=" * 80)
    issues = check_data_quality()
    if issues:
        for severity, category, message in issues:
            print(f"  [{severity}] {category}: {message}")
    else:
        print("  No issues found!")
    print()


if __name__ == "__main__":
    print_validation_report()
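If the report should also gate an automated job, a possible variant of the entry point (not in the original script) exits non-zero when any ERROR-severity issue is found:

if __name__ == "__main__":
    print_validation_report()
    # fail the run if check_data_quality reported any ERROR-severity issue
    has_errors = any(severity == "ERROR" for severity, _, _ in check_data_quality())
    sys.exit(1 if has_errors else 0)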
@@ -1,92 +0,0 @@
#!/usr/bin/env python3
"""
Verification script to check image download status and duplicates.
Run this after deployment to verify the scraper is working correctly.
"""
import sqlite3
import sys
from pathlib import Path

DB_PATH = "/mnt/okcomputer/output/cache.db"


def verify_database():
    """Run verification queries on the database"""

    if not Path(DB_PATH).exists():
        print(f"❌ Database not found: {DB_PATH}")
        sys.exit(1)

    conn = sqlite3.connect(DB_PATH)

    print("=" * 60)
    print("IMAGE DOWNLOAD VERIFICATION")
    print("=" * 60)

    # Check download success rate
    print("\n[*] Download Success Rate:")
    cursor = conn.execute("""
        SELECT
            COUNT(*) as total_images,
            SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded,
            SUM(CASE WHEN downloaded = 0 THEN 1 ELSE 0 END) as failed,
            ROUND(100.0 * SUM(downloaded) / COUNT(*), 2) as success_rate
        FROM images
    """)
    row = cursor.fetchone()
    print(f"    Total images: {row[0]:,}")
    print(f"    Downloaded: {row[1]:,}")
    print(f"    Not downloaded: {row[2]:,}")
    print(f"    Success rate: {row[3]}%")

    # Check for duplicates
    print("\n[*] Duplicate Check:")
    cursor = conn.execute("""
        SELECT lot_id, url, COUNT(*) as dup_count
        FROM images
        GROUP BY lot_id, url
        HAVING COUNT(*) > 1
        LIMIT 5
    """)
    duplicates = cursor.fetchall()

    if duplicates:
        print("    [!] Found duplicate (lot_id, url) pairs (showing up to 5):")
        for lot_id, url, count in duplicates:
            print(f"        {lot_id}: {url[:50]}... (x{count})")
    else:
        print("    [+] No duplicates found!")

    # Verify file system
    print("\n[*] File System Verification:")
    cursor = conn.execute("""
        SELECT COUNT(*)
        FROM images
        WHERE downloaded = 1
        AND local_path IS NOT NULL
        AND local_path != ''
    """)
    files_with_path = cursor.fetchone()[0]
    print(f"    Images with local_path: {files_with_path:,}")

    # Sample some downloaded images and confirm the files exist on disk
    print("\n[*] Sample Downloaded Images:")
    cursor = conn.execute("""
        SELECT lot_id, local_path
        FROM images
        WHERE downloaded = 1
        AND local_path IS NOT NULL
        LIMIT 5
    """)
    samples = cursor.fetchall()
    for lot_id, path in samples:
        exists = "[+]" if Path(path).exists() else "[!]"
        print(f"    {exists} {lot_id}: {path}")

    conn.close()

    print("\n" + "=" * 60)
    print("VERIFICATION COMPLETE")
    print("=" * 60)


if __name__ == "__main__":
    verify_database()
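If the duplicate check above ever reports hits, one cleanup option is a rowid-based dedup. A hedged sketch, assuming the images table is an ordinary rowid table (i.e. not created WITHOUT ROWID):

import sqlite3

conn = sqlite3.connect("/mnt/okcomputer/output/cache.db")
# keep the earliest row per (lot_id, url) pair and drop the rest
conn.execute("""
    DELETE FROM images
    WHERE rowid NOT IN (
        SELECT MIN(rowid) FROM images GROUP BY lot_id, url
    )
""")
conn.commit()
conn.close()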