enrich data

This commit is contained in:
Tour
2025-12-07 01:26:48 +01:00
parent bb7f4bbe9d
commit d09ee5574f
14 changed files with 1221 additions and 7 deletions

64
test_new_fields.py Normal file
View File

@@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""Test the new fields extraction"""
import asyncio
import sys
sys.path.insert(0, 'src')
from scraper import TroostwijkScraper
async def main(lot_url=None, db_path=None):
    """Crawl one lot page and print the newly extracted fields.

    The lot is fetched with ``TroostwijkScraper.crawl_page`` in a headless
    Chromium page and the extracted fields are printed. The same lot is then
    looked up in the ``lots`` table of the SQLite cache and the stored
    columns are printed for comparison.

    Args:
        lot_url: Lot page to crawl. Defaults to the lot this script was
            written against (one that has attributes).
        db_path: SQLite file to read the stored row from. Defaults to the
            path this script has always used.
            NOTE(review): presumably the file the scraper writes to --
            confirm against the scraper's cache configuration.

    Returns:
        None. All results are reported on stdout.
    """
    if lot_url is None:
        lot_url = "https://www.troostwijkauctions.com/l/47-5kg-hexagon-dumbbell-%25282x%2529-A1-40668-34"
    if db_path is None:
        db_path = '/mnt/okcomputer/output/cache.db'
    rule = '=' * 60

    scraper = TroostwijkScraper()

    # Imported lazily so the module can be loaded without Playwright present.
    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            )
            print(f"Testing new fields with: {lot_url}\n")
            result = await scraper.crawl_page(page, lot_url)
        finally:
            # Close the browser even when page creation or the crawl fails.
            await browser.close()

    if not result:
        # Without a result there is no lot_id to look up; the original code
        # crashed here with AttributeError on ``result.get``.
        print("No data extracted; skipping database verification.")
        return

    # ``or ''`` guards against keys that are present but hold None, which
    # ``dict.get(key, '')`` does not protect against before slicing.
    title = result.get('title') or ''
    attributes = result.get('attributes_json') or ''

    print(f"\n{rule}")
    print("EXTRACTED FIELDS:")
    print(f"{rule}")
    print(f"Lot ID: {result.get('lot_id')}")
    print(f"Title: {title[:50]}...")
    print(f"Status: {result.get('status')}")
    print(f"Brand: {result.get('brand')}")
    print(f"Model: {result.get('model')}")
    print(f"Viewing Time: {result.get('viewing_time', 'N/A')}")
    print(f"Pickup Date: {result.get('pickup_date', 'N/A')}")
    print(f"Attributes: {attributes[:100]}...")

    # Verify database
    import sqlite3

    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            """
            SELECT status, brand, model, viewing_time, pickup_date
            FROM lots
            WHERE lot_id = ?
            """,
            (result.get('lot_id'),),
        ).fetchone()
    finally:
        # Release the connection even if the query raises.
        conn.close()

    if not row:
        print(f"\nNo row found in database for lot_id {result.get('lot_id')!r}.")
        return

    print(f"\n{rule}")
    print("DATABASE VERIFICATION:")
    print(f"{rule}")
    print(f"Status: {row[0]}")
    print(f"Brand: {row[1]}")
    print(f"Model: {row[2]}")
    print(f"Viewing: {row[3][:100] if row[3] else 'N/A'}...")
    print(f"Pickup: {row[4][:100] if row[4] else 'N/A'}...")
if __name__ == "__main__":
    # Only drive the coroutine when executed as a script, not on import.
    asyncio.run(main())