From 8a2b005d4ac755953696a16cf682366ebc88c421 Mon Sep 17 00:00:00 2001
From: Tour
Date: Tue, 9 Dec 2025 07:11:09 +0100
Subject: [PATCH] Preserve parser fields when merging API bid data; add field tests

Merge API bidding data into page_data without clobbering fields the
__NEXT_DATA__ parser already filled (description, category, images),
carry closing_time and status through the cache path, drop the unused
estimatedFullPrice block from LOT_BIDDING_QUERY, and add two tests that
validate field coverage.
---
 src/cache.py                    |   2 +-
 src/graphql_client.py           |  11 --
 src/scraper.py                  |  18 ++-
 test/test_description_simple.py |  51 ++++++++
 test/test_missing_fields.py     | 208 ++++++++++++++++++++++++++++++++
 5 files changed, 276 insertions(+), 14 deletions(-)
 create mode 100644 test/test_description_simple.py
 create mode 100644 test/test_missing_fields.py

diff --git a/src/cache.py b/src/cache.py
index 43081cf..b3ff940 100644
--- a/src/cache.py
+++ b/src/cache.py
@@ -315,7 +315,7 @@ class CacheManager:
                 (url, compressed_content, time.time(), status_code)
             )
             conn.commit()
-            print(f" → Cached: {url} (compressed {ratio:.1f}%)")
+            print(f" -> Cached: {url} (compressed {ratio:.1f}%)")
 
     def clear_old(self, max_age_hours: int = 168):
         """Clear old cache entries to prevent database bloat"""
diff --git a/src/graphql_client.py b/src/graphql_client.py
index b2991c2..654d4bd 100644
--- a/src/graphql_client.py
+++ b/src/graphql_client.py
@@ -31,17 +31,6 @@ query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!)
 LOT_BIDDING_QUERY = """
 query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
   lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
-    estimatedFullPrice {
-      min {
-        cents
-        currency
-      }
-      max {
-        cents
-        currency
-      }
-      saleTerm
-    }
     lot {
       id
       displayId
diff --git a/src/scraper.py b/src/scraper.py
index 3af4b70..4a256ab 100644
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -358,7 +358,7 @@ class TroostwijkScraper:
         conn = sqlite3.connect(self.cache.db_path)
         cursor = conn.cursor()
         cursor.execute("""
-            SELECT followers_count, estimated_min_price, current_bid, bid_count, closing_time
+            SELECT followers_count, estimated_min_price, current_bid, bid_count, closing_time, status
             FROM lots WHERE lot_id = ?
         """, (lot_id,))
         existing = cursor.fetchone()
@@ -377,6 +377,8 @@ class TroostwijkScraper:
                 page_data['estimated_min_price'] = existing[1]
                 page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
                 page_data['bid_count'] = existing[3] or 0
+                page_data['closing_time'] = existing[4]  # Add closing_time
+                page_data['status'] = existing[5] or ''  # Add status
                 bidding_data = None
                 bid_history_data = None
             else:
@@ -439,7 +441,19 @@ class TroostwijkScraper:
 
         if bidding_data:
             formatted_data = format_bid_data(bidding_data)
-            page_data.update(formatted_data)
+
+            # Merge data intelligently - don't overwrite existing fields
+            # Parser (from __NEXT_DATA__) has: description, category, images
+            # API has: current_bid, bid_count, closing_time, status, followers, estimates
+            # Keep parser data, enhance with API data
+            for key, value in formatted_data.items():
+                # Only update if current value is missing/empty
+                current_value = page_data.get(key)
+                if current_value is None or current_value == '' or current_value == 0 or current_value == 'No bids':
+                    page_data[key] = value
+                # Special case: always update bid_count if API has higher value
+                elif key == 'bid_count' and isinstance(value, int) and value > current_value:
+                    page_data[key] = value
 
         # Enhanced logging with new intelligence fields
         print(f" Bid: {page_data.get('current_bid', 'N/A')}")
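Note on the merge loop above: values already parsed from __NEXT_DATA__ win
unless they are one of the "missing" sentinels (None, '', 0, 'No bids');
bid_count is the one field the API is allowed to raise on its own. A minimal
standalone sketch of the same precedence rule, for reference only (the helper
name merge_lot_fields and the sample dicts are illustrative, not part of the
patch):

    def merge_lot_fields(page_data: dict, api_data: dict) -> dict:
        """Overlay API fields onto parser fields without clobbering real values."""
        MISSING = (None, '', 0, 'No bids')  # sentinels meaning "parser had nothing"
        merged = dict(page_data)
        for key, value in api_data.items():
            current = merged.get(key)
            if current in MISSING:
                merged[key] = value
            elif key == 'bid_count' and isinstance(value, int) and value > current:
                merged[key] = value  # the API may know about newer bids
        return merged

    # merge_lot_fields({'description': 'Shower door', 'bid_count': 2},
    #                  {'description': '', 'bid_count': 5, 'status': 'open'})
    # keeps the parser description, raises bid_count to 5, and adds status.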
""", (lot_id,)) existing = cursor.fetchone() @@ -377,6 +377,8 @@ class TroostwijkScraper: page_data['estimated_min_price'] = existing[1] page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids') page_data['bid_count'] = existing[3] or 0 + page_data['closing_time'] = existing[4] # Add closing_time + page_data['status'] = existing[5] or '' # Add status bidding_data = None bid_history_data = None else: @@ -439,7 +441,19 @@ class TroostwijkScraper: if bidding_data: formatted_data = format_bid_data(bidding_data) - page_data.update(formatted_data) + + # Merge data intelligently - don't overwrite existing fields + # Parser (from __NEXT_DATA__) has: description, category, images + # API has: current_bid, bid_count, closing_time, status, followers, estimates + # Keep parser data, enhance with API data + for key, value in formatted_data.items(): + # Only update if current value is missing/empty + current_value = page_data.get(key) + if current_value is None or current_value == '' or current_value == 0 or current_value == 'No bids': + page_data[key] = value + # Special case: always update bid_count if API has higher value + elif key == 'bid_count' and isinstance(value, int) and value > current_value: + page_data[key] = value # Enhanced logging with new intelligence fields print(f" Bid: {page_data.get('current_bid', 'N/A')}") diff --git a/test/test_description_simple.py b/test/test_description_simple.py new file mode 100644 index 0000000..f167a79 --- /dev/null +++ b/test/test_description_simple.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +import sys +import os +parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) +sys.path.insert(0, parent_dir) +sys.path.insert(0, os.path.join(parent_dir, 'src')) + +import asyncio +from scraper import TroostwijkScraper +import config +import os + +async def test(): + # Force online mode + os.environ['SCAEV_OFFLINE'] = '0' + config.OFFLINE = False + + scraper = TroostwijkScraper() + scraper.offline = False + + from playwright.async_api import async_playwright + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + context = await browser.new_context() + page = await context.new_page() + + url = "https://www.troostwijkauctions.com/l/used-dometic-seastar-tfxchx8641p-top-mount-engine-control-liver-A1-39684-12" + + # Add debug logging to parser + original_parse = scraper.parser.parse_page + def debug_parse(content, url): + result = original_parse(content, url) + if result: + print(f"PARSER OUTPUT:") + print(f" description: {result.get('description', 'NONE')[:100] if result.get('description') else 'EMPTY'}") + print(f" closing_time: {result.get('closing_time', 'NONE')}") + print(f" bid_count: {result.get('bid_count', 'NONE')}") + return result + scraper.parser.parse_page = debug_parse + + page_data = await scraper.crawl_page(page, url) + + await browser.close() + + print(f"\nFINAL page_data:") + print(f" description: {page_data.get('description', 'NONE')[:100] if page_data and page_data.get('description') else 'EMPTY'}") + print(f" closing_time: {page_data.get('closing_time', 'NONE') if page_data else 'NONE'}") + print(f" bid_count: {page_data.get('bid_count', 'NONE') if page_data else 'NONE'}") + print(f" status: {page_data.get('status', 'NONE') if page_data else 'NONE'}") + +asyncio.run(test()) diff --git a/test/test_missing_fields.py b/test/test_missing_fields.py new file mode 100644 index 0000000..14c417a --- /dev/null +++ b/test/test_missing_fields.py @@ -0,0 +1,208 @@ +#!/usr/bin/env 
diff --git a/test/test_missing_fields.py b/test/test_missing_fields.py
new file mode 100644
index 0000000..14c417a
--- /dev/null
+++ b/test/test_missing_fields.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+"""
+Test to validate that all expected fields are populated after scraping
+"""
+import sys
+import os
+import asyncio
+import sqlite3
+
+# Add parent and src directory to path
+parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+sys.path.insert(0, parent_dir)
+sys.path.insert(0, os.path.join(parent_dir, 'src'))
+
+# Force online mode before importing
+os.environ['SCAEV_OFFLINE'] = '0'
+
+from scraper import TroostwijkScraper
+import config
+
+
+async def test_lot_has_all_fields():
+    """Test that a lot page has all expected fields populated"""
+
+    print("\n" + "="*60)
+    print("TEST: Lot has all required fields")
+    print("="*60)
+
+    # Use the example lot from user
+    test_url = "https://www.troostwijkauctions.com/l/radaway-idea-black-dwj-doucheopstelling-A1-39956-18"
+
+    # Ensure we're not in offline mode
+    config.OFFLINE = False
+
+    scraper = TroostwijkScraper()
+    scraper.offline = False
+
+    print(f"\n[1] Scraping: {test_url}")
+
+    # Start playwright and scrape
+    from playwright.async_api import async_playwright
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        context = await browser.new_context()
+        page = await context.new_page()
+
+        page_data = await scraper.crawl_page(page, test_url)
+
+        await browser.close()
+
+    if not page_data:
+        print(" [FAIL] No data returned")
+        return False
+
+    print(f"\n[2] Validating fields...")
+
+    # Fields that MUST have values (critical for auction functionality)
+    required_fields = {
+        'closing_time': 'Closing time',
+        'current_bid': 'Current bid',
+        'bid_count': 'Bid count',
+        'status': 'Status',
+    }
+
+    # Fields that SHOULD have values but may legitimately be empty
+    optional_fields = {
+        'description': 'Description',
+    }
+
+    missing_fields = []
+    empty_fields = []
+    optional_missing = []
+
+    # Check required fields
+    for field, label in required_fields.items():
+        value = page_data.get(field)
+
+        if value is None:
+            missing_fields.append(label)
+            print(f" [FAIL] {label}: MISSING (None)")
+        elif value == '' or value == 0 or value == 'No bids':
+            # Special case: 'No bids' is only acceptable if bid_count is 0
+            if field == 'current_bid' and page_data.get('bid_count', 0) == 0:
+                print(f" [PASS] {label}: '{value}' (acceptable - no bids)")
+            else:
+                empty_fields.append(label)
+                print(f" [FAIL] {label}: EMPTY ('{value}')")
+        else:
+            print(f" [PASS] {label}: {value}")
+
+    # Check optional fields (warn but don't fail)
+    for field, label in optional_fields.items():
+        value = page_data.get(field)
+        if value is None or value == '':
+            optional_missing.append(label)
+            print(f" [WARN] {label}: EMPTY (may be legitimate)")
+        else:
+            print(f" [PASS] {label}: {value[:50]}...")
+
+    # Check database
+    print(f"\n[3] Checking database entry...")
+    conn = sqlite3.connect(scraper.cache.db_path)
+    cursor = conn.cursor()
+    cursor.execute("""
+        SELECT closing_time, current_bid, bid_count, description, status
+        FROM lots WHERE url = ?
+ """, (test_url,)) + row = cursor.fetchone() + conn.close() + + if row: + db_closing, db_bid, db_count, db_desc, db_status = row + print(f" DB closing_time: {db_closing or 'EMPTY'}") + print(f" DB current_bid: {db_bid or 'EMPTY'}") + print(f" DB bid_count: {db_count}") + print(f" DB description: {db_desc[:50] if db_desc else 'EMPTY'}...") + print(f" DB status: {db_status or 'EMPTY'}") + + # Verify DB matches page_data + if db_closing != page_data.get('closing_time'): + print(f" [WARN] DB closing_time doesn't match page_data") + if db_count != page_data.get('bid_count'): + print(f" [WARN] DB bid_count doesn't match page_data") + else: + print(f" [WARN] No database entry found") + + print(f"\n" + "="*60) + if missing_fields or empty_fields: + print(f"[FAIL] Missing fields: {', '.join(missing_fields)}") + print(f"[FAIL] Empty fields: {', '.join(empty_fields)}") + if optional_missing: + print(f"[WARN] Optional missing: {', '.join(optional_missing)}") + return False + else: + print("[PASS] All required fields are populated") + if optional_missing: + print(f"[WARN] Optional missing: {', '.join(optional_missing)}") + return True + + +async def test_lot_with_description(): + """Test that a lot with description preserves it""" + + print("\n" + "="*60) + print("TEST: Lot with description") + print("="*60) + + # Use a lot known to have description + test_url = "https://www.troostwijkauctions.com/l/used-dometic-seastar-tfxchx8641p-top-mount-engine-control-liver-A1-39684-12" + + config.OFFLINE = False + + scraper = TroostwijkScraper() + scraper.offline = False + + print(f"\n[1] Scraping: {test_url}") + + from playwright.async_api import async_playwright + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + context = await browser.new_context() + page = await context.new_page() + + page_data = await scraper.crawl_page(page, test_url) + + await browser.close() + + if not page_data: + print(" [FAIL] No data returned") + return False + + print(f"\n[2] Checking description...") + description = page_data.get('description', '') + + if not description or description == '': + print(f" [FAIL] Description is empty") + return False + else: + print(f" [PASS] Description: {description[:100]}...") + return True + + +async def main(): + """Run all tests""" + print("\n" + "="*60) + print("MISSING FIELDS TEST SUITE") + print("="*60) + + test1 = await test_lot_has_all_fields() + test2 = await test_lot_with_description() + + print("\n" + "="*60) + if test1 and test2: + print("ALL TESTS PASSED") + else: + print("SOME TESTS FAILED") + if not test1: + print(" - test_lot_has_all_fields FAILED") + if not test2: + print(" - test_lot_with_description FAILED") + print("="*60 + "\n") + + return 0 if (test1 and test2) else 1 + + +if __name__ == '__main__': + exit_code = asyncio.run(main()) + sys.exit(exit_code)