move.venv
@@ -315,7 +315,7 @@ class CacheManager:
             (url, compressed_content, time.time(), status_code)
         )
         conn.commit()
-        print(f" → Cached: {url} (compressed {ratio:.1f}%)")
+        print(f" -> Cached: {url} (compressed {ratio:.1f}%)")

    def clear_old(self, max_age_hours: int = 168):
        """Clear old cache entries to prevent database bloat"""
@@ -31,17 +31,6 @@ query AuctionData($auctionId: TbaUuid!, $locale: String!, $platform: Platform!)
 LOT_BIDDING_QUERY = """
 query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
     lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
-        estimatedFullPrice {
-            min {
-                cents
-                currency
-            }
-            max {
-                cents
-                currency
-            }
-            saleTerm
-        }
         lot {
             id
             displayId
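Note (not part of the commit): LOT_BIDDING_QUERY is presumably posted to the site's GraphQL endpoint with the three variables it declares. A minimal sketch of such a call follows; the endpoint URL, the "WEB" platform value, and the function name are assumptions for illustration only.

import requests

def fetch_lot_bidding(query: str, lot_display_id: str, locale: str = "en", platform: str = "WEB"):
    """Post a GraphQL query (e.g. LOT_BIDDING_QUERY) and return the lotDetails payload."""
    # Endpoint and platform value are guesses; the real scraper may use different values or headers.
    endpoint = "https://www.troostwijkauctions.com/graphql"
    payload = {
        "query": query,
        "variables": {
            "lotDisplayId": lot_display_id,
            "locale": locale,
            "platform": platform,
        },
    }
    resp = requests.post(endpoint, json=payload, timeout=30)
    resp.raise_for_status()
    return resp.json().get("data", {}).get("lotDetails")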
@@ -358,7 +358,7 @@ class TroostwijkScraper:
         conn = sqlite3.connect(self.cache.db_path)
         cursor = conn.cursor()
         cursor.execute("""
-            SELECT followers_count, estimated_min_price, current_bid, bid_count, closing_time
+            SELECT followers_count, estimated_min_price, current_bid, bid_count, closing_time, status
             FROM lots WHERE lot_id = ?
         """, (lot_id,))
         existing = cursor.fetchone()
@@ -377,6 +377,8 @@ class TroostwijkScraper:
             page_data['estimated_min_price'] = existing[1]
             page_data['current_bid'] = existing[2] or page_data.get('current_bid', 'No bids')
             page_data['bid_count'] = existing[3] or 0
+            page_data['closing_time'] = existing[4]  # Add closing_time
+            page_data['status'] = existing[5] or ''  # Add status
             bidding_data = None
             bid_history_data = None
         else:
@@ -439,7 +441,19 @@ class TroostwijkScraper:

         if bidding_data:
             formatted_data = format_bid_data(bidding_data)
-            page_data.update(formatted_data)
+            # Merge data intelligently - don't overwrite existing fields
+            # Parser (from __NEXT_DATA__) has: description, category, images
+            # API has: current_bid, bid_count, closing_time, status, followers, estimates
+            # Keep parser data, enhance with API data
+            for key, value in formatted_data.items():
+                # Only update if current value is missing/empty
+                current_value = page_data.get(key)
+                if current_value is None or current_value == '' or current_value == 0 or current_value == 'No bids':
+                    page_data[key] = value
+                # Special case: always update bid_count if API has higher value
+                elif key == 'bid_count' and isinstance(value, int) and value > current_value:
+                    page_data[key] = value

         # Enhanced logging with new intelligence fields
         print(f" Bid: {page_data.get('current_bid', 'N/A')}")
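For readers skimming the diff, the merge rule above can be exercised in isolation: parser-derived fields are kept unless they are empty, and bid_count may only move upward. A minimal standalone sketch (the function name and the sample dictionaries are illustrative, not part of the commit):

def merge_api_data(page_data: dict, formatted_data: dict) -> dict:
    """Fill gaps left by the __NEXT_DATA__ parser with API values; never overwrite real data."""
    for key, value in formatted_data.items():
        current_value = page_data.get(key)
        if current_value is None or current_value == '' or current_value == 0 or current_value == 'No bids':
            page_data[key] = value
        elif key == 'bid_count' and isinstance(value, int) and value > current_value:
            page_data[key] = value
    return page_data

# Example: the parser's description survives, API values fill the bid fields.
page = {'description': 'Shower enclosure', 'current_bid': 'No bids', 'bid_count': 0}
api = {'description': '', 'current_bid': 'EUR 35', 'bid_count': 3}
merge_api_data(page, api)
assert page['description'] == 'Shower enclosure'
assert page['current_bid'] == 'EUR 35' and page['bid_count'] == 3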
new file: test/test_description_simple.py (51 lines)
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+import sys
+import os
+parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+sys.path.insert(0, parent_dir)
+sys.path.insert(0, os.path.join(parent_dir, 'src'))
+
+import asyncio
+from scraper import TroostwijkScraper
+import config
+import os
+
+async def test():
+    # Force online mode
+    os.environ['SCAEV_OFFLINE'] = '0'
+    config.OFFLINE = False
+
+    scraper = TroostwijkScraper()
+    scraper.offline = False
+
+    from playwright.async_api import async_playwright
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        context = await browser.new_context()
+        page = await context.new_page()
+
+        url = "https://www.troostwijkauctions.com/l/used-dometic-seastar-tfxchx8641p-top-mount-engine-control-liver-A1-39684-12"
+
+        # Add debug logging to parser
+        original_parse = scraper.parser.parse_page
+        def debug_parse(content, url):
+            result = original_parse(content, url)
+            if result:
+                print(f"PARSER OUTPUT:")
+                print(f" description: {result.get('description', 'NONE')[:100] if result.get('description') else 'EMPTY'}")
+                print(f" closing_time: {result.get('closing_time', 'NONE')}")
+                print(f" bid_count: {result.get('bid_count', 'NONE')}")
+            return result
+        scraper.parser.parse_page = debug_parse
+
+        page_data = await scraper.crawl_page(page, url)
+
+        await browser.close()
+
+    print(f"\nFINAL page_data:")
+    print(f" description: {page_data.get('description', 'NONE')[:100] if page_data and page_data.get('description') else 'EMPTY'}")
+    print(f" closing_time: {page_data.get('closing_time', 'NONE') if page_data else 'NONE'}")
+    print(f" bid_count: {page_data.get('bid_count', 'NONE') if page_data else 'NONE'}")
+    print(f" status: {page_data.get('status', 'NONE') if page_data else 'NONE'}")
+
+asyncio.run(test())
new file: test/test_missing_fields.py (208 lines)
@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+"""
+Test to validate that all expected fields are populated after scraping
+"""
+import sys
+import os
+import asyncio
+import sqlite3
+
+# Add parent and src directory to path
+parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+sys.path.insert(0, parent_dir)
+sys.path.insert(0, os.path.join(parent_dir, 'src'))
+
+# Force online mode before importing
+os.environ['SCAEV_OFFLINE'] = '0'
+
+from scraper import TroostwijkScraper
+import config
+
+
+async def test_lot_has_all_fields():
+    """Test that a lot page has all expected fields populated"""
+
+    print("\n" + "="*60)
+    print("TEST: Lot has all required fields")
+    print("="*60)
+
+    # Use the example lot from user
+    test_url = "https://www.troostwijkauctions.com/l/radaway-idea-black-dwj-doucheopstelling-A1-39956-18"
+
+    # Ensure we're not in offline mode
+    config.OFFLINE = False
+
+    scraper = TroostwijkScraper()
+    scraper.offline = False
+
+    print(f"\n[1] Scraping: {test_url}")
+
+    # Start playwright and scrape
+    from playwright.async_api import async_playwright
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        context = await browser.new_context()
+        page = await context.new_page()
+
+        page_data = await scraper.crawl_page(page, test_url)
+
+        await browser.close()
+
+    if not page_data:
+        print(" [FAIL] No data returned")
+        return False
+
+    print(f"\n[2] Validating fields...")
+
+    # Fields that MUST have values (critical for auction functionality)
+    required_fields = {
+        'closing_time': 'Closing time',
+        'current_bid': 'Current bid',
+        'bid_count': 'Bid count',
+        'status': 'Status',
+    }
+
+    # Fields that SHOULD have values but may legitimately be empty
+    optional_fields = {
+        'description': 'Description',
+    }
+
+    missing_fields = []
+    empty_fields = []
+    optional_missing = []
+
+    # Check required fields
+    for field, label in required_fields.items():
+        value = page_data.get(field)
+
+        if value is None:
+            missing_fields.append(label)
+            print(f" [FAIL] {label}: MISSING (None)")
+        elif value == '' or value == 0 or value == 'No bids':
+            # Special case: 'No bids' is only acceptable if bid_count is 0
+            if field == 'current_bid' and page_data.get('bid_count', 0) == 0:
+                print(f" [PASS] {label}: '{value}' (acceptable - no bids)")
+            else:
+                empty_fields.append(label)
+                print(f" [FAIL] {label}: EMPTY ('{value}')")
+        else:
+            print(f" [PASS] {label}: {value}")
+
+    # Check optional fields (warn but don't fail)
+    for field, label in optional_fields.items():
+        value = page_data.get(field)
+        if value is None or value == '':
+            optional_missing.append(label)
+            print(f" [WARN] {label}: EMPTY (may be legitimate)")
+        else:
+            print(f" [PASS] {label}: {value[:50]}...")
+
+    # Check database
+    print(f"\n[3] Checking database entry...")
+    conn = sqlite3.connect(scraper.cache.db_path)
+    cursor = conn.cursor()
+    cursor.execute("""
+        SELECT closing_time, current_bid, bid_count, description, status
+        FROM lots WHERE url = ?
+    """, (test_url,))
+    row = cursor.fetchone()
+    conn.close()
+
+    if row:
+        db_closing, db_bid, db_count, db_desc, db_status = row
+        print(f" DB closing_time: {db_closing or 'EMPTY'}")
+        print(f" DB current_bid: {db_bid or 'EMPTY'}")
+        print(f" DB bid_count: {db_count}")
+        print(f" DB description: {db_desc[:50] if db_desc else 'EMPTY'}...")
+        print(f" DB status: {db_status or 'EMPTY'}")
+
+        # Verify DB matches page_data
+        if db_closing != page_data.get('closing_time'):
+            print(f" [WARN] DB closing_time doesn't match page_data")
+        if db_count != page_data.get('bid_count'):
+            print(f" [WARN] DB bid_count doesn't match page_data")
+    else:
+        print(f" [WARN] No database entry found")
+
+    print(f"\n" + "="*60)
+    if missing_fields or empty_fields:
+        print(f"[FAIL] Missing fields: {', '.join(missing_fields)}")
+        print(f"[FAIL] Empty fields: {', '.join(empty_fields)}")
+        if optional_missing:
+            print(f"[WARN] Optional missing: {', '.join(optional_missing)}")
+        return False
+    else:
+        print("[PASS] All required fields are populated")
+        if optional_missing:
+            print(f"[WARN] Optional missing: {', '.join(optional_missing)}")
+        return True
+
+
+async def test_lot_with_description():
+    """Test that a lot with description preserves it"""
+
+    print("\n" + "="*60)
+    print("TEST: Lot with description")
+    print("="*60)
+
+    # Use a lot known to have description
+    test_url = "https://www.troostwijkauctions.com/l/used-dometic-seastar-tfxchx8641p-top-mount-engine-control-liver-A1-39684-12"
+
+    config.OFFLINE = False
+
+    scraper = TroostwijkScraper()
+    scraper.offline = False
+
+    print(f"\n[1] Scraping: {test_url}")
+
+    from playwright.async_api import async_playwright
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        context = await browser.new_context()
+        page = await context.new_page()
+
+        page_data = await scraper.crawl_page(page, test_url)
+
+        await browser.close()
+
+    if not page_data:
+        print(" [FAIL] No data returned")
+        return False
+
+    print(f"\n[2] Checking description...")
+    description = page_data.get('description', '')
+
+    if not description or description == '':
+        print(f" [FAIL] Description is empty")
+        return False
+    else:
+        print(f" [PASS] Description: {description[:100]}...")
+        return True
+
+
+async def main():
+    """Run all tests"""
+    print("\n" + "="*60)
+    print("MISSING FIELDS TEST SUITE")
+    print("="*60)
+
+    test1 = await test_lot_has_all_fields()
+    test2 = await test_lot_with_description()
+
+    print("\n" + "="*60)
+    if test1 and test2:
+        print("ALL TESTS PASSED")
+    else:
+        print("SOME TESTS FAILED")
+        if not test1:
+            print(" - test_lot_has_all_fields FAILED")
+        if not test2:
+            print(" - test_lot_with_description FAILED")
+    print("="*60 + "\n")
+
+    return 0 if (test1 and test2) else 1
+
+
+if __name__ == '__main__':
+    exit_code = asyncio.run(main())
+    sys.exit(exit_code)