- Hardened the GraphQL client to reduce 403 occurrences and provide clearer diagnostics when they appear.
- Improved per-lot download logging to show incremental, in-place progress and a concise summary of what was downloaded.
### Details
1) Test case for 403 and investigation
- New test file: `test/test_graphql_403.py`.
- Uses `importlib` to load `src/config.py` and `src/graphql_client.py` directly so it’s independent of sys.path quirks.
- Mocks `aiohttp.ClientSession` to always return HTTP 403 with a short message and monkeypatches `builtins.print` to capture logs.
- Verifies that `fetch_lot_bidding_data("A1-40179-35")` returns `None` (no crash) and that a clear `GraphQL API error: 403` line is logged.
- Result: `pytest test/test_graphql_403.py -q` passes locally.
- Root cause insights (from investigation and log improvements):
- The 403s come from the GraphQL endpoint, not from the HTML page. They are most likely caused by WAF/CDN protections that reject requests which do not look like they come from a browser, or that arrive in sudden bursts.
- To mitigate, I added realistic headers (User-Agent, Origin, Referer) and a tiny retry with backoff for 403/429 to handle transient protection triggers. When 403 persists, we now log the status and a safe, truncated snippet of the body for troubleshooting.
2) Incremental/in-place logging for downloads
- Updated `src/scraper.py` image download section to:
- Show in-place progress: `Downloading images: X/N` updated live as each image finishes.
- After completion, print: `Downloaded: K/N new images`.
- Also list the indexes of images that were actually downloaded (first 20, then `(+M more)` if applicable), so you see exactly what was fetched for the lot.
3) GraphQL client improvements
- Updated `src/graphql_client.py`:
- Added browser-like headers and contextual Referer.
- Added small retry with backoff for 403/429.
- Improved error logs to include status, lot id, and a short body snippet.
### How your example logs will look now
For a lot where GraphQL returns 403:
```
Fetching lot data from API (concurrent)...
GraphQL API error: 403 (lot=A1-40179-35) — Forbidden by WAF
```
For image downloads:
```
Images: 6
Downloading images: 0/6
... 6/6
Downloaded: 6/6 new images
Indexes: 0, 1, 2, 3, 4, 5
```
(When all cached: `All 6 images already cached`)
### Notes
- A full test run surfaced a pre-existing import error in `test/test_scraper.py`, which is unrelated to these changes. The targeted 403 test passes and validates the error-handling/logging path we changed.
- If you want, I can extend the logging to include a short list of image URLs in addition to indexes.
143 lines
4.8 KiB
Python
143 lines
4.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test module for debugging extraction patterns
|
|
"""
|
|
|
|
import sys
|
|
import time
|
|
import re
|
|
import json
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import config
|
|
from cache import CacheManager
|
|
from scraper import TroostwijkScraper
|
|
|
|
|
|
def test_extraction(
        test_url: str = "https://www.troostwijkauctions.com/a/machines-en-toebehoren-%28hout-en-kunststofverwerking-handlingapparatuur-bouwmachines-landbouwindustrie%29-oost-europa-december-A7-35847") -> None:
    """Run the scraper's page parser on a cached URL and print a debug report.

    The report has four parts: cache metadata, the extracted fields,
    validation checks for lot pages, and raw HTML/JSON snippets that help
    when tuning the extraction patterns.

    Args:
        test_url: URL to look up in the scraper cache. Nothing is fetched
            here; if the URL is not cached, up to ten recently cached URLs
            are listed instead and the function returns.

    Returns:
        None. All output goes to stdout.
    """
    scraper = TroostwijkScraper()

    # Only cached content is inspected.
    cached = scraper.cache.get(test_url)
    if not cached:
        print(f"ERROR: URL not found in cache: {test_url}")
        print("\nAvailable cached URLs:")
        try:
            for url in scraper.cache.get_recent_cached_urls(limit=10):
                print(f" - {url}")
        except Exception as e:
            # Best effort: listing alternatives must never mask the real
            # "not in cache" message printed above.
            print(f" (failed to list recent cached URLs: {e})")
        return

    content = cached['content']
    _print_banner(f"TESTING EXTRACTION FROM: {test_url}")
    print(f"Content length: {len(content)} chars")
    print(f"Cache age: {(time.time() - cached['timestamp']) / 3600:.1f} hours")

    # Run the same parser the scraper itself uses.
    page_data = scraper._parse_page(content, test_url)

    _print_banner("EXTRACTED DATA:")
    if not page_data:
        print("ERROR: No data extracted!")
        return

    print(f"Page Type: {page_data.get('type', 'UNKNOWN')}")
    print()
    _print_extracted_fields(page_data)

    _print_banner("VALIDATION CHECKS:")
    issues = []
    if page_data.get('type') == 'lot':
        issues = _validate_lot_fields(page_data)

    if issues:
        print("\n[ISSUES FOUND]")
        for issue in issues:
            print(f" {issue}")
    else:
        print("\n[ALL FIELDS EXTRACTED SUCCESSFULLY!]")

    _print_banner("DEBUG: RAW HTML SNIPPETS")
    _print_debug_snippets(content)


def _print_banner(title: str) -> None:
    """Print *title* framed by 60-character '=' rules, preceded by a blank line."""
    rule = '=' * 60
    print(f"\n{rule}")
    print(title)
    print(rule)


def _print_extracted_fields(page_data) -> None:
    """Print every key/value pair of *page_data*, summarising long collections."""
    for key, value in page_data.items():
        if key == 'images':
            # Tolerate a key that is present but holds None.
            images = value or []
            print(f"{key:.<20}: {len(images)} images")
            for img in images[:3]:
                print(f"{'':.<20} - {img}")
        elif key == 'lots':
            print(f"{key:.<20}: {len(value or [])} lots in auction")
        else:
            display_value = str(value)[:100] if value else "(empty)"
            # Handle Unicode characters that Windows console can't display
            try:
                print(f"{key:.<20}: {display_value}")
            except UnicodeEncodeError:
                safe_value = display_value.encode('ascii', 'replace').decode('ascii')
                print(f"{key:.<20}: {safe_value}")


def _validate_lot_fields(page_data) -> list:
    """Check the key lot fields and return a list of issue messages.

    A field is reported as an issue when it is missing (``None``) or equals
    one of its placeholder values (a bare label or an empty string).
    Fields that pass are reported with an ``[OK]`` line on stdout.
    """
    checks = (
        ('current_bid', "Current bid", ('Huidig bod', 'Current bid', '€0', '')),
        ('location', "Location", ('Locatie', 'Location', '')),
        ('title', "Title", ('', '...')),
    )
    issues = []
    for field, label, placeholders in checks:
        value = page_data.get(field)
        # A missing field must count as a failure; None is not in the
        # placeholder tuples, so it has to be tested explicitly.
        if value is None or value in placeholders:
            issues.append(f"[!] {label} not extracted correctly")
        else:
            shown = str(value)[:50] if field == 'title' else value
            print(f"[OK] {label} looks valid:", shown)
    return issues


def _print_debug_snippets(content: str) -> None:
    """Print raw snippets of *content* that are relevant to bid/location parsing."""
    # 1. Euro amounts near the top of the page; the capture group makes
    #    findall return just the amount text.
    print("\n1. Bid patterns in content:")
    bid_matches = re.findall(r'.{0,50}(€[\d,.\s]+).{0,50}', content[:10000])
    for i, match in enumerate(bid_matches[:5], 1):
        print(f" {i}. {match}")

    # 2. Location labels with surrounding context. The group is
    #    non-capturing so findall returns the whole match; a capturing group
    #    would return only the keyword and drop the context.
    print("\n2. Location patterns in content:")
    loc_matches = re.findall(r'.{0,30}(?:Locatie|Location).{0,100}', content, re.IGNORECASE)
    for i, match in enumerate(loc_matches[:5], 1):
        print(f" {i}. ...{match}...")

    # 3. Key/value fragments of embedded JSON.
    print("\n3. JSON/Script data containing auction info:")
    json_patterns = [
        r'"currentBid"[^,}]+',
        r'"location"[^,}]+',
        r'"price"[^,}]+',
        r'"addressLocality"[^,}]+',
    ]
    for pattern in json_patterns:
        matches = re.findall(pattern, content[:50000], re.IGNORECASE)
        if matches:
            print(f" {pattern}: {matches[:3]}")

    # 4. JSON-LD structured data blocks, pretty-printed when they parse.
    script_matches = re.findall(
        r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
        content, re.DOTALL)
    if script_matches:
        print("\n4. Structured data (JSON-LD) found:")
        for i, script in enumerate(script_matches[:2], 1):
            try:
                data = json.loads(script)
            except ValueError:
                # json.JSONDecodeError subclasses ValueError; fall back to
                # the raw text so malformed blocks are still visible.
                print(f" Script {i}: {script[:300]}...")
            else:
                print(f" Script {i}: {json.dumps(data, indent=6)[:500]}...")
|