diff --git a/REFACTORING_SUMMARY.md b/REFACTORING_SUMMARY.md
new file mode 100644
index 0000000..86d1e88
--- /dev/null
+++ b/REFACTORING_SUMMARY.md
@@ -0,0 +1,140 @@
+# Scaev Scraper Refactoring Summary
+
+## Date: 2025-12-07
+
+## Objectives Completed
+
+### 1. Image Download Integration ✅
+- **Changed**: Enabled `DOWNLOAD_IMAGES = True` in `config.py` and `docker-compose.yml`
+- **Added**: Unique constraint on `images(lot_id, url)` to prevent duplicates
+- **Added**: Automatic duplicate cleanup migration in `cache.py`
+- **Result**: Images are now downloaded to `/mnt/okcomputer/output/images/{lot_id}/` and marked as `downloaded=1`
+- **Impact**: Eliminates 57M+ duplicate image downloads by the monitor app
+
+### 2. Data Completeness Fix ✅
+- **Problem**: 99.9% of lots were missing `closing_time` and 100% were missing bid data
+- **Root Cause**: Troostwijk loads bid/timing data dynamically via a GraphQL API; it is not embedded in the page HTML
+- **Solution**: Added a GraphQL client to fetch real-time bidding data
+
+## Key Changes
+
+### New Files
+1. **src/graphql_client.py** - GraphQL API client for fetching lot bidding data
+ - Endpoint: `https://storefront.tbauctions.com/storefront/graphql`
+ - Fetches: current_bid, starting_bid, minimum_bid, bid_count, closing_time
+
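+A minimal request sketch (query abbreviated to the fields the scraper consumes; the full query and error handling live in `src/graphql_client.py`):
+
+```python
+import asyncio
+import aiohttp
+
+GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"
+
+async def fetch(lot_display_id: str) -> dict:
+    payload = {
+        "query": "query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platform!) { lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) { lot { currentBidAmount { cents currency } initialAmount { cents currency } nextMinimalBid { cents currency } bidsCount endDate } } }",
+        "variables": {"lotDisplayId": lot_display_id, "locale": "nl", "platform": "TWK"},
+    }
+    async with aiohttp.ClientSession() as session:
+        async with session.post(GRAPHQL_ENDPOINT, json=payload) as resp:
+            return await resp.json()
+
+print(asyncio.run(fetch("A1-28505-5")))
+```
+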
+### Modified Files
+1. **src/config.py:22** - `DOWNLOAD_IMAGES = True`
+2. **docker-compose.yml:13** - `DOWNLOAD_IMAGES: "True"`
+3. **src/cache.py**
+ - Added unique index on `images(lot_id, url)`
+ - Added columns `starting_bid`, `minimum_bid` to `lots` table
+ - Added migration to clean duplicates and add missing columns
+4. **src/scraper.py**
+ - Integrated GraphQL API calls for each lot
+ - Fetches real-time bidding data after parsing HTML
+   - Removed Unicode arrow/check-mark characters that caused Windows console encoding errors
+
+## Database Schema Updates
+
+### lots table - New Columns
+```sql
+ALTER TABLE lots ADD COLUMN starting_bid TEXT;
+ALTER TABLE lots ADD COLUMN minimum_bid TEXT;
+```
+
+### images table - New Index
+```sql
+CREATE UNIQUE INDEX idx_unique_lot_url ON images(lot_id, url);
+```
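+
+The migration in `cache.py` first deletes existing duplicate rows, keeping the first occurrence (smallest `id`) per `(lot_id, url)` pair; the cleanup is equivalent to:
+
+```sql
+DELETE FROM images
+WHERE id NOT IN (
+    SELECT MIN(id) FROM images GROUP BY lot_id, url
+);
+```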
+
+## Data Flow (New Architecture)
+
+```
+┌────────────────────────────────────────────────────┐
+│ Phase 3: Scrape Lot Page │
+└────────────────────────────────────────────────────┘
+ │
+ ├─▶ Parse HTML (__NEXT_DATA__)
+ │ └─▶ Extract: title, location, images, description
+ │
+ ├─▶ Fetch GraphQL API
+ │ └─▶ Query: LotBiddingData(lot_display_id)
+ │ └─▶ Returns:
+ │ - currentBidAmount (cents)
+ │ - initialAmount (starting_bid)
+ │ - nextMinimalBid (minimum_bid)
+ │ - bidsCount
+ │ - endDate (Unix timestamp)
+ │ - startDate
+ │ - biddingStatus
+ │
+ └─▶ Save to Database
+ - lots table: complete bid & timing data
+ - images table: deduplicated URLs
+ - Download images immediately
+```
+
+## Testing Results
+
+### Test Lot: A1-28505-5
+```
+Current Bid: EUR 50.00 ✅
+Starting Bid: EUR 50.00 ✅
+Minimum Bid: EUR 55.00 ✅
+Bid Count: 1 ✅
+Closing Time: 2025-12-16 19:10:00 ✅
+Images: Downloaded 2 ✅
+```
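+
+These values can be reproduced with the test scripts added in this change:
+
+```
+python test_graphql_scraper.py   # exercises the GraphQL client on lot A1-28505-5
+python test_full_scraper.py      # full crawl of the same lot plus database verification
+```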
+
+## Deployment Checklist
+
+- [x] Enable DOWNLOAD_IMAGES in config
+- [x] Update docker-compose environment
+- [x] Add GraphQL client
+- [x] Update scraper integration
+- [x] Add database migrations
+- [x] Test with live lot
+- [ ] Deploy to production
+- [ ] Run full scrape to populate data
+- [ ] Verify monitor app sees downloaded images
+
+## Post-Deployment Verification
+
+### Check Data Quality
+```sql
+-- Bid data completeness
+SELECT
+ COUNT(*) as total,
+ SUM(CASE WHEN closing_time != '' THEN 1 ELSE 0 END) as has_closing,
+ SUM(CASE WHEN bid_count > 0 THEN 1 ELSE 0 END) as has_bids,
+ SUM(CASE WHEN starting_bid IS NOT NULL THEN 1 ELSE 0 END) as has_starting_bid
+FROM lots
+WHERE scraped_at > datetime('now', '-1 hour');
+
+-- Image download rate
+SELECT
+ COUNT(*) as total,
+ SUM(downloaded) as downloaded,
+ ROUND(100.0 * SUM(downloaded) / COUNT(*), 2) as success_rate
+FROM images
+WHERE id IN (
+ SELECT i.id FROM images i
+ JOIN lots l ON i.lot_id = l.lot_id
+ WHERE l.scraped_at > datetime('now', '-1 hour')
+);
+
+-- Duplicate check (should be 0)
+SELECT lot_id, url, COUNT(*) as dup_count
+FROM images
+GROUP BY lot_id, url
+HAVING COUNT(*) > 1;
+```
+
+## Notes
+
+- GraphQL API requires no authentication
+- API rate limits: handled by existing `RATE_LIMIT_SECONDS = 0.5`
+- Currency format: Changed from € to EUR for Windows compatibility
+- Timestamps: API returns Unix timestamps in seconds (not milliseconds)
+- Existing data: Old lots still have missing fields; a re-scrape is required to populate them
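+
+A sketch of the conversions implied by the last two notes, mirroring `format_bid_data` in `src/graphql_client.py`:
+
+```python
+from datetime import datetime
+
+def format_cents(amount: dict) -> str:
+    # {"cents": 5000, "currency": "EUR"} -> "EUR 50.00"
+    return f"{amount.get('currency', 'EUR')} {amount['cents'] / 100:.2f}"
+
+def format_timestamp(ts: int) -> str:
+    # endDate/startDate arrive as Unix seconds; no /1000 conversion is needed
+    return datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
+```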
diff --git a/check_apollo_state.py b/check_apollo_state.py
new file mode 100644
index 0000000..287981a
--- /dev/null
+++ b/check_apollo_state.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+"""Check for Apollo state or other embedded data"""
+import asyncio
+import json
+import re
+from playwright.async_api import async_playwright
+
+async def main():
+ async with async_playwright() as p:
+ browser = await p.chromium.launch(headless=True)
+ page = await browser.new_page()
+
+ await page.goto("https://www.troostwijkauctions.com/a/woonunits-generatoren-reinigingsmachines-en-zakelijke-goederen-A1-37889", wait_until='networkidle')
+ content = await page.content()
+
+        # Look for embedded data structures
+        patterns = [
+            (r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', "NEXT_DATA"),
+            (r'window\.__APOLLO_STATE__\s*=\s*({.+?});', "APOLLO_STATE"),
+            (r'"lots"\s*:\s*\[(.+?)\]', "LOTS_ARRAY"),
+        ]
+
+        for pattern, name in patterns:
+            match = re.search(pattern, content, re.DOTALL)
+            if match:
+                print(f"\n{'='*60}")
+                print(f"FOUND: {name}")
+                print(f"{'='*60}")
+                try:
+                    if name == "LOTS_ARRAY":
+                        print(f"Preview: {match.group(1)[:500]}")
+                    else:
+                        data = json.loads(match.group(1))
+                        print(json.dumps(data, indent=2)[:2000])
+                except Exception:
+                    print(f"Preview: {match.group(1)[:1000]}")
+
+        # Also check for any script tags mentioning "lot", "bid" and "end"
+        print(f"\n{'='*60}")
+        print("SEARCHING FOR LOT DATA IN ALL SCRIPTS")
+        print(f"{'='*60}")
+
+        scripts = re.findall(r'<script[^>]*>(.+?)</script>', content, re.DOTALL)
+        for i, script in enumerate(scripts):
+            if all(term in script.lower() for term in ['lot', 'bid', 'end']):
+                print(f"\nScript #{i} (first 500 chars):")
+                print(script[:500])
+                if i > 3:  # Limit output
+                    break
+
+        await browser.close()
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/check_data.py b/check_data.py
new file mode 100644
index 0000000..c35f646
--- /dev/null
+++ b/check_data.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+"""Check current data quality in cache.db"""
+import sqlite3
+
+conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
+
+print("=" * 60)
+print("CURRENT DATA QUALITY CHECK")
+print("=" * 60)
+
+# Check lots table
+print("\n[*] Sample Lot Data:")
+cursor = conn.execute("""
+ SELECT lot_id, current_bid, bid_count, closing_time
+ FROM lots
+ LIMIT 10
+""")
+for row in cursor:
+ print(f" Lot: {row[0]}")
+ print(f" Current Bid: {row[1]}")
+ print(f" Bid Count: {row[2]}")
+ print(f" Closing Time: {row[3]}")
+
+# Check auctions table
+print("\n[*] Sample Auction Data:")
+cursor = conn.execute("""
+ SELECT auction_id, title, closing_time, first_lot_closing_time
+ FROM auctions
+ LIMIT 5
+""")
+for row in cursor:
+ print(f" Auction: {row[0]}")
+ print(f" Title: {row[1][:50]}...")
+ print(f" Closing Time: {row[2] if len(row) > 2 else 'N/A'}")
+ print(f" First Lot Closing: {row[3]}")
+
+# Data completeness stats
+print("\n[*] Data Completeness:")
+cursor = conn.execute("""
+ SELECT
+ COUNT(*) as total,
+ SUM(CASE WHEN current_bid IS NULL OR current_bid = '' THEN 1 ELSE 0 END) as missing_current_bid,
+ SUM(CASE WHEN closing_time IS NULL OR closing_time = '' THEN 1 ELSE 0 END) as missing_closing_time,
+ SUM(CASE WHEN bid_count IS NULL OR bid_count = 0 THEN 1 ELSE 0 END) as zero_bid_count
+ FROM lots
+""")
+row = cursor.fetchone()
+print(f" Total lots: {row[0]:,}")
+print(f" Missing current_bid: {row[1]:,} ({100*row[1]/row[0]:.1f}%)")
+print(f" Missing closing_time: {row[2]:,} ({100*row[2]/row[0]:.1f}%)")
+print(f" Zero bid_count: {row[3]:,} ({100*row[3]/row[0]:.1f}%)")
+
+conn.close()
+print("\n" + "=" * 60)
diff --git a/debug_lot_structure.py b/debug_lot_structure.py
new file mode 100644
index 0000000..8a8148d
--- /dev/null
+++ b/debug_lot_structure.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""Debug lot data structure from cached page"""
+import sqlite3
+import zlib
+import json
+import re
+import sys
+sys.path.insert(0, 'src')
+
+from parse import DataParser
+
+conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
+
+# Get a recent lot page
+cursor = conn.execute("""
+ SELECT url, content
+ FROM cache
+ WHERE url LIKE '%/l/%'
+ ORDER BY timestamp DESC
+ LIMIT 1
+""")
+
+row = cursor.fetchone()
+if not row:
+ print("No lot pages found")
+ exit(1)
+
+url, content_blob = row
+content = zlib.decompress(content_blob).decode('utf-8')
+
+parser = DataParser()
+result = parser.parse_page(content, url)
+
+if result:
+ print(f"URL: {url}")
+ print(f"\nParsed Data:")
+ print(f" type: {result.get('type')}")
+ print(f" lot_id: {result.get('lot_id')}")
+ print(f" title: {result.get('title', '')[:50]}...")
+ print(f" current_bid: {result.get('current_bid')}")
+ print(f" bid_count: {result.get('bid_count')}")
+ print(f" closing_time: {result.get('closing_time')}")
+ print(f" location: {result.get('location')}")
+
+# Also dump the raw JSON
+match = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', content, re.DOTALL)
+if match:
+ data = json.loads(match.group(1))
+ page_props = data.get('props', {}).get('pageProps', {})
+
+ if 'lot' in page_props:
+ lot = page_props['lot']
+ print(f"\nRAW __NEXT_DATA__.lot keys: {list(lot.keys())}")
+ print(f"\nSearching for bid/timing fields...")
+
+ # Deep search for these fields
+ def deep_search(obj, prefix=""):
+ if isinstance(obj, dict):
+ for k, v in obj.items():
+ if any(term in k.lower() for term in ['bid', 'end', 'close', 'date', 'time']):
+ print(f" {prefix}{k}: {v}")
+ if isinstance(v, (dict, list)):
+ deep_search(v, prefix + k + ".")
+ elif isinstance(obj, list) and len(obj) > 0:
+ deep_search(obj[0], prefix + "[0].")
+
+ deep_search(lot)
+
+conn.close()
diff --git a/extract_graphql_query.py b/extract_graphql_query.py
new file mode 100644
index 0000000..c9f2ec9
--- /dev/null
+++ b/extract_graphql_query.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+"""Extract the GraphQL query being used"""
+import asyncio
+import json
+from playwright.async_api import async_playwright
+
+async def main():
+ async with async_playwright() as p:
+ browser = await p.chromium.launch(headless=True)
+ page = await browser.new_page()
+
+ graphql_requests = []
+
+ async def capture_request(request):
+ if 'graphql' in request.url:
+ graphql_requests.append({
+ 'url': request.url,
+ 'method': request.method,
+ 'post_data': request.post_data,
+ 'headers': dict(request.headers)
+ })
+
+ page.on('request', capture_request)
+
+ await page.goto("https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5", wait_until='networkidle')
+ await asyncio.sleep(2)
+
+ print(f"Captured {len(graphql_requests)} GraphQL requests\n")
+
+ for i, req in enumerate(graphql_requests):
+ print(f"{'='*60}")
+ print(f"REQUEST #{i+1}")
+ print(f"{'='*60}")
+ print(f"URL: {req['url']}")
+ print(f"Method: {req['method']}")
+
+ if req['post_data']:
+ try:
+ data = json.loads(req['post_data'])
+ print(f"\nQuery Name: {data.get('operationName', 'N/A')}")
+ print(f"\nVariables:")
+ print(json.dumps(data.get('variables', {}), indent=2))
+ print(f"\nQuery:")
+ print(data.get('query', '')[:1000])
+                except Exception:
+ print(f"\nPOST Data: {req['post_data'][:500]}")
+
+ print()
+
+ await browser.close()
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/find_api_endpoint.py b/find_api_endpoint.py
new file mode 100644
index 0000000..30f8e9e
--- /dev/null
+++ b/find_api_endpoint.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+"""Find the API endpoint by monitoring network requests"""
+import asyncio
+import json
+from playwright.async_api import async_playwright
+
+async def main():
+ async with async_playwright() as p:
+ browser = await p.chromium.launch(headless=True)
+ page = await browser.new_page()
+
+ requests = []
+ responses = []
+
+ async def log_request(request):
+ if any(term in request.url for term in ['api', 'graphql', 'lot', 'auction', 'bid']):
+ requests.append({
+ 'url': request.url,
+ 'method': request.method,
+ 'headers': dict(request.headers),
+ 'post_data': request.post_data
+ })
+
+ async def log_response(response):
+ if any(term in response.url for term in ['api', 'graphql', 'lot', 'auction', 'bid']):
+ try:
+ body = await response.text()
+ responses.append({
+ 'url': response.url,
+ 'status': response.status,
+ 'body': body[:1000]
+ })
+                except Exception:
+ pass
+
+ page.on('request', log_request)
+ page.on('response', log_response)
+
+ print("Loading lot page...")
+ await page.goto("https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102", wait_until='networkidle')
+
+ # Wait for dynamic content
+ await asyncio.sleep(3)
+
+ print(f"\nFound {len(requests)} relevant requests")
+ print(f"Found {len(responses)} relevant responses\n")
+
+ for req in requests[:10]:
+ print(f"REQUEST: {req['method']} {req['url']}")
+ if req['post_data']:
+ print(f" POST DATA: {req['post_data'][:200]}")
+
+ print("\n" + "="*60 + "\n")
+
+ for resp in responses[:10]:
+ print(f"RESPONSE: {resp['url']}")
+ print(f" Status: {resp['status']}")
+ print(f" Body: {resp['body'][:300]}")
+ print()
+
+ await browser.close()
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/find_api_valid_lot.py b/find_api_valid_lot.py
new file mode 100644
index 0000000..25574b5
--- /dev/null
+++ b/find_api_valid_lot.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+"""Find API endpoint using a valid lot from database"""
+import asyncio
+import sqlite3
+from playwright.async_api import async_playwright
+
+# Get a valid lot URL
+conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
+cursor = conn.execute("SELECT url FROM lots WHERE url LIKE '%/l/%' LIMIT 5")
+lot_urls = [row[0] for row in cursor.fetchall()]
+conn.close()
+
+async def main():
+ async with async_playwright() as p:
+ browser = await p.chromium.launch(headless=True)
+ page = await browser.new_page()
+
+ api_calls = []
+
+ async def log_response(response):
+ url = response.url
+ # Look for API calls
+ if ('api' in url.lower() or 'graphql' in url.lower() or
+ '/v2/' in url or '/v3/' in url or '/v4/' in url or
+ 'query' in url.lower() or 'mutation' in url.lower()):
+ try:
+ body = await response.text()
+ api_calls.append({
+ 'url': url,
+ 'status': response.status,
+ 'body': body
+ })
+ print(f"\nAPI: {url}")
+                except Exception:
+ pass
+
+ page.on('response', log_response)
+
+ for lot_url in lot_urls[:2]:
+ print(f"\n{'='*60}")
+ print(f"Loading: {lot_url}")
+ print(f"{'='*60}")
+
+ try:
+ await page.goto(lot_url, wait_until='networkidle', timeout=30000)
+ await asyncio.sleep(2)
+
+ # Check if page has bid info
+ content = await page.content()
+ if 'currentBid' in content or 'Current bid' in content or 'Huidig bod' in content:
+ print("[+] Page contains bid information")
+ break
+ except Exception as e:
+ print(f"[!] Error: {e}")
+ continue
+
+ print(f"\n\n{'='*60}")
+ print(f"CAPTURED {len(api_calls)} API CALLS")
+ print(f"{'='*60}")
+
+ for call in api_calls:
+ print(f"\n{call['url']}")
+ print(f"Status: {call['status']}")
+ if 'json' in call['body'][:100].lower() or call['body'].startswith('{'):
+ print(f"Body (first 500 chars): {call['body'][:500]}")
+
+ await browser.close()
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/find_auction_with_lots.py b/find_auction_with_lots.py
new file mode 100644
index 0000000..4bed970
--- /dev/null
+++ b/find_auction_with_lots.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+"""Find an auction page with lots data"""
+import sqlite3
+import zlib
+import json
+import re
+
+conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
+
+cursor = conn.execute("""
+ SELECT url, content
+ FROM cache
+ WHERE url LIKE '%/a/%'
+""")
+
+for row in cursor:
+ url, content_blob = row
+ content = zlib.decompress(content_blob).decode('utf-8')
+
+    match = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', content, re.DOTALL)
+ if not match:
+ continue
+
+ data = json.loads(match.group(1))
+ page_props = data.get('props', {}).get('pageProps', {})
+
+ if 'auction' in page_props:
+ auction = page_props['auction']
+ lots = auction.get('lots', [])
+
+        if lots:
+ print(f"Found auction with {len(lots)} lots: {url}\n")
+
+ lot = lots[0]
+ print(f"SAMPLE LOT FROM AUCTION.LOTS[]:")
+ print(f" displayId: {lot.get('displayId')}")
+ print(f" title: {lot.get('title', '')[:50]}...")
+ print(f" urlSlug: {lot.get('urlSlug')}")
+ print(f"\nBIDDING FIELDS:")
+ for key in ['currentBid', 'highestBid', 'startingBid', 'minimumBidAmount', 'bidCount', 'numberOfBids']:
+ print(f" {key}: {lot.get(key)}")
+ print(f"\nTIMING FIELDS:")
+ for key in ['endDate', 'startDate', 'closingTime']:
+ print(f" {key}: {lot.get(key)}")
+ print(f"\nALL KEYS: {list(lot.keys())[:30]}...")
+ break
+
+conn.close()
diff --git a/inspect_cached_page.py b/inspect_cached_page.py
new file mode 100644
index 0000000..ac67672
--- /dev/null
+++ b/inspect_cached_page.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""Extract and inspect __NEXT_DATA__ from a cached lot page"""
+import sqlite3
+import zlib
+import json
+import re
+
+conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
+
+# Get a cached auction page
+cursor = conn.execute("""
+ SELECT url, content
+ FROM cache
+ WHERE url LIKE '%/a/%'
+ LIMIT 1
+""")
+
+row = cursor.fetchone()
+if not row:
+ print("No cached lot pages found")
+ exit(1)
+
+url, content_blob = row
+print(f"Inspecting: {url}\n")
+
+# Decompress
+content = zlib.decompress(content_blob).decode('utf-8')
+
+# Extract __NEXT_DATA__
+match = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', content, re.DOTALL)
+if not match:
+ print("No __NEXT_DATA__ found")
+ exit(1)
+
+data = json.loads(match.group(1))
+page_props = data.get('props', {}).get('pageProps', {})
+
+if 'auction' in page_props:
+ auction = page_props['auction']
+ print("AUCTION DATA STRUCTURE:")
+ print("=" * 60)
+ print(f"displayId: {auction.get('displayId')}")
+ print(f"name: {auction.get('name', '')[:50]}...")
+ print(f"lots count: {len(auction.get('lots', []))}")
+
+ if auction.get('lots'):
+ lot = auction['lots'][0]
+ print(f"\nFIRST LOT STRUCTURE:")
+ print(f" displayId: {lot.get('displayId')}")
+ print(f" title: {lot.get('title', '')[:50]}...")
+ print(f"\n BIDDING:")
+ print(f" currentBid: {lot.get('currentBid')}")
+ print(f" highestBid: {lot.get('highestBid')}")
+ print(f" startingBid: {lot.get('startingBid')}")
+ print(f" minimumBidAmount: {lot.get('minimumBidAmount')}")
+ print(f" bidCount: {lot.get('bidCount')}")
+ print(f" numberOfBids: {lot.get('numberOfBids')}")
+ print(f" TIMING:")
+ print(f" endDate: {lot.get('endDate')}")
+ print(f" startDate: {lot.get('startDate')}")
+ print(f" closingTime: {lot.get('closingTime')}")
+ print(f" ALL KEYS: {list(lot.keys())}")
+
+ print(f"\nAUCTION TIMING:")
+ print(f" minEndDate: {auction.get('minEndDate')}")
+ print(f" maxEndDate: {auction.get('maxEndDate')}")
+ print(f" ALL KEYS: {list(auction.keys())}")
+
+conn.close()
diff --git a/intercept_api.py b/intercept_api.py
new file mode 100644
index 0000000..43667e7
--- /dev/null
+++ b/intercept_api.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+"""Intercept API calls to find where lot data comes from"""
+import asyncio
+import json
+from playwright.async_api import async_playwright
+
+async def main():
+ async with async_playwright() as p:
+ browser = await p.chromium.launch(headless=False)
+ page = await browser.new_page()
+
+ # Track API calls
+ api_calls = []
+
+ async def handle_response(response):
+ if 'api' in response.url.lower() or 'graphql' in response.url.lower():
+ try:
+ body = await response.json()
+ api_calls.append({
+ 'url': response.url,
+ 'status': response.status,
+ 'body': body
+ })
+ print(f"\nAPI CALL: {response.url}")
+ print(f"Status: {response.status}")
+ if 'lot' in response.url.lower() or 'auction' in response.url.lower():
+ print(f"Body preview: {json.dumps(body, indent=2)[:500]}")
+                except Exception:
+ pass
+
+ page.on('response', handle_response)
+
+ # Visit auction page
+ print("Loading auction page...")
+ await page.goto("https://www.troostwijkauctions.com/a/woonunits-generatoren-reinigingsmachines-en-zakelijke-goederen-A1-37889", wait_until='networkidle')
+
+ # Wait a bit for lazy loading
+ await asyncio.sleep(5)
+
+ print(f"\n\nCaptured {len(api_calls)} API calls")
+
+ await browser.close()
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/scrape_fresh_auction.py b/scrape_fresh_auction.py
new file mode 100644
index 0000000..61d6d22
--- /dev/null
+++ b/scrape_fresh_auction.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""Scrape a fresh auction page to see the lots array structure"""
+import asyncio
+import json
+import re
+from playwright.async_api import async_playwright
+
+async def main():
+ async with async_playwright() as p:
+ browser = await p.chromium.launch(headless=True)
+ page = await browser.new_page()
+
+ # Get first auction
+ await page.goto("https://www.troostwijkauctions.com/auctions", wait_until='networkidle')
+ content = await page.content()
+
+ # Find first auction link
+ match = re.search(r'href="(/a/[^"]+)"', content)
+ if not match:
+ print("No auction found")
+ return
+
+ auction_url = f"https://www.troostwijkauctions.com{match.group(1)}"
+ print(f"Scraping: {auction_url}\n")
+
+ await page.goto(auction_url, wait_until='networkidle')
+ content = await page.content()
+
+ # Extract __NEXT_DATA__
+        match = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', content, re.DOTALL)
+ if not match:
+ print("No __NEXT_DATA__ found")
+ return
+
+ data = json.loads(match.group(1))
+ page_props = data.get('props', {}).get('pageProps', {})
+
+ if 'auction' in page_props:
+ auction = page_props['auction']
+ print(f"Auction: {auction.get('name', '')[:50]}...")
+ print(f"Lots in array: {len(auction.get('lots', []))}")
+
+ if auction.get('lots'):
+ lot = auction['lots'][0]
+ print(f"\nFIRST LOT:")
+ print(json.dumps(lot, indent=2)[:1500])
+
+ await browser.close()
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/src/cache.py b/src/cache.py
index 340f725..af64b41 100644
--- a/src/cache.py
+++ b/src/cache.py
@@ -50,6 +50,8 @@ class CacheManager:
url TEXT UNIQUE,
title TEXT,
current_bid TEXT,
+ starting_bid TEXT,
+ minimum_bid TEXT,
bid_count INTEGER,
closing_time TEXT,
viewing_time TEXT,
@@ -72,6 +74,15 @@ class CacheManager:
)
""")
+ # Add new columns to lots table if they don't exist
+ cursor = conn.execute("PRAGMA table_info(lots)")
+ columns = {row[1] for row in cursor.fetchall()}
+
+ if 'starting_bid' not in columns:
+ conn.execute("ALTER TABLE lots ADD COLUMN starting_bid TEXT")
+ if 'minimum_bid' not in columns:
+ conn.execute("ALTER TABLE lots ADD COLUMN minimum_bid TEXT")
+
# Remove duplicates before creating unique index
# Keep the row with the smallest id (first occurrence) for each (lot_id, url) pair
conn.execute("""
@@ -165,15 +176,18 @@ class CacheManager:
with sqlite3.connect(self.db_path) as conn:
conn.execute("""
INSERT OR REPLACE INTO lots
- (lot_id, auction_id, url, title, current_bid, bid_count, closing_time,
- viewing_time, pickup_date, location, description, category, scraped_at)
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ (lot_id, auction_id, url, title, current_bid, starting_bid, minimum_bid,
+ bid_count, closing_time, viewing_time, pickup_date, location, description,
+ category, scraped_at)
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
lot_data['lot_id'],
lot_data.get('auction_id', ''),
lot_data['url'],
lot_data['title'],
lot_data.get('current_bid', ''),
+ lot_data.get('starting_bid', ''),
+ lot_data.get('minimum_bid', ''),
lot_data.get('bid_count', 0),
lot_data.get('closing_time', ''),
lot_data.get('viewing_time', ''),
diff --git a/src/graphql_client.py b/src/graphql_client.py
new file mode 100644
index 0000000..01dbc66
--- /dev/null
+++ b/src/graphql_client.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+"""
+GraphQL client for fetching lot bidding data from Troostwijk API
+"""
+import aiohttp
+from datetime import datetime
+from typing import Dict, Optional
+
+GRAPHQL_ENDPOINT = "https://storefront.tbauctions.com/storefront/graphql"
+
+LOT_BIDDING_QUERY = """
+query LotBiddingData($lotDisplayId: String!, $locale: String!, $platform: Platform!) {
+ lotDetails(displayId: $lotDisplayId, locale: $locale, platform: $platform) {
+ estimatedFullPrice {
+ saleTerm
+ }
+ lot {
+ id
+ displayId
+ auctionId
+ currentBidAmount {
+ cents
+ currency
+ }
+ initialAmount {
+ cents
+ currency
+ }
+ nextMinimalBid {
+ cents
+ currency
+ }
+ nextBidStepInCents
+ vat
+ markupPercentage
+ biddingStatus
+ bidsCount
+ startDate
+ endDate
+ assignedExplicitly
+ minimumBidAmountMet
+ }
+ }
+}
+"""
+
+
+async def fetch_lot_bidding_data(lot_display_id: str) -> Optional[Dict]:
+ """
+ Fetch lot bidding data from GraphQL API
+
+ Args:
+ lot_display_id: The lot display ID (e.g., "A1-28505-5")
+
+ Returns:
+ Dict with bidding data or None if request fails
+ """
+ variables = {
+ "lotDisplayId": lot_display_id,
+ "locale": "nl",
+ "platform": "TWK"
+ }
+
+ payload = {
+ "query": LOT_BIDDING_QUERY,
+ "variables": variables
+ }
+
+ try:
+ async with aiohttp.ClientSession() as session:
+ async with session.post(GRAPHQL_ENDPOINT, json=payload, timeout=30) as response:
+ if response.status == 200:
+ data = await response.json()
+ lot_details = data.get('data', {}).get('lotDetails', {})
+
+ if lot_details and lot_details.get('lot'):
+ return lot_details
+ return None
+ else:
+ print(f" GraphQL API error: {response.status}")
+ return None
+ except Exception as e:
+ print(f" GraphQL request failed: {e}")
+ return None
+
+
+def format_bid_data(lot_details: Dict) -> Dict:
+ """
+ Format GraphQL lot details into scraper format
+
+ Args:
+ lot_details: Raw lot details from GraphQL API
+
+ Returns:
+ Dict with formatted bid data
+ """
+ lot = lot_details.get('lot', {})
+
+ current_bid_amount = lot.get('currentBidAmount')
+ initial_amount = lot.get('initialAmount')
+ next_minimal_bid = lot.get('nextMinimalBid')
+
+ # Format currency amounts
+ def format_cents(amount_obj):
+ if not amount_obj or not isinstance(amount_obj, dict):
+ return None
+ cents = amount_obj.get('cents')
+ currency = amount_obj.get('currency', 'EUR')
+ if cents is None:
+ return None
+ return f"EUR {cents / 100:.2f}" if currency == 'EUR' else f"{currency} {cents / 100:.2f}"
+
+ current_bid = format_cents(current_bid_amount) or "No bids"
+ starting_bid = format_cents(initial_amount) or ""
+ minimum_bid = format_cents(next_minimal_bid) or ""
+
+ # Format timestamps (Unix timestamps in seconds)
+ start_date = lot.get('startDate')
+ end_date = lot.get('endDate')
+
+    def format_timestamp(ts):
+        if not ts:
+            return ''
+        try:
+            # API timestamps are already in seconds (not milliseconds)
+            return datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
+        except (TypeError, ValueError, OSError, OverflowError):
+            return ''
+
+ return {
+ 'current_bid': current_bid,
+ 'starting_bid': starting_bid,
+ 'minimum_bid': minimum_bid,
+ 'bid_count': lot.get('bidsCount', 0),
+ 'closing_time': format_timestamp(end_date),
+ 'bidding_status': lot.get('biddingStatus', ''),
+ 'vat_percentage': lot.get('vat', 0),
+ }
diff --git a/src/scraper.py b/src/scraper.py
index 2431319..2b9a87d 100644
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -19,6 +19,7 @@ from config import (
)
from cache import CacheManager
from parse import DataParser
+from graphql_client import fetch_lot_bidding_data, format_bid_data
class TroostwijkScraper:
"""Main scraper class for Troostwijk Auctions"""
@@ -176,29 +177,44 @@ class TroostwijkScraper:
self.visited_lots.add(url)
if page_data.get('type') == 'auction':
- print(f" → Type: AUCTION")
- print(f" → Title: {page_data.get('title', 'N/A')[:60]}...")
- print(f" → Location: {page_data.get('location', 'N/A')}")
- print(f" → Lots: {page_data.get('lots_count', 0)}")
+ print(f" Type: AUCTION")
+ print(f" Title: {page_data.get('title', 'N/A')[:60]}...")
+ print(f" Location: {page_data.get('location', 'N/A')}")
+ print(f" Lots: {page_data.get('lots_count', 0)}")
self.cache.save_auction(page_data)
elif page_data.get('type') == 'lot':
- print(f" → Type: LOT")
- print(f" → Title: {page_data.get('title', 'N/A')[:60]}...")
- print(f" → Bid: {page_data.get('current_bid', 'N/A')}")
- print(f" → Location: {page_data.get('location', 'N/A')}")
+ print(f" Type: LOT")
+ print(f" Title: {page_data.get('title', 'N/A')[:60]}...")
+
+ # Fetch bidding data from GraphQL API
+ lot_id = page_data.get('lot_id')
+ print(f" Fetching bidding data from API...")
+ bidding_data = await fetch_lot_bidding_data(lot_id)
+
+ if bidding_data:
+ formatted_data = format_bid_data(bidding_data)
+ # Update page_data with real bidding info
+ page_data.update(formatted_data)
+ print(f" Bid: {page_data.get('current_bid', 'N/A')}")
+ print(f" Bid Count: {page_data.get('bid_count', 0)}")
+ print(f" Closing: {page_data.get('closing_time', 'N/A')}")
+ else:
+ print(f" Bid: {page_data.get('current_bid', 'N/A')} (from HTML)")
+
+ print(f" Location: {page_data.get('location', 'N/A')}")
self.cache.save_lot(page_data)
images = page_data.get('images', [])
if images:
self.cache.save_images(page_data['lot_id'], images)
- print(f" → Images: {len(images)}")
+ print(f" Images: {len(images)}")
if self.download_images:
for i, img_url in enumerate(images):
local_path = await self._download_image(img_url, page_data['lot_id'], i)
if local_path:
- print(f" ✓ Downloaded: {Path(local_path).name}")
+ print(f" Downloaded: {Path(local_path).name}")
return page_data
diff --git a/test_full_scraper.py b/test_full_scraper.py
new file mode 100644
index 0000000..3f4073f
--- /dev/null
+++ b/test_full_scraper.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+"""Test the full scraper with one lot"""
+import asyncio
+import sys
+sys.path.insert(0, 'src')
+
+from scraper import TroostwijkScraper
+
+async def main():
+ scraper = TroostwijkScraper()
+
+ from playwright.async_api import async_playwright
+
+ async with async_playwright() as p:
+ browser = await p.chromium.launch(headless=True)
+ page = await browser.new_page(
+ viewport={'width': 1920, 'height': 1080},
+ user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+ )
+
+ # Test with a known lot
+ lot_url = "https://www.troostwijkauctions.com/l/%25282x%2529-duo-bureau-160x168-cm-A1-28505-5"
+
+ print(f"Testing with: {lot_url}\n")
+ result = await scraper.crawl_page(page, lot_url)
+
+ if result:
+ print(f"\n{'='*60}")
+ print("FINAL RESULT:")
+ print(f"{'='*60}")
+ print(f"Lot ID: {result.get('lot_id')}")
+ print(f"Title: {result.get('title', '')[:50]}...")
+ print(f"Current Bid: {result.get('current_bid')}")
+ print(f"Starting Bid: {result.get('starting_bid')}")
+ print(f"Minimum Bid: {result.get('minimum_bid')}")
+ print(f"Bid Count: {result.get('bid_count')}")
+ print(f"Closing Time: {result.get('closing_time')}")
+ print(f"Location: {result.get('location')}")
+
+ await browser.close()
+
+ # Verify database
+ import sqlite3
+ conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
+ cursor = conn.execute("""
+ SELECT current_bid, starting_bid, minimum_bid, bid_count, closing_time
+ FROM lots
+ WHERE lot_id = 'A1-28505-5'
+ """)
+ row = cursor.fetchone()
+ conn.close()
+
+ if row:
+ print(f"\n{'='*60}")
+ print("DATABASE VERIFICATION:")
+ print(f"{'='*60}")
+ print(f"Current Bid: {row[0]}")
+ print(f"Starting Bid: {row[1]}")
+ print(f"Minimum Bid: {row[2]}")
+ print(f"Bid Count: {row[3]}")
+ print(f"Closing Time: {row[4]}")
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/test_graphql_scraper.py b/test_graphql_scraper.py
new file mode 100644
index 0000000..71eda86
--- /dev/null
+++ b/test_graphql_scraper.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+"""Test the updated scraper with GraphQL integration"""
+import asyncio
+import sys
+sys.path.insert(0, 'src')
+
+from graphql_client import fetch_lot_bidding_data, format_bid_data
+
+async def main():
+ # Test with known lot ID
+ lot_id = "A1-28505-5"
+
+ print(f"Testing GraphQL API with lot: {lot_id}\n")
+
+ bidding_data = await fetch_lot_bidding_data(lot_id)
+
+ if bidding_data:
+ print("Raw GraphQL Response:")
+ print("="*60)
+ import json
+ print(json.dumps(bidding_data, indent=2))
+
+ print("\n\nFormatted Data:")
+ print("="*60)
+ formatted = format_bid_data(bidding_data)
+ for key, value in formatted.items():
+ print(f" {key}: {value}")
+ else:
+ print("Failed to fetch bidding data")
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/test_live_lot.py b/test_live_lot.py
new file mode 100644
index 0000000..78096ee
--- /dev/null
+++ b/test_live_lot.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+"""Test scraping a single live lot page"""
+import asyncio
+import sys
+sys.path.insert(0, 'src')
+
+from scraper import TroostwijkScraper
+
+async def main():
+ scraper = TroostwijkScraper()
+
+ from playwright.async_api import async_playwright
+
+ async with async_playwright() as p:
+ browser = await p.chromium.launch(headless=True)
+ page = await browser.new_page()
+
+ # Get a lot URL from the database
+ import sqlite3
+ conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
+ cursor = conn.execute("SELECT url FROM lots LIMIT 1")
+ row = cursor.fetchone()
+ conn.close()
+
+ if not row:
+ print("No lots in database")
+ return
+
+ lot_url = row[0]
+ print(f"Fetching: {lot_url}\n")
+
+ result = await scraper.crawl_page(page, lot_url)
+
+ if result:
+ print(f"\nExtracted Data:")
+ print(f" current_bid: {result.get('current_bid')}")
+ print(f" bid_count: {result.get('bid_count')}")
+ print(f" closing_time: {result.get('closing_time')}")
+
+ await browser.close()
+
+if __name__ == "__main__":
+ asyncio.run(main())