52 lines
1.6 KiB
Python
52 lines
1.6 KiB
Python
#!/usr/bin/env python3
|
|
"""Scrape a fresh auction page to see the lots array structure"""
|
|
import asyncio
|
|
import json
|
|
import re
|
|
from playwright.async_api import async_playwright
|
|
|
|
_BASE_URL = "https://www.troostwijkauctions.com"


def _find_first_auction_path(html):
    """Return the first ``/a/...`` href found in *html*, or ``None`` if absent."""
    found = re.search(r'href="(/a/[^"]+)"', html)
    return found.group(1) if found else None


def _extract_next_data(html):
    """Return the parsed ``__NEXT_DATA__`` script payload, or ``None`` if absent.

    Raises:
        json.JSONDecodeError: if the script tag is present but its content is
            not valid JSON.
    """
    found = re.search(
        r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', html, re.DOTALL
    )
    return json.loads(found.group(1)) if found else None


async def main():
    """Open the first auction linked from the listing page and dump its first lot.

    Loads the auctions listing in headless Chromium, follows the first
    ``/a/...`` link, reads the page's ``__NEXT_DATA__`` JSON and prints the
    auction name, the number of entries in its ``lots`` array and the first
    lot (pretty-printed, truncated to 1500 characters).

    Prints a short message and returns early when no auction link or no
    ``__NEXT_DATA__`` script is found.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # try/finally so the browser is closed on the early-return paths and
        # on exceptions, not only when the whole scrape succeeds.
        try:
            page = await browser.new_page()

            # Get first auction
            await page.goto(f"{_BASE_URL}/auctions", wait_until='networkidle')
            auction_path = _find_first_auction_path(await page.content())
            if auction_path is None:
                print("No auction found")
                return

            auction_url = f"{_BASE_URL}{auction_path}"
            print(f"Scraping: {auction_url}\n")

            await page.goto(auction_url, wait_until='networkidle')

            # Extract __NEXT_DATA__
            data = _extract_next_data(await page.content())
            if data is None:
                print("No __NEXT_DATA__ found")
                return

            # `or {}` / `or []` / `or ''`: dict.get defaults only cover missing
            # keys, while JSON null decodes to None and would break the chained
            # lookups, the slice and len() below.
            page_props = (data.get('props') or {}).get('pageProps') or {}

            if 'auction' in page_props:
                auction = page_props['auction'] or {}
                lots = auction.get('lots') or []
                print(f"Auction: {(auction.get('name') or '')[:50]}...")
                print(f"Lots in array: {len(lots)}")

                if lots:
                    print("\nFIRST LOT:")
                    print(json.dumps(lots[0], indent=2)[:1500])
        finally:
            await browser.close()
|
|
|
|
# Run the scrape only when this file is executed as a script, so importing
# the module does not launch a browser.
if __name__ == "__main__":
    asyncio.run(main())
|