GraphQL integrate, data correctness
This commit is contained in:
51
scrape_fresh_auction.py
Normal file
51
scrape_fresh_auction.py
Normal file
@@ -0,0 +1,51 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Scrape a fresh auction page to see the lots array structure"""
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
async def main():
    """Scrape the first listed auction and print the shape of its lots array.

    Steps:
      1. Load the auctions index page and take the first ``/a/...`` link
         found in the rendered HTML.
      2. Load that auction page and pull the JSON payload out of the
         ``<script id="__NEXT_DATA__">`` tag.
      3. Print the auction name (first 50 characters), the number of entries
         in its ``lots`` array and the first lot as indented JSON, truncated
         to 1500 characters.

    Prints a short message and returns early when no auction link or no
    ``__NEXT_DATA__`` script is found. Returns ``None`` in every case.

    Raises:
        json.JSONDecodeError: if the ``__NEXT_DATA__`` payload is not valid
            JSON.
    """
    base_url = "https://www.troostwijkauctions.com"

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # try/finally so the browser is closed on the early returns below and
        # when navigation or JSON parsing raises, not only on success.
        try:
            page = await browser.new_page()

            # Get first auction
            await page.goto(f"{base_url}/auctions", wait_until='networkidle')
            content = await page.content()

            # Find first auction link
            match = re.search(r'href="(/a/[^"]+)"', content)
            if not match:
                print("No auction found")
                return

            auction_url = f"{base_url}{match.group(1)}"
            print(f"Scraping: {auction_url}\n")

            await page.goto(auction_url, wait_until='networkidle')
            content = await page.content()

            # Extract __NEXT_DATA__
            match = re.search(
                r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>',
                content,
                re.DOTALL,
            )
            if not match:
                print("No __NEXT_DATA__ found")
                return

            data = json.loads(match.group(1))

            # `or {}` / `or []` rather than .get() defaults: a key that is
            # present with a JSON null would otherwise yield None and crash
            # on the next attribute access, slice or len().
            page_props = (data.get('props') or {}).get('pageProps') or {}
            auction = page_props.get('auction')

            if isinstance(auction, dict):
                name = str(auction.get('name') or '')
                lots = auction.get('lots') or []

                print(f"Auction: {name[:50]}...")
                print(f"Lots in array: {len(lots)}")

                if lots:
                    lot = lots[0]
                    print("\nFIRST LOT:")
                    print(json.dumps(lot, indent=2)[:1500])
        finally:
            await browser.close()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the async scraper to completion on a new
    # event loop. Nothing happens when the module is merely imported.
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user