enrich data

This commit is contained in:
Tour
2025-12-07 01:26:48 +01:00
parent bb7f4bbe9d
commit d09ee5574f
14 changed files with 1221 additions and 7 deletions

49
inspect_lot_html.py Normal file
View File

@@ -0,0 +1,49 @@
#!/usr/bin/env python3
"""Inspect a lot page HTML to find viewing_time and pickup_date"""
import asyncio
from playwright.async_api import async_playwright
# Lot page inspected when no URL is supplied (the known reference lot).
DEFAULT_LOT_URL = "https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102"


async def main(url: str = DEFAULT_LOT_URL) -> None:
    """Load a lot page in headless Chromium and print viewing/pickup snippets.

    The rendered HTML is searched with a handful of regular expressions
    (Dutch and English labels followed by a ``DD mon YYYY van HH:MM tot HH:MM``
    range, plus a ``Status`` label).  For every pattern that matches, up to
    three captures are printed, each truncated to 200 characters.  Afterwards
    the first tag-delimited fragment following ``Bezichtigingen`` and
    ``Ophalen`` is printed, truncated to 500 characters.

    Args:
        url: Address of the lot page to inspect.  Defaults to
            ``DEFAULT_LOT_URL`` so that ``main()`` keeps inspecting the same
            lot as before.
    """
    import re  # kept local so this function does not depend on file-level imports

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url, wait_until='networkidle')
            content = await page.content()
        finally:
            # Close the browser even when navigation fails or times out.
            await browser.close()

    print("Searching for patterns...")
    print("="*60)

    # Date/time range shared by all viewing/pickup patterns,
    # e.g. "07 dec 2025 van 10:00 tot 12:00".
    when = r'(\d{2}\s+\w{3}\s+\d{4}\s+van\s+\d{2}:\d{2}\s+tot\s+\d{2}:\d{2})'
    patterns = {
        'Bezichtigingen': r'Bezichtigingen.*?' + when,
        'viewing': r'(?i)viewing.*?' + when,
        'Ophalen': r'Ophalen.*?' + when,
        'pickup': r'(?i)pickup.*?' + when,
        'Status': r'Status\s+([^<]+)',
    }

    for name, pattern in patterns.items():
        # DOTALL lets ".*?" span the line breaks between a label and its date.
        # (The former MULTILINE flag was a no-op: no pattern uses ^ or $.)
        matches = re.findall(pattern, content, re.DOTALL)
        if matches:
            print(f"\n{name}:")
            for match in matches[:3]:
                print(f" {match[:200]}")

    # Also look for structured data: dump the raw markup right after each label.
    for label in ('Bezichtigingen', 'Ophalen'):
        print(f"\n\nSearching for '{label}' section:")
        section = re.search(label + r'.*?<.*?>(.*?)</.*?>', content, re.DOTALL)
        if section:
            print(section.group(0)[:500])
if __name__ == "__main__":
    # Executed as a script: run the inspection coroutine to completion.
    asyncio.run(main())