enrich data
This commit is contained in:
45
extract_viewing_from_html.py
Normal file
45
extract_viewing_from_html.py
Normal file
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Find viewing/pickup in actual HTML"""
|
||||
import asyncio
|
||||
from playwright.async_api import async_playwright
|
||||
import re
|
||||
|
||||
# Lot page used by default; chosen because it should list viewing times.
LOT_URL = "https://www.troostwijkauctions.com/l/woonunit-type-tp-4-b-6m-nr-102-A1-37889-102"


def _print_first_match(lines, keywords, label):
    """Print the first line containing any keyword, with surrounding context.

    The comparison is case-insensitive on the line side, so *keywords* are
    expected to be lowercase. The context is one line before the match and
    up to four lines after it.

    Args:
        lines: Page text already split into lines.
        keywords: Lowercase substrings to look for.
        label: Section name shown in the "FOUND ..." header.

    Returns:
        True if a matching line was found and printed, False otherwise.
    """
    for idx, line in enumerate(lines):
        lowered = line.lower()
        if any(word in lowered for word in keywords):
            print(f"FOUND {label}:")
            # Slicing clamps the upper bound, so only the lower one needs max().
            for ctx_line in lines[max(0, idx - 1):idx + 5]:
                print(f" {ctx_line}")
            print()
            return True
    return False


async def main(url=LOT_URL):
    """Load a lot page in headless Chromium and print viewing/pickup text.

    Fetches the rendered ``document.body.innerText`` of *url*, then prints the
    first line mentioning a viewing ("bezichtig"/"viewing") and the first line
    mentioning a pickup ("ophalen"/"collection"/"pickup"), each with a few
    lines of context.

    Args:
        url: Lot page to inspect. Defaults to ``LOT_URL``.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            # Try a lot that should have viewing times
            await page.goto(url, wait_until='networkidle')

            # Get text content
            text_content = await page.evaluate("document.body.innerText")
        finally:
            # Close the browser even when navigation or evaluation fails.
            await browser.close()

    print("Searching for viewing/pickup patterns...\n")

    lines = text_content.split('\n')

    # Look for "Bezichtigingen" section
    _print_first_match(lines, ('bezichtig', 'viewing'), "Bezichtigingen")

    # Look for "Ophalen" section
    _print_first_match(lines, ('ophalen', 'collection', 'pickup'), "Ophalen")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user