enrich data

This commit is contained in:
Tour
2025-12-07 01:59:45 +01:00
parent d09ee5574f
commit 08bf112c3f
9 changed files with 1750 additions and 32 deletions

47
search_cached_viewing.py Normal file
View File

@@ -0,0 +1,47 @@
#!/usr/bin/env python3
"""Search cached listing pages for viewing ("bezichtiging") / pickup ("ophalen") text.

Scans the 20 most recent listing pages ('/l/' URLs) in the local cache DB,
decompresses each page blob, and prints a cleaned-up excerpt of the first
viewing/pickup section found.  Stops after the first page that contains
either keyword.
"""
import re
import sqlite3
import zlib
from contextlib import closing

# Compile patterns once, outside the row loop (they were rebuilt per row).
# Each section pattern captures the heading plus up to 5 following lines.
SECTION_PATTERNS = [
    (re.compile(r'(Bezichtigingen?.*?(?:\n.*?){0,5})', re.IGNORECASE | re.DOTALL), 'VIEWING'),
    (re.compile(r'(Ophalen.*?(?:\n.*?){0,5})', re.IGNORECASE | re.DOTALL), 'PICKUP'),
]
TAG_RE = re.compile(r'<[^>]+>')  # strip HTML tags
WS_RE = re.compile(r'\s+')       # collapse whitespace runs

# closing() guarantees conn.close() even if an exception escapes the loop
# (the original only closed on the success path).
with closing(sqlite3.connect('/mnt/okcomputer/output/cache.db')) as conn:
    cursor = conn.execute("""
    SELECT url, content
    FROM cache
    WHERE url LIKE '%/l/%'
    ORDER BY timestamp DESC
    LIMIT 20
    """)
    for url, content_blob in cursor:
        # Keep the try body minimal: only decompression/decoding can
        # legitimately fail per-row.  The original bare `except:` around the
        # whole body silently hid every error, including KeyboardInterrupt.
        try:
            content = zlib.decompress(content_blob).decode('utf-8')
        except (zlib.error, UnicodeDecodeError, TypeError):
            continue  # corrupt or non-compressed blob — skip this row
        lowered = content.lower()  # lowercase once, not once per keyword
        if 'bezichtig' not in lowered and 'ophalen' not in lowered:
            continue
        print(f"\n{'='*60}")
        print(f"URL: {url}")
        print(f"{'='*60}")
        # Extract each labelled section with some trailing context.
        for pattern, label in SECTION_PATTERNS:
            matches = pattern.findall(content)
            if matches:
                print(f"\n{label}:")
                for match in matches[:1]:  # first match only
                    # Clean up HTML: drop tags, then squash whitespace.
                    clean = WS_RE.sub(' ', TAG_RE.sub(' ', match)).strip()
                    print(f"  {clean[:200]}")
        break  # Found one matching page, that's enough