enrich data

2025-12-07 01:59:45 +01:00
parent d09ee5574f
commit 08bf112c3f
9 changed files with 1750 additions and 32 deletions
--- a/search_cached_viewing.py
+++ b/search_cached_viewing.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+"""Search cached pages for viewing/pickup text"""
+import sqlite3
+import zlib
+import re
+
+conn = sqlite3.connect('/mnt/okcomputer/output/cache.db')
+
+cursor = conn.execute("""
+    SELECT url, content
+    FROM cache
+    WHERE url LIKE '%/l/%'
+    ORDER BY timestamp DESC
+    LIMIT 20
+""")
+
+for url, content_blob in cursor:
+    try:
+        content = zlib.decompress(content_blob).decode('utf-8')
+
+        # Look for viewing/pickup patterns
+        if 'bezichtig' in content.lower() or 'ophalen' in content.lower():
+            print(f"\n{'='*60}")
+            print(f"URL: {url}")
+            print(f"{'='*60}")
+
+            # Extract sections with context
+            patterns = [
+                (r'(Bezichtigingen?.*?(?:\n.*?){0,5})', 'VIEWING'),
+                (r'(Ophalen.*?(?:\n.*?){0,5})', 'PICKUP'),
+            ]
+
+            for pattern, label in patterns:
+                matches = re.findall(pattern, content, re.IGNORECASE | re.DOTALL)
+                if matches:
+                    print(f"\n{label}:")
+                    for match in matches[:1]:  # First match
+                        # Clean up HTML
+                        clean = re.sub(r'<[^>]+>', ' ', match)
+                        clean = re.sub(r'\s+', ' ', clean).strip()
+                        print(f"  {clean[:200]}")
+
+            break  # Found one, that's enough
+    except:
+        continue
+
+conn.close()