enrich data
This commit is contained in:
47
search_cached_viewing.py
Normal file
47
search_cached_viewing.py
Normal file
@@ -0,0 +1,47 @@
|
||||
#!/usr/bin/env python3
"""Search cached pages for viewing/pickup text.

Queries the ``cache`` table of the SQLite cache database for up to 20 rows
whose URL contains ``/l/`` (ordered by ``timestamp`` descending), decompresses
each page, and prints a short VIEWING ("bezichtiging") and PICKUP ("ophalen")
snippet for the first page that mentions either term.
"""
import sqlite3
import zlib
import re

CACHE_DB_PATH = '/mnt/okcomputer/output/cache.db'

# Maximum number of characters printed per snippet.
SNIPPET_LIMIT = 200

# Each pattern captures the remainder of the keyword's line plus up to five
# following lines.  Greedy ``[^\n]*`` is used on purpose: a lazy ``.*?`` at the
# end of a pattern matches as little as possible and would capture only the
# keyword itself, with no context.  ``Bezichtiging(?:en)?`` accepts both the
# singular and the plural form.
_SECTION_PATTERNS = (
    (re.compile(r'Bezichtiging(?:en)?[^\n]*(?:\n[^\n]*){0,5}', re.IGNORECASE),
     'VIEWING'),
    (re.compile(r'Ophalen[^\n]*(?:\n[^\n]*){0,5}', re.IGNORECASE),
     'PICKUP'),
)
_TAG_RE = re.compile(r'<[^>]+>')
_WHITESPACE_RE = re.compile(r'\s+')


def mentions_viewing_or_pickup(content):
    """Return True if *content* contains 'bezichtig' or 'ophalen' (any case)."""
    lowered = content.lower()
    return 'bezichtig' in lowered or 'ophalen' in lowered


def extract_sections(content):
    """Return ``(label, snippet)`` pairs for the sections found in *content*.

    For each known section pattern, only the first match is used.  HTML tags
    in the matched text are replaced by spaces, runs of whitespace are
    collapsed to a single space, and the result is truncated to
    ``SNIPPET_LIMIT`` characters.  Patterns without a match are omitted.
    """
    sections = []
    for pattern, label in _SECTION_PATTERNS:
        match = pattern.search(content)
        if match is None:
            continue
        text = _TAG_RE.sub(' ', match.group(0))
        text = _WHITESPACE_RE.sub(' ', text).strip()
        sections.append((label, text[:SNIPPET_LIMIT]))
    return sections


def main():
    """Print viewing/pickup snippets for the first matching cached page."""
    conn = sqlite3.connect(CACHE_DB_PATH)
    try:
        cursor = conn.execute("""
            SELECT url, content
            FROM cache
            WHERE url LIKE '%/l/%'
            ORDER BY timestamp DESC
            LIMIT 20
        """)

        for url, content_blob in cursor:
            try:
                content = zlib.decompress(content_blob).decode('utf-8')
            except (zlib.error, UnicodeDecodeError, TypeError):
                # Best effort: skip rows whose blob is NULL, not valid zlib
                # data, or not valid UTF-8, instead of aborting the search.
                continue

            if not mentions_viewing_or_pickup(content):
                continue

            print(f"\n{'='*60}")
            print(f"URL: {url}")
            print(f"{'='*60}")

            for label, snippet in extract_sections(content):
                print(f"\n{label}:")
                print(f" {snippet}")

            # NOTE(review): the original indentation was lost; this assumes
            # the "Found one, that's enough" break ends the page loop after
            # the first matching page -- confirm against the original file.
            break
    finally:
        # Close the connection even if the query or the printing fails.
        conn.close()


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user