# scaev/search_cached_viewing.py

#!/usr/bin/env python3
"""Search cached pages for viewing/pickup text"""
import sqlite3
import zlib
import re

DB_PATH = '/mnt/okcomputer/output/cache.db'

# The 20 most recently cached pages whose URL contains "/l/".
RECENT_PAGES_SQL = """
    SELECT url, content
    FROM cache
    WHERE url LIKE '%/l/%'
    ORDER BY timestamp DESC
    LIMIT 20
"""

# Lower-case substrings that mark a page as worth reporting.
KEYWORDS = ('bezichtig', 'ophalen')

# Maximum number of characters printed per extracted section.
SNIPPET_LENGTH = 200

# Each pattern captures the keyword, the rest of its line and up to five
# following lines.  DOTALL is deliberately NOT used and the quantifiers are
# greedy: a lazy `.*?` followed by an optional group matches nothing, which
# would reduce the "context" to the bare keyword.
SECTION_PATTERNS = (
    ('VIEWING', re.compile(r'Bezichtiging(?:en)?[^\n]*(?:\n[^\n]*){0,5}',
                           re.IGNORECASE)),
    ('PICKUP', re.compile(r'Ophalen[^\n]*(?:\n[^\n]*){0,5}', re.IGNORECASE)),
)

TAG_RE = re.compile(r'<[^>]+>')
WHITESPACE_RE = re.compile(r'\s+')


def clean_snippet(fragment, max_length=SNIPPET_LENGTH):
    """Return *fragment* with HTML tags removed and whitespace collapsed.

    Tags are replaced by a space so adjacent words do not fuse together; the
    result is stripped and truncated to *max_length* characters.
    """
    without_tags = TAG_RE.sub(' ', fragment)
    return WHITESPACE_RE.sub(' ', without_tags).strip()[:max_length]


def extract_sections(content, max_length=SNIPPET_LENGTH):
    """Extract the first viewing and pickup section from page *content*.

    Returns a list of ``(label, snippet)`` tuples in the order VIEWING,
    PICKUP.  A label is omitted when its keyword does not occur, so the list
    may be empty.
    """
    sections = []
    for label, pattern in SECTION_PATTERNS:
        # Only the first occurrence is reported, so search() is enough.
        match = pattern.search(content)
        if match:
            sections.append((label, clean_snippet(match.group(0), max_length)))
    return sections


def find_listing_with_sections(rows):
    """Return ``(url, sections)`` for the first row mentioning a keyword.

    *rows* is an iterable of ``(url, content_blob)`` pairs where the blob is
    expected to be zlib-compressed UTF-8 text.  Rows whose blob cannot be
    decompressed or decoded are skipped (best effort).  Returns ``None`` when
    no row contains any of ``KEYWORDS``.
    """
    for url, content_blob in rows:
        try:
            content = zlib.decompress(content_blob).decode('utf-8')
        except (zlib.error, UnicodeDecodeError, TypeError):
            # Corrupt, non-UTF-8, NULL or non-bytes content: skip this row.
            continue

        lowered = content.lower()
        if any(keyword in lowered for keyword in KEYWORDS):
            # The first page mentioning a keyword is enough, even if the
            # stricter section patterns find nothing in it.
            return url, extract_sections(content)
    return None


def main(db_path=DB_PATH):
    """Print viewing/pickup text from the most recent matching cached page."""
    conn = sqlite3.connect(db_path)
    try:
        found = find_listing_with_sections(conn.execute(RECENT_PAGES_SQL))
    finally:
        # Close the connection even if the query or row iteration fails.
        conn.close()

    if found is None:
        return

    url, sections = found
    print(f"\n{'='*60}")
    print(f"URL: {url}")
    print(f"{'='*60}")
    for label, snippet in sections:
        print(f"\n{label}:")
        print(f"  {snippet}")


if __name__ == "__main__":
    main()