From e9eb7ea5d9138838860aa5c50bfdc308986626f1 Mon Sep 17 00:00:00 2001 From: mike Date: Sat, 13 Dec 2025 11:35:33 +0100 Subject: [PATCH] fly wa --- app/analysis/folder_analyzer.py | 110 +++++++++ app/enrichment/enricher.py | 59 +++++ app/enrichment/llm_client.py | 54 +++++ app/main.py | 251 ++++++++++++++++++++- app/parsers/code_parser.py | 44 ++++ app/parsers/media_parser.py | 42 ++++ app/parsers/pdf_parser.py | 31 +++ app/parsers/text_parser.py | 26 +++ docker-compose.yml | 31 +++ flyway.conf | 7 + requirements.txt | 2 + sql/init.sql | 176 --------------- sql/migration/V001__init.sql | 188 +++++++++++++++ sql/migration/V002__add_extracted_text.sql | 11 + sql/migration/V003__add_folder_support.sql | 41 ++++ sql/setup_database.sql | 42 +--- 16 files changed, 899 insertions(+), 216 deletions(-) create mode 100644 app/analysis/folder_analyzer.py create mode 100644 app/enrichment/enricher.py create mode 100644 app/enrichment/llm_client.py create mode 100644 app/parsers/code_parser.py create mode 100644 app/parsers/media_parser.py create mode 100644 app/parsers/pdf_parser.py create mode 100644 app/parsers/text_parser.py create mode 100644 flyway.conf delete mode 100644 sql/init.sql create mode 100644 sql/migration/V001__init.sql create mode 100644 sql/migration/V002__add_extracted_text.sql create mode 100644 sql/migration/V003__add_folder_support.sql diff --git a/app/analysis/folder_analyzer.py b/app/analysis/folder_analyzer.py new file mode 100644 index 0000000..74d4e1c --- /dev/null +++ b/app/analysis/folder_analyzer.py @@ -0,0 +1,110 @@ +from pathlib import Path +from typing import Dict, Set, List +from collections import Counter + +class FolderAnalyzer: + def __init__(self): + self.manifest_files = { + 'java': ['pom.xml', 'build.gradle', 'build.gradle.kts'], + 'javascript': ['package.json', 'yarn.lock', 'package-lock.json'], + 'python': ['pyproject.toml', 'setup.py', 'requirements.txt', 'Pipfile'], + 'go': ['go.mod', 'go.sum'], + 'rust': ['Cargo.toml', 'Cargo.lock'], + 'docker': ['Dockerfile', 'docker-compose.yml', 'docker-compose.yaml'], + 'k8s': ['helm', 'kustomization.yaml', 'deployment.yaml'] + } + + self.intent_keywords = { + 'infrastructure': ['infra', 'deploy', 'k8s', 'docker', 'terraform', 'ansible'], + 'application': ['app', 'service', 'api', 'server', 'client'], + 'data': ['data', 'dataset', 'models', 'training', 'ml'], + 'documentation': ['docs', 'documentation', 'wiki', 'readme'], + 'testing': ['test', 'tests', 'spec', 'e2e', 'integration'], + 'build': ['build', 'dist', 'target', 'out', 'bin'], + 'config': ['config', 'conf', 'settings', 'env'] + } + + def analyze_folder(self, folder_path: Path, files: List[Dict]) -> Dict: + files_list = [Path(f['path']) for f in files] + + has_readme = any('readme' in f.name.lower() for f in files_list) + has_git = any('.git' in str(f) for f in files_list) + + manifest_types = self._detect_manifests(files_list) + has_manifest = len(manifest_types) > 0 + + file_types = Counter(f.suffix.lower() for f in files_list if f.suffix) + dominant_types = dict(file_types.most_common(10)) + + intent = self._infer_intent(folder_path.name.lower(), files_list) + project_type = self._infer_project_type(manifest_types, dominant_types) + + structure = { + 'depth': len(folder_path.parts), + 'has_src': any('src' in str(f) for f in files_list[:20]), + 'has_tests': any('test' in str(f) for f in files_list[:20]), + 'has_docs': any('doc' in str(f) for f in files_list[:20]) + } + + return { + 'has_readme': has_readme, + 'has_git': has_git, + 'has_manifest': 
has_manifest, + 'manifest_types': manifest_types, + 'dominant_file_types': dominant_types, + 'project_type': project_type, + 'intent': intent, + 'structure': structure + } + + def _detect_manifests(self, files: List[Path]) -> List[str]: + detected = [] + file_names = {f.name for f in files} + + for tech, manifests in self.manifest_files.items(): + if any(m in file_names for m in manifests): + detected.append(tech) + + return detected + + def _infer_intent(self, folder_name: str, files: List[Path]) -> str: + file_str = ' '.join(str(f) for f in files[:50]) + + for intent, keywords in self.intent_keywords.items(): + if any(kw in folder_name or kw in file_str.lower() for kw in keywords): + return intent + + return 'unknown' + + def _infer_project_type(self, manifests: List[str], file_types: Dict) -> str: + if manifests: + return manifests[0] + + if '.py' in file_types and file_types.get('.py', 0) > 5: + return 'python' + if '.js' in file_types or '.ts' in file_types: + return 'javascript' + if '.java' in file_types: + return 'java' + if '.go' in file_types: + return 'go' + + return 'mixed' + + def generate_summary(self, folder_analysis: Dict, readme_text: str = None) -> str: + parts = [] + + if folder_analysis.get('project_type'): + parts.append(f"{folder_analysis['project_type']} project") + + if folder_analysis.get('intent'): + parts.append(f"for {folder_analysis['intent']}") + + if folder_analysis.get('manifest_types'): + parts.append(f"using {', '.join(folder_analysis['manifest_types'])}") + + if readme_text: + first_para = readme_text.split('\n\n')[0][:200] + parts.append(f"Description: {first_para}") + + return ' '.join(parts) if parts else 'Mixed content folder' diff --git a/app/enrichment/enricher.py b/app/enrichment/enricher.py new file mode 100644 index 0000000..9185ce0 --- /dev/null +++ b/app/enrichment/enricher.py @@ -0,0 +1,59 @@ +from typing import Dict +import re + +class ContentEnricher: + def __init__(self, llm_client=None): + self.llm_client = llm_client + self.pii_patterns = { + 'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', + 'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', + 'ssn': r'\b\d{3}-\d{2}-\d{4}\b', + 'credit_card': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b' + } + + def enrich(self, text: str, use_llm: bool = False) -> Dict: + enrichment = { + 'summary': self._basic_summary(text), + 'word_count': len(text.split()), + 'has_pii': self._detect_pii(text), + 'quality': self._assess_quality(text), + 'topics': self._extract_basic_topics(text) + } + + if use_llm and self.llm_client: + llm_result = self.llm_client.classify_content(text) + if llm_result.get('success'): + enrichment['llm_classification'] = llm_result['text'] + + return enrichment + + def _basic_summary(self, text: str) -> str: + sentences = re.split(r'[.!?]+', text) + return ' '.join(sentences[:3])[:200] + + def _detect_pii(self, text: str) -> Dict: + detected = {} + for pii_type, pattern in self.pii_patterns.items(): + matches = re.findall(pattern, text) + if matches: + detected[pii_type] = len(matches) + return detected + + def _assess_quality(self, text: str) -> str: + if len(text.strip()) < 10: + return 'low' + + special_char_ratio = sum(1 for c in text if not c.isalnum() and not c.isspace()) / len(text) + if special_char_ratio > 0.3: + return 'low' + + return 'high' if len(text.split()) > 50 else 'medium' + + def _extract_basic_topics(self, text: str) -> list: + words = re.findall(r'\b[A-Z][a-z]+\b', text) + word_freq = {} + for word in words: + if len(word) > 3: + word_freq[word] = 
word_freq.get(word, 0) + 1 + + return sorted(word_freq, key=word_freq.get, reverse=True)[:10] diff --git a/app/enrichment/llm_client.py b/app/enrichment/llm_client.py new file mode 100644 index 0000000..526b4a4 --- /dev/null +++ b/app/enrichment/llm_client.py @@ -0,0 +1,54 @@ +import requests +import json +from typing import Dict, Optional + +class LLMClient: + def __init__(self, endpoint: str = 'http://192.168.1.74:1234', model: str = 'local'): + self.endpoint = endpoint + self.model = model + self.local_ollama = 'http://localhost:11434' + + def summarize(self, text: str, max_length: int = 200) -> Dict: + prompt = f"Summarize the following in {max_length} chars or less:\n\n{text[:2000]}" + return self._query(prompt) + + def extract_topics(self, text: str) -> Dict: + prompt = f"Extract 5-10 key topics/tags from this text. Return as comma-separated list:\n\n{text[:2000]}" + return self._query(prompt) + + def classify_content(self, text: str) -> Dict: + prompt = f"Classify this content. Return: category, topics, has_pii (yes/no), quality (high/medium/low):\n\n{text[:1000]}" + return self._query(prompt) + + def _query(self, prompt: str, use_local: bool = False) -> Dict: + try: + endpoint = self.local_ollama if use_local else self.endpoint + + if use_local: + response = requests.post( + f'{endpoint}/api/generate', + json={'model': 'llama3.2', 'prompt': prompt, 'stream': False}, + timeout=30 + ) + else: + response = requests.post( + f'{endpoint}/v1/chat/completions', + json={ + 'model': self.model, + 'messages': [{'role': 'user', 'content': prompt}], + 'max_tokens': 500 + }, + timeout=30 + ) + + if response.status_code == 200: + data = response.json() + if use_local: + return {'success': True, 'text': data.get('response', '')} + else: + return {'success': True, 'text': data['choices'][0]['message']['content']} + else: + return {'success': False, 'error': f'HTTP {response.status_code}'} + + except Exception as e: + return {'success': False, 'error': str(e)} diff --git a/app/main.py b/app/main.py index 545017c..5ac1937 100644 --- a/app/main.py +++ b/app/main.py @@ -27,7 +27,7 @@ class DiskReorganizer: def __init__(self, db_config: Dict=None): if db_config is None: - db_config = {'host': os.getenv('DB_HOST', '192.168.1.159'), 'port': int(os.getenv('DB_PORT', 5432)), 'database': os.getenv('DB_NAME', 'disk_reorganizer_db'), 'user': os.getenv('DB_USER', 'disk_reorg_user'), 'password': os.getenv('DB_PASSWORD', 'heel-goed-wachtwoord')} + db_config = {'host': os.getenv('DB_HOST', '192.168.1.159'), 'port': int(os.getenv('DB_PORT', 5432)), 'database': os.getenv('DB_NAME', 'disk_reorganizer_db'), 'user': os.getenv('DB_USER', 'auction'), 'password': os.getenv('DB_PASSWORD', 'heel-goed-wachtwoord')} self.db_config = db_config self.init_database() @@ -522,23 +522,126 @@ class DiskReorganizer: cursor.close() conn.close() - def classify_files(self, disk: Optional[str]=None, update_db: bool=False): + def parse_files(self, kind: Optional[str] = None, limit: int = 100, update_db: bool = False): + from parsers.text_parser import TextParser + from parsers.code_parser import CodeParser + from parsers.pdf_parser import PDFParser + + parsers = {'text': TextParser(), 'code': CodeParser(), 'pdf': PDFParser()} + disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'} + + conn = self.get_connection() + cursor = conn.cursor() + + try: + query = "SELECT path, size, disk_label FROM files WHERE 1=1" + params = [] + if kind: + suffix_map = {'text': "('.txt', '.md', '.log', '.json')", 
'code': "('.py', '.js', '.java', '.go')", 'pdf': "('.pdf',)"} + if kind in suffix_map: + query += f" AND RIGHT(path, 4) IN {suffix_map[kind]} OR RIGHT(path, 3) IN {suffix_map[kind]}" + query += f" LIMIT {limit}" + + cursor.execute(query, params) + files = cursor.fetchall() + + print(f"\n=== PARSING FILES ===\nProcessing {len(files)} files\n") + + parsed_count = 0 + for path, size, disk_label in files: + mount_point = disk_mount_map.get(disk_label, disk_label) + full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path) + + if not full_path.exists() or int(size) > 10 * 1024 * 1024: + continue + + file_kind = 'pdf' if path.endswith('.pdf') else 'code' if any(path.endswith(e) for e in ['.py', '.js', '.java']) else 'text' + parser = parsers.get(file_kind) + if not parser: + continue + + result = parser.parse(full_path) + if 'error' not in result: + text = result.get('text', '') + quality = result.get('quality', 'unknown') + print(f"{path[:60]} | {file_kind} | {len(text):,} chars") + + if update_db and text: + cursor.execute("UPDATE files SET extracted_text = %s, text_quality = %s WHERE path = %s", (text[:50000], quality, path)) + parsed_count += 1 + if parsed_count % 10 == 0: + conn.commit() + + if update_db: + conn.commit() + print(f"\nParsed {parsed_count} files") + + finally: + cursor.close() + conn.close() + + def enrich_files(self, limit: int = 10, llm_endpoint: str = None, use_local: bool = False): + from enrichment.enricher import ContentEnricher + + enricher = ContentEnricher() + conn = self.get_connection() + cursor = conn.cursor() + + try: + cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL LIMIT {limit}") + files = cursor.fetchall() + + print(f"\n=== ENRICHING CONTENT ===\nProcessing {len(files)} files\n") + + for path, text in files: + enrichment = enricher.enrich(text[:5000], use_llm=False) + print(f"{path[:60]}") + print(f" Quality: {enrichment.get('quality')} | Words: {enrichment.get('word_count'):,}") + print(f" PII: {list(enrichment.get('has_pii', {}).keys())}") + print(f" Topics: {', '.join(enrichment.get('topics', [])[:5])}\n") + + cursor.execute("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", (json.dumps(enrichment), path)) + + conn.commit() + print(f"Enriched {len(files)} files") + + finally: + cursor.close() + conn.close() + + def classify_files(self, disk: Optional[str]=None, update_db: bool=False, resume: bool=True): from classification.classifier import FileClassifier classifier = FileClassifier() conn = self.get_connection() cursor = conn.cursor() try: + task_name = f"classify_{disk or 'all'}" + skip_count = 0 + + if resume and update_db: + cursor.execute('SELECT last_processed_path, processed_count FROM processing_checkpoints WHERE task_name = %s', (task_name,)) + checkpoint = cursor.fetchone() + if checkpoint: + last_path, skip_count = checkpoint + logger.info(f'Resuming from checkpoint: {skip_count:,} files already processed') + if disk: - cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s', (disk,)) + cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s ORDER BY path', (disk,)) else: - cursor.execute('SELECT path, size, disk_label FROM files') + cursor.execute('SELECT path, size, disk_label FROM files ORDER BY path') files = cursor.fetchall() total = len(files) logger.info(f'Classifying {total:,} files...') + categories = {} build_artifacts = 0 batch = [] + processed = 0 + for idx, (path, size, disk_label) in enumerate(files, 1): + if 
idx <= skip_count: + continue + labels, category, is_build = classifier.classify_path(path, int(size)) if is_build: build_artifacts += 1 @@ -546,18 +649,40 @@ class DiskReorganizer: categories[category] = {'count': 0, 'size': 0} categories[category]['count'] += 1 categories[category]['size'] += int(size) + if update_db: labels_str = ','.join(labels) batch.append((category, labels_str, path)) + if len(batch) >= 1000: cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch]) + cursor.execute(''' + INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at) + VALUES (%s, %s, %s, CURRENT_TIMESTAMP) + ON CONFLICT (task_name) DO UPDATE SET + last_processed_path = EXCLUDED.last_processed_path, + processed_count = EXCLUDED.processed_count, + updated_at = CURRENT_TIMESTAMP + ''', (task_name, path, idx)) conn.commit() batch.clear() + + processed += 1 if idx % 1000 == 0: - print(f'\rClassified: {idx:,}/{total:,}', end='', flush=True) + print(f'\rClassified: {idx:,}/{total:,} ({100*idx/total:.1f}%)', end='', flush=True) + if update_db and batch: cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch]) + cursor.execute(''' + INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at) + VALUES (%s, %s, %s, CURRENT_TIMESTAMP) + ON CONFLICT (task_name) DO UPDATE SET + last_processed_path = EXCLUDED.last_processed_path, + processed_count = EXCLUDED.processed_count, + updated_at = CURRENT_TIMESTAMP + ''', (task_name, files[-1][0] if files else '', total)) conn.commit() + print() print(f'\n=== CLASSIFICATION SUMMARY ===') print(f'Total files: {total:,}') @@ -570,6 +695,99 @@ class DiskReorganizer: cursor.close() conn.close() + def analyze_folders(self, disk: Optional[str]=None, min_files: int=3): + from analysis.folder_analyzer import FolderAnalyzer + analyzer = FolderAnalyzer() + conn = self.get_connection() + cursor = conn.cursor() + try: + query = ''' + SELECT DISTINCT SUBSTRING(path FROM 1 FOR POSITION('/' IN path || '/') - 1) as folder, disk_label + FROM files + WHERE 1=1 + ''' + params = [] + if disk: + query += ' AND disk_label = %s' + params.append(disk) + + cursor.execute(query, params) + potential_folders = cursor.fetchall() + + logger.info(f'Found {len(potential_folders)} potential folders to analyze') + + processed = 0 + for folder_name, disk_label in potential_folders: + cursor.execute(''' + SELECT path, size FROM files + WHERE disk_label = %s AND path LIKE %s + ''', (disk_label, f'{folder_name}%')) + + files = cursor.fetchall() + if len(files) < min_files: + continue + + files_list = [{'path': f[0], 'size': int(f[1])} for f in files] + folder_path = Path(folder_name) + + analysis = analyzer.analyze_folder(folder_path, files_list) + + readme_text = None + for file_dict in files_list: + if 'readme' in file_dict['path'].lower(): + readme_text = f"Found README at {file_dict['path']}" + break + + summary = analyzer.generate_summary(analysis, readme_text) + + cursor.execute(''' + INSERT INTO folders (path, disk_label, file_count, total_size, project_type, intent, summary, + has_readme, has_git, has_manifest, manifest_types, dominant_file_types, structure) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ON CONFLICT (path) DO UPDATE SET + file_count = EXCLUDED.file_count, + total_size = EXCLUDED.total_size, + project_type = EXCLUDED.project_type, + intent = EXCLUDED.intent, + summary = EXCLUDED.summary, + has_readme = 
EXCLUDED.has_readme, + has_git = EXCLUDED.has_git, + has_manifest = EXCLUDED.has_manifest, + manifest_types = EXCLUDED.manifest_types, + dominant_file_types = EXCLUDED.dominant_file_types, + structure = EXCLUDED.structure, + updated_at = CURRENT_TIMESTAMP + ''', ( + str(folder_path), disk_label, len(files_list), sum(f['size'] for f in files_list), + analysis.get('project_type'), analysis.get('intent'), summary, + analysis.get('has_readme'), analysis.get('has_git'), analysis.get('has_manifest'), + analysis.get('manifest_types'), json.dumps(analysis.get('dominant_file_types', {})), + json.dumps(analysis.get('structure', {})) + )) + + processed += 1 + if processed % 100 == 0: + conn.commit() + print(f'\rAnalyzed: {processed} folders', end='', flush=True) + + conn.commit() + print() + logger.info(f'Completed folder analysis: {processed} folders') + + cursor.execute(''' + SELECT project_type, COUNT(*), SUM(file_count), SUM(total_size) + FROM folders + GROUP BY project_type + ''') + print(f'\n=== FOLDER ANALYSIS SUMMARY ===') + for row in cursor.fetchall(): + proj_type, count, files, size = row + print(f'{proj_type:20}: {count:6,} folders, {files:8,} files, {self.format_size(int(size or 0))}') + + finally: + cursor.close() + conn.close() + def review_migration(self, category: Optional[str]=None, show_build: bool=False): from classification.classifier import FileClassifier classifier = FileClassifier() @@ -640,9 +858,24 @@ def main(): extract_parser = subparsers.add_parser('extract', help='Extract content from files') extract_parser.add_argument('--kind', help='Extract specific kind (pdf, image, audio, video)') extract_parser.add_argument('--limit', type=int, default=10, help='Limit extraction batch') + + parse_parser = subparsers.add_parser('parse', help='Parse files to extract text') + parse_parser.add_argument('--kind', help='Parse specific kind (text, code, pdf)') + parse_parser.add_argument('--limit', type=int, default=100, help='Limit parse batch') + parse_parser.add_argument('--update', action='store_true', help='Save extracted text to database') + + enrich_parser = subparsers.add_parser('enrich', help='Enrich content with LLM analysis') + enrich_parser.add_argument('--limit', type=int, default=10, help='Limit enrichment batch') + enrich_parser.add_argument('--llm-endpoint', default='http://192.168.1.74:1234', help='LLM endpoint') + enrich_parser.add_argument('--local', action='store_true', help='Use local Ollama') + classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization') classify_parser.add_argument('--disk', help='Classify specific disk') classify_parser.add_argument('--update', action='store_true', help='Update database with classifications') + classify_parser.add_argument('--no-resume', action='store_true', help='Start from scratch instead of resuming') + folders_parser = subparsers.add_parser('analyze-folders', help='Analyze folder structure and infer project intent') + folders_parser.add_argument('--disk', help='Analyze specific disk') + folders_parser.add_argument('--min-files', type=int, default=3, help='Minimum files per folder') review_parser = subparsers.add_parser('review', help='Review proposed migration structure') review_parser.add_argument('--category', help='Review specific category') review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts') @@ -669,8 +902,14 @@ def main(): tool.profile_content(disk=args.disk, update_db=args.update, limit=args.limit) elif args.command == 'extract': 
tool.extract_content(kind=args.kind, limit=args.limit) + elif args.command == 'parse': + tool.parse_files(kind=args.kind, limit=args.limit, update_db=args.update) + elif args.command == 'enrich': + tool.enrich_files(limit=args.limit, llm_endpoint=args.llm_endpoint, use_local=args.local) elif args.command == 'classify': - tool.classify_files(disk=args.disk, update_db=args.update) + tool.classify_files(disk=args.disk, update_db=args.update, resume=not args.no_resume) + elif args.command == 'analyze-folders': + tool.analyze_folders(disk=args.disk, min_files=args.min_files) elif args.command == 'review': tool.review_migration(category=args.category, show_build=args.show_build) elif args.command == 'report': diff --git a/app/parsers/code_parser.py b/app/parsers/code_parser.py new file mode 100644 index 0000000..8a27ae0 --- /dev/null +++ b/app/parsers/code_parser.py @@ -0,0 +1,44 @@ +from pathlib import Path +from typing import Dict +import re + +class CodeParser: + def __init__(self): + self.patterns = { + 'python': {'imports': r'^import |^from .+ import', 'class': r'^class \w+', 'function': r'^def \w+'}, + 'javascript': {'imports': r'^import |^require\(', 'class': r'^class \w+', 'function': r'^function \w+|^const \w+ = '}, + 'java': {'package': r'^package ', 'imports': r'^import ', 'class': r'^public class \w+'}, + 'go': {'package': r'^package ', 'imports': r'^import ', 'function': r'^func \w+'} + } + + def parse(self, file_path: Path) -> Dict: + try: + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + text = f.read() + + language = self._detect_language(file_path, text) + structure = self._extract_structure(text, language) + + return { + 'text': text, + 'language': language, + 'line_count': len(text.split('\n')), + 'structure': structure, + 'quality': 'high' + } + except Exception as e: + return {'error': str(e)} + + def _detect_language(self, file_path: Path, text: str) -> str: + lang_map = {'.py': 'python', '.js': 'javascript', '.ts': 'typescript', '.java': 'java', '.go': 'go'} + return lang_map.get(file_path.suffix.lower(), 'unknown') + + def _extract_structure(self, text: str, language: str) -> Dict: + patterns = self.patterns.get(language, {}) + structure = {'type': 'code', 'language': language} + + for key, pattern in patterns.items(): + matches = re.findall(pattern, text, re.MULTILINE) + structure[key] = len(matches) + + return structure diff --git a/app/parsers/media_parser.py b/app/parsers/media_parser.py new file mode 100644 index 0000000..1384d59 --- /dev/null +++ b/app/parsers/media_parser.py @@ -0,0 +1,42 @@ +from pathlib import Path +from typing import Dict + +class MediaParser: + def parse_audio(self, file_path: Path) -> Dict: + return { + 'text': '[Audio transcription pending]', + 'needs_transcription': True, + 'transcription_service': 'whisper', + 'structure': {'type': 'audio'}, + 'quality': 'pending' + } + + def parse_video(self, file_path: Path) -> Dict: + return { + 'text': '[Video transcription pending]', + 'needs_transcription': True, + 'needs_scene_detection': True, + 'transcription_service': 'whisper', + 'structure': {'type': 'video'}, + 'quality': 'pending' + } + + def parse_image(self, file_path: Path) -> Dict: + try: + from PIL import Image + + with Image.open(file_path) as img: + width, height = img.size + mode = img.mode + + return { + 'text': '[Image caption/OCR pending]', + 'needs_ocr': True, + 'needs_caption': True, + 'dimensions': f'{width}x{height}', + 'mode': mode, + 'structure': {'type': 'image', 'width': width, 'height': height}, + 
'quality': 'pending' + } + except Exception as e: + return {'error': str(e)} diff --git a/app/parsers/pdf_parser.py b/app/parsers/pdf_parser.py new file mode 100644 index 0000000..55f1113 --- /dev/null +++ b/app/parsers/pdf_parser.py @@ -0,0 +1,31 @@ +from pathlib import Path +from typing import Dict, List + +class PDFParser: + def parse(self, file_path: Path) -> Dict: + try: + import PyPDF2 + + pages = [] + with open(file_path, 'rb') as f: + pdf = PyPDF2.PdfReader(f) + page_count = len(pdf.pages) + + for i, page in enumerate(pdf.pages[:50]): + text = page.extract_text() + pages.append({'page': i + 1, 'text': text, 'char_count': len(text)}) + + full_text = '\n\n'.join([p['text'] for p in pages]) + has_text_layer = sum(p['char_count'] for p in pages) > 100 + + return { + 'text': full_text, + 'page_count': page_count, + 'pages_extracted': len(pages), + 'has_text_layer': has_text_layer, + 'needs_ocr': not has_text_layer, + 'structure': {'type': 'document', 'pages': pages[:5]}, + 'quality': 'high' if has_text_layer else 'needs_ocr' + } + except Exception as e: + return {'error': str(e), 'needs_ocr': True} diff --git a/app/parsers/text_parser.py b/app/parsers/text_parser.py new file mode 100644 index 0000000..7a17589 --- /dev/null +++ b/app/parsers/text_parser.py @@ -0,0 +1,26 @@ +from pathlib import Path +from typing import Dict, Optional +import chardet + +class TextParser: + def parse(self, file_path: Path) -> Dict: + try: + with open(file_path, 'rb') as f: + raw_data = f.read(1024 * 1024) + + encoding = chardet.detect(raw_data)['encoding'] or 'utf-8' + text = raw_data.decode(encoding, errors='ignore') + + lines = text.split('\n') + + return { + 'text': text, + 'encoding': encoding, + 'line_count': len(lines), + 'char_count': len(text), + 'word_count': len(text.split()), + 'structure': {'type': 'plain_text'}, + 'quality': 'high' if encoding == 'utf-8' else 'medium' + } + except Exception as e: + return {'error': str(e)} diff --git a/docker-compose.yml b/docker-compose.yml index 87b8696..7754528 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -232,6 +232,37 @@ services: networks: - defrag-network + flyway: + image: flyway/flyway:latest + container_name: flyway + volumes: + - ./sql/migration:/flyway/sql:ro + environment: + FLYWAY_URL: jdbc:postgresql://192.168.1.159:5432/disk_reorganizer_db + FLYWAY_USER: disk_reorg_user + FLYWAY_PASSWORD: heel-goed-wachtwoord + FLYWAY_SCHEMAS: public + FLYWAY_LOCATIONS: filesystem:./sql + FLYWAY_CONNECT_RETRIES: "60" + command: migrate + restart: "no" + + pg_backup: + image: postgres:16 + container_name: pg_backup + environment: + PGPASSWORD: heel-goed-wachtwoord + volumes: + - ./:/backup + command: + - bash + - -lc + - > + pg_dump -h 192.168.1.159 -p 5432 -U disk_reorg_user -d disk_reorganizer_db + --format=custom --no-owner --no-privileges + -f /backup/backup_$(date +%F_%H%M)_disk_reorganizer_db.dump + restart: "no" + networks: defrag-network: driver: bridge diff --git a/flyway.conf b/flyway.conf new file mode 100644 index 0000000..6d73888 --- /dev/null +++ b/flyway.conf @@ -0,0 +1,7 @@ +flyway.url=jdbc:postgresql://192.168.1.159:5432/disk_reorganizer_db +flyway.user=disk_org_user +flyway.password=heel-goed-wachtwoord + +flyway.locations=filesystem:sql/migration +flyway.schemas=public + diff --git a/requirements.txt b/requirements.txt index f838a8a..dc2b5b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,3 +37,5 @@ pytest-cov>=4.0.0 black>=23.0.0 mypy>=1.0.0 flake8>=6.0.0 + +chardet \ No newline at end of file diff --git 
a/sql/init.sql b/sql/init.sql deleted file mode 100644 index 58266c7..0000000 --- a/sql/init.sql +++ /dev/null @@ -1,176 +0,0 @@ --- sql/init.sql --- Initialize PostgreSQL database for Project Defrag - --- Enable useful extensions -CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; -CREATE EXTENSION IF NOT EXISTS "pgcrypto"; - --- Files table -CREATE TABLE IF NOT EXISTS files ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - path TEXT NOT NULL, - size BIGINT NOT NULL, - modified_time TIMESTAMP WITH TIME ZONE, - created_time TIMESTAMP WITH TIME ZONE, - file_hash VARCHAR(64), -- SHA-256 hash - checksum VARCHAR(64), -- Alias for file_hash (legacy compatibility) - category VARCHAR(50), - disk_label VARCHAR(50), - last_verified TIMESTAMP WITH TIME ZONE, - status VARCHAR(20) DEFAULT 'indexed', - duplicate_of TEXT, -- Path to canonical file if this is a duplicate - - -- Metadata - metadata JSONB DEFAULT '{}', - - -- Audit fields - created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, - - -- Constraints - CONSTRAINT unique_file_path UNIQUE(path) -); - --- Operations table (audit log) -CREATE TABLE IF NOT EXISTS operations ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - operation_type VARCHAR(50) NOT NULL, - source_path TEXT, - target_path TEXT, - status VARCHAR(20) NOT NULL, - - -- Legacy compatibility fields - executed INTEGER DEFAULT 0, - verified INTEGER DEFAULT 0, - error TEXT, - - -- File reference - file_id UUID REFERENCES files(id) ON DELETE SET NULL, - - -- Performance metrics - duration_ms INTEGER, - bytes_processed BIGINT, - - -- Error information - error_message TEXT, - error_details JSONB, - - -- Context - session_id VARCHAR(100), - user_agent TEXT, - - -- Audit fields - started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, - completed_at TIMESTAMP WITH TIME ZONE, - executed_at TIMESTAMP WITH TIME ZONE, - created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP -); - --- Deduplication hash store -CREATE TABLE IF NOT EXISTS deduplication_store ( - hash VARCHAR(64) PRIMARY KEY, - canonical_path TEXT NOT NULL, - reference_count INTEGER DEFAULT 1, - first_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, - last_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP -); - --- Migration plan table -CREATE TABLE IF NOT EXISTS migration_plans ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - name VARCHAR(100) NOT NULL, - source_disk VARCHAR(50) NOT NULL, - target_disk VARCHAR(50) NOT NULL, - plan_json JSONB NOT NULL, - - -- Statistics - total_files INTEGER DEFAULT 0, - total_size BIGINT DEFAULT 0, - estimated_duration INTEGER, -- in seconds - - -- Status - status VARCHAR(20) DEFAULT 'draft', - - -- Audit - created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, - executed_at TIMESTAMP WITH TIME ZONE, - completed_at TIMESTAMP WITH TIME ZONE -); - --- Indexes for performance -CREATE INDEX IF NOT EXISTS idx_files_path ON files (path); -CREATE INDEX IF NOT EXISTS idx_files_hash ON files (file_hash); -CREATE INDEX IF NOT EXISTS idx_files_disk ON files (disk_label); -CREATE INDEX IF NOT EXISTS idx_files_category ON files (category); -CREATE INDEX IF NOT EXISTS idx_files_status ON files (status); -create index on files (checksum); -create index on files (checksum,path); - -CREATE INDEX IF NOT EXISTS idx_operations_status ON operations(status); -CREATE INDEX IF NOT EXISTS idx_operations_created ON operations(created_at); -CREATE INDEX IF NOT EXISTS idx_operations_file_id ON operations(file_id); - 
-CREATE INDEX IF NOT EXISTS idx_dedup_canonical ON deduplication_store(canonical_path); - --- Functions for updating timestamps -CREATE OR REPLACE FUNCTION update_updated_at_column() -RETURNS TRIGGER AS $$ -BEGIN - NEW.updated_at = CURRENT_TIMESTAMP; - RETURN NEW; -END; -$$ language 'plpgsql'; - --- Triggers for automatic updated_at -CREATE TRIGGER update_files_updated_at BEFORE UPDATE ON files - FOR EACH ROW EXECUTE FUNCTION update_updated_at_column(); - --- View for operational dashboard -CREATE OR REPLACE VIEW operational_dashboard AS -SELECT - o.status, - COUNT(*) as operation_count, - SUM(o.bytes_processed) as total_bytes, - AVG(o.duration_ms) as avg_duration_ms, - MIN(o.started_at) as earliest_operation, - MAX(o.completed_at) as latest_operation -FROM operations o -WHERE o.started_at > CURRENT_TIMESTAMP - INTERVAL '24 hours' -GROUP BY o.status; - --- View for disk usage statistics -CREATE OR REPLACE VIEW disk_usage_stats AS -SELECT - disk_label, - COUNT(*) as file_count, - SUM(size) as total_size, - AVG(size) as avg_file_size, - MIN(created_time) as oldest_file, - MAX(modified_time) as newest_file -FROM files -GROUP BY disk_label; - --- Insert default configuration -INSERT INTO migration_plans (name, source_disk, target_disk, plan_json, status) -VALUES ( - 'Default Migration Plan', - 'disk_d', - 'disk_e', - '{"strategy": "hardlink", "verify_copies": true, "preserve_timestamps": true}'::jsonb, - 'draft' -) ON CONFLICT DO NOTHING; - --- Create read-only user for monitoring -DO $$ -BEGIN - IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'monitor_user') THEN - CREATE USER monitor_user WITH PASSWORD 'monitor_password'; - END IF; -END -$$; - -GRANT CONNECT ON DATABASE disk_reorganizer_db TO monitor_user; -GRANT USAGE ON SCHEMA public TO monitor_user; -GRANT SELECT ON ALL TABLES IN SCHEMA public TO monitor_user; -GRANT SELECT ON operational_dashboard TO monitor_user; -GRANT SELECT ON disk_usage_stats TO monitor_user; \ No newline at end of file diff --git a/sql/migration/V001__init.sql b/sql/migration/V001__init.sql new file mode 100644 index 0000000..ee1366f --- /dev/null +++ b/sql/migration/V001__init.sql @@ -0,0 +1,188 @@ +-- sql/init.sql +-- Initialize PostgreSQL database for Project Defrag + +-- Enable useful extensions +CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; +CREATE EXTENSION IF NOT EXISTS "pgcrypto"; +-- future tables/sequences created by your owner role (pick the role that creates them) +ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public + GRANT ALL PRIVILEGES ON TABLES TO disk_reorg_user; + +ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public + GRANT ALL PRIVILEGES ON SEQUENCES TO disk_reorg_user; +ALTER DATABASE disk_reorganizer_db OWNER TO disk_reorg_user; +-- Files table +CREATE TABLE IF NOT EXISTS files +( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + path TEXT NOT NULL, + size BIGINT NOT NULL, + modified_time TIMESTAMP WITH TIME ZONE, + created_time TIMESTAMP WITH TIME ZONE, + file_hash VARCHAR(64), -- SHA-256 hash + checksum VARCHAR(64), -- Alias for file_hash (legacy compatibility) + category VARCHAR(50), + disk_label VARCHAR(50), + last_verified TIMESTAMP WITH TIME ZONE, + status VARCHAR(20) DEFAULT 'indexed', + duplicate_of TEXT, -- Path to canonical file if this is a duplicate + + -- Metadata + metadata JSONB DEFAULT '{}', + + -- Audit fields + created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + + -- Constraints + CONSTRAINT unique_file_path UNIQUE (path) 
+); + +-- Operations table (audit log) +CREATE TABLE IF NOT EXISTS operations +( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + operation_type VARCHAR(50) NOT NULL, + source_path TEXT, + target_path TEXT, + status VARCHAR(20) NOT NULL, + + -- Legacy compatibility fields + executed INTEGER DEFAULT 0, + verified INTEGER DEFAULT 0, + error TEXT, + + -- File reference + file_id UUID REFERENCES files (id) ON DELETE SET NULL, + + -- Performance metrics + duration_ms INTEGER, + bytes_processed BIGINT, + + -- Error information + error_message TEXT, + error_details JSONB, + + -- Context + session_id VARCHAR(100), + user_agent TEXT, + + -- Audit fields + started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + completed_at TIMESTAMP WITH TIME ZONE, + executed_at TIMESTAMP WITH TIME ZONE, + created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP +); + +-- Deduplication hash store +CREATE TABLE IF NOT EXISTS deduplication_store +( + hash VARCHAR(64) PRIMARY KEY, + canonical_path TEXT NOT NULL, + reference_count INTEGER DEFAULT 1, + first_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + last_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP +); + +-- Migration plan table +CREATE TABLE IF NOT EXISTS migration_plans +( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + name VARCHAR(100) NOT NULL, + source_disk VARCHAR(50) NOT NULL, + target_disk VARCHAR(50) NOT NULL, + plan_json JSONB NOT NULL, + + -- Statistics + total_files INTEGER DEFAULT 0, + total_size BIGINT DEFAULT 0, + estimated_duration INTEGER, -- in seconds + + -- Status + status VARCHAR(20) DEFAULT 'draft', + + -- Audit + created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + executed_at TIMESTAMP WITH TIME ZONE, + completed_at TIMESTAMP WITH TIME ZONE +); + +-- Indexes for performance +CREATE INDEX IF NOT EXISTS idx_files_path ON files (path); +CREATE INDEX IF NOT EXISTS idx_files_hash ON files (file_hash); +CREATE INDEX IF NOT EXISTS idx_files_disk ON files (disk_label); +CREATE INDEX IF NOT EXISTS idx_files_category ON files (category); +CREATE INDEX IF NOT EXISTS idx_files_status ON files (status); +create index on files (checksum); +create index on files (checksum, path); + +CREATE INDEX IF NOT EXISTS idx_operations_status ON operations (status); +CREATE INDEX IF NOT EXISTS idx_operations_created ON operations (created_at); +CREATE INDEX IF NOT EXISTS idx_operations_file_id ON operations (file_id); + +CREATE INDEX IF NOT EXISTS idx_dedup_canonical ON deduplication_store (canonical_path); + +-- Functions for updating timestamps +CREATE OR REPLACE FUNCTION update_updated_at_column() + RETURNS TRIGGER AS +$$ +BEGIN + NEW.updated_at = CURRENT_TIMESTAMP; + RETURN NEW; +END; +$$ language 'plpgsql'; + +-- Triggers for automatic updated_at +CREATE TRIGGER update_files_updated_at + BEFORE UPDATE + ON files + FOR EACH ROW +EXECUTE FUNCTION update_updated_at_column(); + +-- View for operational dashboard +CREATE OR REPLACE VIEW operational_dashboard AS +SELECT o.status, + COUNT(*) as operation_count, + SUM(o.bytes_processed) as total_bytes, + AVG(o.duration_ms) as avg_duration_ms, + MIN(o.started_at) as earliest_operation, + MAX(o.completed_at) as latest_operation +FROM operations o +WHERE o.started_at > CURRENT_TIMESTAMP - INTERVAL '24 hours' +GROUP BY o.status; + +-- View for disk usage statistics +CREATE OR REPLACE VIEW disk_usage_stats AS +SELECT disk_label, + COUNT(*) as file_count, + SUM(size) as total_size, + AVG(size) as avg_file_size, + MIN(created_time) as oldest_file, + MAX(modified_time) as 
newest_file +FROM files +GROUP BY disk_label; + +-- Insert default configuration +INSERT INTO migration_plans (name, source_disk, target_disk, plan_json, status) +VALUES ('Default Migration Plan', + 'disk_d', + 'disk_e', + '{"strategy": "hardlink", "verify_copies": true, "preserve_timestamps": true}'::jsonb, + 'draft') +ON CONFLICT DO NOTHING; + +-- Create read-only user for monitoring +DO +$$ + BEGIN + IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'monitor_user') THEN + CREATE USER monitor_user WITH PASSWORD 'monitor_password'; + END IF; + END +$$; + +GRANT CONNECT ON DATABASE disk_reorganizer_db TO monitor_user; +GRANT USAGE ON SCHEMA public TO monitor_user; +GRANT SELECT ON ALL TABLES IN SCHEMA public TO monitor_user; +GRANT SELECT ON operational_dashboard TO monitor_user; +GRANT SELECT ON disk_usage_stats TO monitor_user; \ No newline at end of file diff --git a/sql/migration/V002__add_extracted_text.sql b/sql/migration/V002__add_extracted_text.sql new file mode 100644 index 0000000..a3e03ad --- /dev/null +++ b/sql/migration/V002__add_extracted_text.sql @@ -0,0 +1,11 @@ +-- Add extracted text and enrichment columns +ALTER TABLE files ADD COLUMN IF NOT EXISTS extracted_text TEXT; +ALTER TABLE files ADD COLUMN IF NOT EXISTS text_quality VARCHAR(20); +ALTER TABLE files ADD COLUMN IF NOT EXISTS enrichment JSONB; + +-- Add indexes for text search +CREATE INDEX IF NOT EXISTS idx_files_extracted_text ON files USING gin(to_tsvector('english', extracted_text)); +CREATE INDEX IF NOT EXISTS idx_files_enrichment ON files USING gin(enrichment); + +-- Add full text search capability +CREATE INDEX IF NOT EXISTS idx_files_fts ON files USING gin(to_tsvector('english', COALESCE(extracted_text, ''))); diff --git a/sql/migration/V003__add_folder_support.sql b/sql/migration/V003__add_folder_support.sql new file mode 100644 index 0000000..210bcb0 --- /dev/null +++ b/sql/migration/V003__add_folder_support.sql @@ -0,0 +1,41 @@ +CREATE TABLE IF NOT EXISTS folders +( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + path TEXT NOT NULL UNIQUE, + parent_path TEXT, + disk_label VARCHAR(50), + + file_count INT DEFAULT 0, + total_size BIGINT DEFAULT 0, + + project_type VARCHAR(50), + intent TEXT, + summary TEXT, + + has_readme BOOLEAN DEFAULT FALSE, + has_git BOOLEAN DEFAULT FALSE, + has_manifest BOOLEAN DEFAULT FALSE, + manifest_types TEXT[], + dominant_file_types JSONB, + + structure JSONB, + + created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP +); + +CREATE INDEX IF NOT EXISTS idx_folders_path ON folders (path); +CREATE INDEX IF NOT EXISTS idx_folders_parent ON folders (parent_path); +CREATE INDEX IF NOT EXISTS idx_folders_disk ON folders (disk_label); +CREATE INDEX IF NOT EXISTS idx_folders_project_type ON folders (project_type); + +CREATE TABLE IF NOT EXISTS processing_checkpoints +( + task_name VARCHAR(100) PRIMARY KEY, + last_processed_id TEXT, + last_processed_path TEXT, + processed_count INT DEFAULT 0, + total_count INT, + started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP +); diff --git a/sql/setup_database.sql b/sql/setup_database.sql index 0745998..27b7992 100644 --- a/sql/setup_database.sql +++ b/sql/setup_database.sql @@ -19,54 +19,27 @@ CREATE DATABASE disk_reorganizer_db CREATE USER disk_reorg_user WITH PASSWORD 'heel-goed-wachtwoord'; -- Create files table -CREATE TABLE IF NOT EXISTS files ( - path TEXT PRIMARY KEY, - size BIGINT 
NOT NULL, - modified_time DOUBLE PRECISION NOT NULL, - disk_label TEXT NOT NULL, - checksum TEXT, - status TEXT DEFAULT 'indexed', - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP -); -- Create index on disk column for faster queries -CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label); -CREATE INDEX IF NOT EXISTS idx_files_status ON files(status); --- Create operations table -CREATE TABLE IF NOT EXISTS operations ( - id SERIAL PRIMARY KEY, - source_path TEXT NOT NULL, - target_path TEXT NOT NULL, - operation_type TEXT NOT NULL, - executed INTEGER DEFAULT 0, - verified INTEGER DEFAULT 0, - error TEXT, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - executed_at TIMESTAMP -); - --- Create index on operations for faster lookups -CREATE INDEX IF NOT EXISTS idx_operations_executed ON operations(executed); -CREATE INDEX IF NOT EXISTS idx_operations_source ON operations(source_path); -- Grant privileges to disk_reorg_user GRANT CONNECT ON DATABASE disk_reorganizer_db TO disk_reorg_user; GRANT USAGE ON SCHEMA public TO disk_reorg_user; -GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO disk_reorg_user; +GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO disk_reorg_user; GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO disk_reorg_user; -- future tables/sequences created by your owner role (pick the role that creates them) ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public - GRANT ALL PRIVILEGES ON TABLES TO disk_reorg_user; + GRANT ALL PRIVILEGES ON TABLES TO disk_reorg_user; ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public - GRANT ALL PRIVILEGES ON SEQUENCES TO disk_reorg_user; + GRANT ALL PRIVILEGES ON SEQUENCES TO disk_reorg_user; -- Create function to update updated_at timestamp CREATE OR REPLACE FUNCTION update_updated_at_column() -RETURNS TRIGGER AS $$ + RETURNS TRIGGER AS +$$ BEGIN NEW.updated_at = CURRENT_TIMESTAMP; RETURN NEW; @@ -75,9 +48,10 @@ $$ LANGUAGE plpgsql; -- Create trigger for files table CREATE TRIGGER update_files_updated_at - BEFORE UPDATE ON files + BEFORE UPDATE + ON files FOR EACH ROW - EXECUTE FUNCTION update_updated_at_column(); +EXECUTE FUNCTION update_updated_at_column(); -- Display success message \echo 'Database setup completed successfully!'
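
Note on the suffix filter added to parse_files in app/main.py: the generated clause compares RIGHT(path, 4) and RIGHT(path, 3) against the same literal tuples, so five-character suffixes such as '.json' and '.java' never match, the single-element tuple "('.pdf',)" is not valid SQL, and the un-parenthesised OR escapes any further conditions appended to the WHERE clause. Below is a minimal sketch of a parameterised alternative built on LIKE; psycopg2-style %s placeholders are assumed (matching the rest of the module), and the helper name is illustrative, not part of the patch:

    from typing import Optional

    # Suffixes per kind, mirroring the map in parse_files.
    SUFFIXES = {
        'text': ('.txt', '.md', '.log', '.json'),
        'code': ('.py', '.js', '.java', '.go'),
        'pdf': ('.pdf',),
    }

    def build_parse_query(kind: Optional[str], limit: int):
        """Return (sql, params) selecting candidate files for parsing."""
        sql = "SELECT path, size, disk_label FROM files WHERE 1=1"
        params: list = []
        if kind in SUFFIXES:
            # One placeholder per suffix; the OR group is parenthesised so it stays
            # ANDed with anything else added to the WHERE clause.
            placeholders = " OR ".join("path LIKE %s" for _ in SUFFIXES[kind])
            sql += f" AND ({placeholders})"
            params.extend(f"%{suffix}" for suffix in SUFFIXES[kind])
        sql += " LIMIT %s"
        params.append(limit)
        return sql, params

    # Usage: cursor.execute(*build_parse_query(kind, limit))

Matching on path LIKE '%.ext' sidesteps the fixed-width RIGHT() comparison and keeps the filter correct for suffixes of any length. Separately, flyway.conf sets flyway.user=disk_org_user while docker-compose.yml and the V001 grants use disk_reorg_user; if that difference is unintentional, the standalone Flyway run will fail to authenticate.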