fly wa
app/analysis/folder_analyzer.py (new file, 110 lines)
@@ -0,0 +1,110 @@
from pathlib import Path
from typing import Dict, Set, List
from collections import Counter


class FolderAnalyzer:
    def __init__(self):
        self.manifest_files = {
            'java': ['pom.xml', 'build.gradle', 'build.gradle.kts'],
            'javascript': ['package.json', 'yarn.lock', 'package-lock.json'],
            'python': ['pyproject.toml', 'setup.py', 'requirements.txt', 'Pipfile'],
            'go': ['go.mod', 'go.sum'],
            'rust': ['Cargo.toml', 'Cargo.lock'],
            'docker': ['Dockerfile', 'docker-compose.yml', 'docker-compose.yaml'],
            'k8s': ['helm', 'kustomization.yaml', 'deployment.yaml']
        }

        self.intent_keywords = {
            'infrastructure': ['infra', 'deploy', 'k8s', 'docker', 'terraform', 'ansible'],
            'application': ['app', 'service', 'api', 'server', 'client'],
            'data': ['data', 'dataset', 'models', 'training', 'ml'],
            'documentation': ['docs', 'documentation', 'wiki', 'readme'],
            'testing': ['test', 'tests', 'spec', 'e2e', 'integration'],
            'build': ['build', 'dist', 'target', 'out', 'bin'],
            'config': ['config', 'conf', 'settings', 'env']
        }

    def analyze_folder(self, folder_path: Path, files: List[Dict]) -> Dict:
        files_list = [Path(f['path']) for f in files]

        has_readme = any('readme' in f.name.lower() for f in files_list)
        has_git = any('.git' in str(f) for f in files_list)

        manifest_types = self._detect_manifests(files_list)
        has_manifest = len(manifest_types) > 0

        file_types = Counter(f.suffix.lower() for f in files_list if f.suffix)
        dominant_types = dict(file_types.most_common(10))

        intent = self._infer_intent(folder_path.name.lower(), files_list)
        project_type = self._infer_project_type(manifest_types, dominant_types)

        structure = {
            'depth': len(folder_path.parts),
            'has_src': any('src' in str(f) for f in files_list[:20]),
            'has_tests': any('test' in str(f) for f in files_list[:20]),
            'has_docs': any('doc' in str(f) for f in files_list[:20])
        }

        return {
            'has_readme': has_readme,
            'has_git': has_git,
            'has_manifest': has_manifest,
            'manifest_types': manifest_types,
            'dominant_file_types': dominant_types,
            'project_type': project_type,
            'intent': intent,
            'structure': structure
        }

    def _detect_manifests(self, files: List[Path]) -> List[str]:
        detected = []
        file_names = {f.name for f in files}

        for tech, manifests in self.manifest_files.items():
            if any(m in file_names for m in manifests):
                detected.append(tech)

        return detected

    def _infer_intent(self, folder_name: str, files: List[Path]) -> str:
        file_str = ' '.join(str(f) for f in files[:50])

        for intent, keywords in self.intent_keywords.items():
            if any(kw in folder_name or kw in file_str.lower() for kw in keywords):
                return intent

        return 'unknown'

    def _infer_project_type(self, manifests: List[str], file_types: Dict) -> str:
        if manifests:
            return manifests[0]

        if '.py' in file_types and file_types.get('.py', 0) > 5:
            return 'python'
        if '.js' in file_types or '.ts' in file_types:
            return 'javascript'
        if '.java' in file_types:
            return 'java'
        if '.go' in file_types:
            return 'go'

        return 'mixed'

    def generate_summary(self, folder_analysis: Dict, readme_text: str = None) -> str:
        parts = []

        if folder_analysis.get('project_type'):
            parts.append(f"{folder_analysis['project_type']} project")

        if folder_analysis.get('intent'):
            parts.append(f"for {folder_analysis['intent']}")

        if folder_analysis.get('manifest_types'):
            parts.append(f"using {', '.join(folder_analysis['manifest_types'])}")

        if readme_text:
            first_para = readme_text.split('\n\n')[0][:200]
            parts.append(f"Description: {first_para}")

        return ' '.join(parts) if parts else 'Mixed content folder'
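A minimal usage sketch for FolderAnalyzer, assuming file rows have already been flattened into dicts with a 'path' key; the folder name and file list below are made up for illustration:

from pathlib import Path
from analysis.folder_analyzer import FolderAnalyzer

analyzer = FolderAnalyzer()
# Hypothetical listing for one folder; in the app these rows come from the files table.
files = [
    {'path': 'myservice/pom.xml', 'size': 1204},
    {'path': 'myservice/src/Main.java', 'size': 5320},
    {'path': 'myservice/README.md', 'size': 800},
]
analysis = analyzer.analyze_folder(Path('myservice'), files)
print(analysis['project_type'])             # 'java' (pom.xml manifest detected)
print(analysis['intent'])                   # 'application' ('service' matches the folder name)
print(analyzer.generate_summary(analysis))  # 'java project for application using java'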
app/enrichment/enricher.py (new file, 59 lines)
@@ -0,0 +1,59 @@
from typing import Dict
import re


class ContentEnricher:
    def __init__(self, llm_client=None):
        self.llm_client = llm_client
        self.pii_patterns = {
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
            'ssn': r'\b\d{3}-\d{2}-\d{4}\b',
            'credit_card': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b'
        }

    def enrich(self, text: str, use_llm: bool = False) -> Dict:
        enrichment = {
            'summary': self._basic_summary(text),
            'word_count': len(text.split()),
            'has_pii': self._detect_pii(text),
            'quality': self._assess_quality(text),
            'topics': self._extract_basic_topics(text)
        }

        if use_llm and self.llm_client:
            llm_result = self.llm_client.classify_content(text)
            if llm_result.get('success'):
                enrichment['llm_classification'] = llm_result['text']

        return enrichment

    def _basic_summary(self, text: str) -> str:
        sentences = re.split(r'[.!?]+', text)
        return ' '.join(sentences[:3])[:200]

    def _detect_pii(self, text: str) -> Dict:
        detected = {}
        for pii_type, pattern in self.pii_patterns.items():
            matches = re.findall(pattern, text)
            if matches:
                detected[pii_type] = len(matches)
        return detected

    def _assess_quality(self, text: str) -> str:
        if len(text.strip()) < 10:
            return 'low'

        special_char_ratio = sum(1 for c in text if not c.isalnum() and not c.isspace()) / len(text)
        if special_char_ratio > 0.3:
            return 'low'

        return 'high' if len(text.split()) > 50 else 'medium'

    def _extract_basic_topics(self, text: str) -> list:
        words = re.findall(r'\b[A-Z][a-z]+\b', text)
        word_freq = {}
        for word in words:
            if len(word) > 3:
                word_freq[word] = word_freq.get(word, 0) + 1

        return sorted(word_freq, key=word_freq.get, reverse=True)[:10]
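A short sketch of ContentEnricher run without an LLM client; the sample text is invented:

from enrichment.enricher import ContentEnricher

enricher = ContentEnricher()
sample = "Contact John Smith at john.smith@example.com about the Berlin migration plan. " * 10
result = enricher.enrich(sample, use_llm=False)
print(result['quality'])       # 'high' (more than 50 words, few special characters)
print(result['has_pii'])       # {'email': 10}
print(result['topics'][:3])    # most frequent capitalized words, e.g. ['Contact', 'John', 'Smith']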
app/enrichment/llm_client.py (new file, 54 lines)
@@ -0,0 +1,54 @@
import requests
import json
from typing import Dict, Optional


class LLMClient:
    def __init__(self, endpoint: str = 'http://192.168.1.74:1234', model: str = 'local'):
        self.endpoint = endpoint
        self.model = model
        self.local_ollama = 'http://localhost:11434'

    def summarize(self, text: str, max_length: int = 200) -> Dict:
        prompt = f"Summarize the following in {max_length} chars or less:\n\n{text[:2000]}"
        return self._query(prompt)

    def extract_topics(self, text: str) -> Dict:
        prompt = f"Extract 5-10 key topics/tags from this text. Return as comma-separated list:\n\n{text[:2000]}"
        return self._query(prompt)

    def classify_content(self, text: str) -> Dict:
        prompt = f"Classify this content. Return: category, topics, has_pii (yes/no), quality (high/medium/low):\n\n{text[:1000]}"
        return self._query(prompt)

    def _query(self, prompt: str, use_local: bool = False) -> Dict:
        try:
            endpoint = self.local_ollama if use_local else self.endpoint

            if use_local:
                response = requests.post(
                    f'{endpoint}/api/generate',
                    json={'model': 'llama3.2', 'prompt': prompt, 'stream': False},
                    timeout=30
                )
            else:
                response = requests.post(
                    f'{endpoint}/v1/chat/completions',
                    json={
                        'model': self.model,
                        'messages': [{'role': 'user', 'content': prompt}],
                        'max_tokens': 500
                    },
                    timeout=30
                )

            if response.status_code == 200:
                data = response.json()
                if use_local:
                    return {'success': True, 'text': data.get('response', '')}
                else:
                    return {'success': True, 'text': data['choices'][0]['message']['content']}
            else:
                return {'success': False, 'error': f'HTTP {response.status_code}'}

        except Exception as e:
            return {'success': False, 'error': str(e)}
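A hedged usage sketch for LLMClient: the default endpoint comes from the constructor above, and the call only succeeds if an OpenAI-compatible /v1/chat/completions server is actually listening there; failures come back as a dict rather than an exception:

from enrichment.llm_client import LLMClient

client = LLMClient()  # defaults to http://192.168.1.74:1234
result = client.summarize("Quarterly disk usage report: archive volume grew 12% ...")
if result['success']:
    print(result['text'])
else:
    print('LLM unavailable:', result['error'])  # HTTP and network errors are caught in _query()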
app/main.py (251 lines changed)
@@ -27,7 +27,7 @@ class DiskReorganizer:
    def __init__(self, db_config: Dict=None):
        if db_config is None:
-           db_config = {'host': os.getenv('DB_HOST', '192.168.1.159'), 'port': int(os.getenv('DB_PORT', 5432)), 'database': os.getenv('DB_NAME', 'disk_reorganizer_db'), 'user': os.getenv('DB_USER', 'disk_reorg_user'), 'password': os.getenv('DB_PASSWORD', 'heel-goed-wachtwoord')}
+           db_config = {'host': os.getenv('DB_HOST', '192.168.1.159'), 'port': int(os.getenv('DB_PORT', 5432)), 'database': os.getenv('DB_NAME', 'disk_reorganizer_db'), 'user': os.getenv('DB_USER', 'auction'), 'password': os.getenv('DB_PASSWORD', 'heel-goed-wachtwoord')}
        self.db_config = db_config
        self.init_database()
@@ -522,23 +522,126 @@ class DiskReorganizer:
        cursor.close()
        conn.close()

-    def classify_files(self, disk: Optional[str]=None, update_db: bool=False):
+    def parse_files(self, kind: Optional[str] = None, limit: int = 100, update_db: bool = False):
        from parsers.text_parser import TextParser
        from parsers.code_parser import CodeParser
        from parsers.pdf_parser import PDFParser

        parsers = {'text': TextParser(), 'code': CodeParser(), 'pdf': PDFParser()}
        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            query = "SELECT path, size, disk_label FROM files WHERE 1=1"
            params = []
            if kind:
                suffix_map = {'text': "('.txt', '.md', '.log', '.json')", 'code': "('.py', '.js', '.java', '.go')", 'pdf': "('.pdf')"}
                if kind in suffix_map:
                    query += f" AND (RIGHT(path, 4) IN {suffix_map[kind]} OR RIGHT(path, 3) IN {suffix_map[kind]})"
            query += f" LIMIT {limit}"

            cursor.execute(query, params)
            files = cursor.fetchall()

            print(f"\n=== PARSING FILES ===\nProcessing {len(files)} files\n")

            parsed_count = 0
            for path, size, disk_label in files:
                mount_point = disk_mount_map.get(disk_label, disk_label)
                full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)

                if not full_path.exists() or int(size) > 10 * 1024 * 1024:
                    continue

                file_kind = 'pdf' if path.endswith('.pdf') else 'code' if any(path.endswith(e) for e in ['.py', '.js', '.java']) else 'text'
                parser = parsers.get(file_kind)
                if not parser:
                    continue

                result = parser.parse(full_path)
                if 'error' not in result:
                    text = result.get('text', '')
                    quality = result.get('quality', 'unknown')
                    print(f"{path[:60]} | {file_kind} | {len(text):,} chars")

                    if update_db and text:
                        cursor.execute("UPDATE files SET extracted_text = %s, text_quality = %s WHERE path = %s", (text[:50000], quality, path))
                    parsed_count += 1
                    if parsed_count % 10 == 0:
                        conn.commit()

            if update_db:
                conn.commit()
            print(f"\nParsed {parsed_count} files")

        finally:
            cursor.close()
            conn.close()

    def enrich_files(self, limit: int = 10, llm_endpoint: str = None, use_local: bool = False):
        from enrichment.enricher import ContentEnricher

        enricher = ContentEnricher()
        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL LIMIT {limit}")
            files = cursor.fetchall()

            print(f"\n=== ENRICHING CONTENT ===\nProcessing {len(files)} files\n")

            for path, text in files:
                enrichment = enricher.enrich(text[:5000], use_llm=False)
                print(f"{path[:60]}")
                print(f"  Quality: {enrichment.get('quality')} | Words: {enrichment.get('word_count'):,}")
                print(f"  PII: {list(enrichment.get('has_pii', {}).keys())}")
                print(f"  Topics: {', '.join(enrichment.get('topics', [])[:5])}\n")

                cursor.execute("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", (json.dumps(enrichment), path))

            conn.commit()
            print(f"Enriched {len(files)} files")

        finally:
            cursor.close()
            conn.close()

    def classify_files(self, disk: Optional[str]=None, update_db: bool=False, resume: bool=True):
        from classification.classifier import FileClassifier
        classifier = FileClassifier()
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            task_name = f"classify_{disk or 'all'}"
            skip_count = 0

            if resume and update_db:
                cursor.execute('SELECT last_processed_path, processed_count FROM processing_checkpoints WHERE task_name = %s', (task_name,))
                checkpoint = cursor.fetchone()
                if checkpoint:
                    last_path, skip_count = checkpoint
                    logger.info(f'Resuming from checkpoint: {skip_count:,} files already processed')

            if disk:
-                cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s', (disk,))
+                cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s ORDER BY path', (disk,))
            else:
-                cursor.execute('SELECT path, size, disk_label FROM files')
+                cursor.execute('SELECT path, size, disk_label FROM files ORDER BY path')
            files = cursor.fetchall()
            total = len(files)
            logger.info(f'Classifying {total:,} files...')

            categories = {}
            build_artifacts = 0
            batch = []
            processed = 0

            for idx, (path, size, disk_label) in enumerate(files, 1):
                if idx <= skip_count:
                    continue

                labels, category, is_build = classifier.classify_path(path, int(size))
                if is_build:
                    build_artifacts += 1
@@ -546,18 +649,40 @@ class DiskReorganizer:
                    categories[category] = {'count': 0, 'size': 0}
                categories[category]['count'] += 1
                categories[category]['size'] += int(size)

                if update_db:
                    labels_str = ','.join(labels)
                    batch.append((category, labels_str, path))

                    if len(batch) >= 1000:
                        cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch])
                        cursor.execute('''
                            INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at)
                            VALUES (%s, %s, %s, CURRENT_TIMESTAMP)
                            ON CONFLICT (task_name) DO UPDATE SET
                                last_processed_path = EXCLUDED.last_processed_path,
                                processed_count = EXCLUDED.processed_count,
                                updated_at = CURRENT_TIMESTAMP
                        ''', (task_name, path, idx))
                        conn.commit()
                        batch.clear()

                processed += 1
                if idx % 1000 == 0:
-                    print(f'\rClassified: {idx:,}/{total:,}', end='', flush=True)
+                    print(f'\rClassified: {idx:,}/{total:,} ({100*idx/total:.1f}%)', end='', flush=True)

            if update_db and batch:
                cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch])
                cursor.execute('''
                    INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at)
                    VALUES (%s, %s, %s, CURRENT_TIMESTAMP)
                    ON CONFLICT (task_name) DO UPDATE SET
                        last_processed_path = EXCLUDED.last_processed_path,
                        processed_count = EXCLUDED.processed_count,
                        updated_at = CURRENT_TIMESTAMP
                ''', (task_name, files[-1][0] if files else '', total))
                conn.commit()

            print()
            print(f'\n=== CLASSIFICATION SUMMARY ===')
            print(f'Total files: {total:,}')
@@ -570,6 +695,99 @@ class DiskReorganizer:
            cursor.close()
            conn.close()

    def analyze_folders(self, disk: Optional[str]=None, min_files: int=3):
        from analysis.folder_analyzer import FolderAnalyzer
        analyzer = FolderAnalyzer()
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            query = '''
                SELECT DISTINCT SUBSTRING(path FROM 1 FOR POSITION('/' IN path || '/') - 1) as folder, disk_label
                FROM files
                WHERE 1=1
            '''
            params = []
            if disk:
                query += ' AND disk_label = %s'
                params.append(disk)

            cursor.execute(query, params)
            potential_folders = cursor.fetchall()

            logger.info(f'Found {len(potential_folders)} potential folders to analyze')

            processed = 0
            for folder_name, disk_label in potential_folders:
                cursor.execute('''
                    SELECT path, size FROM files
                    WHERE disk_label = %s AND path LIKE %s
                ''', (disk_label, f'{folder_name}%'))

                files = cursor.fetchall()
                if len(files) < min_files:
                    continue

                files_list = [{'path': f[0], 'size': int(f[1])} for f in files]
                folder_path = Path(folder_name)

                analysis = analyzer.analyze_folder(folder_path, files_list)

                readme_text = None
                for file_dict in files_list:
                    if 'readme' in file_dict['path'].lower():
                        readme_text = f"Found README at {file_dict['path']}"
                        break

                summary = analyzer.generate_summary(analysis, readme_text)

                cursor.execute('''
                    INSERT INTO folders (path, disk_label, file_count, total_size, project_type, intent, summary,
                                         has_readme, has_git, has_manifest, manifest_types, dominant_file_types, structure)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (path) DO UPDATE SET
                        file_count = EXCLUDED.file_count,
                        total_size = EXCLUDED.total_size,
                        project_type = EXCLUDED.project_type,
                        intent = EXCLUDED.intent,
                        summary = EXCLUDED.summary,
                        has_readme = EXCLUDED.has_readme,
                        has_git = EXCLUDED.has_git,
                        has_manifest = EXCLUDED.has_manifest,
                        manifest_types = EXCLUDED.manifest_types,
                        dominant_file_types = EXCLUDED.dominant_file_types,
                        structure = EXCLUDED.structure,
                        updated_at = CURRENT_TIMESTAMP
                ''', (
                    str(folder_path), disk_label, len(files_list), sum(f['size'] for f in files_list),
                    analysis.get('project_type'), analysis.get('intent'), summary,
                    analysis.get('has_readme'), analysis.get('has_git'), analysis.get('has_manifest'),
                    analysis.get('manifest_types'), json.dumps(analysis.get('dominant_file_types', {})),
                    json.dumps(analysis.get('structure', {}))
                ))

                processed += 1
                if processed % 100 == 0:
                    conn.commit()
                    print(f'\rAnalyzed: {processed} folders', end='', flush=True)

            conn.commit()
            print()
            logger.info(f'Completed folder analysis: {processed} folders')

            cursor.execute('''
                SELECT project_type, COUNT(*), SUM(file_count), SUM(total_size)
                FROM folders
                GROUP BY project_type
            ''')
            print(f'\n=== FOLDER ANALYSIS SUMMARY ===')
            for row in cursor.fetchall():
                proj_type, count, files, size = row
                print(f'{proj_type:20}: {count:6,} folders, {files:8,} files, {self.format_size(int(size or 0))}')

        finally:
            cursor.close()
            conn.close()

    def review_migration(self, category: Optional[str]=None, show_build: bool=False):
        from classification.classifier import FileClassifier
        classifier = FileClassifier()
@@ -640,9 +858,24 @@ def main():
    extract_parser = subparsers.add_parser('extract', help='Extract content from files')
    extract_parser.add_argument('--kind', help='Extract specific kind (pdf, image, audio, video)')
    extract_parser.add_argument('--limit', type=int, default=10, help='Limit extraction batch')

    parse_parser = subparsers.add_parser('parse', help='Parse files to extract text')
    parse_parser.add_argument('--kind', help='Parse specific kind (text, code, pdf)')
    parse_parser.add_argument('--limit', type=int, default=100, help='Limit parse batch')
    parse_parser.add_argument('--update', action='store_true', help='Save extracted text to database')

    enrich_parser = subparsers.add_parser('enrich', help='Enrich content with LLM analysis')
    enrich_parser.add_argument('--limit', type=int, default=10, help='Limit enrichment batch')
    enrich_parser.add_argument('--llm-endpoint', default='http://192.168.1.74:1234', help='LLM endpoint')
    enrich_parser.add_argument('--local', action='store_true', help='Use local Ollama')

    classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization')
    classify_parser.add_argument('--disk', help='Classify specific disk')
    classify_parser.add_argument('--update', action='store_true', help='Update database with classifications')
    classify_parser.add_argument('--no-resume', action='store_true', help='Start from scratch instead of resuming')

    folders_parser = subparsers.add_parser('analyze-folders', help='Analyze folder structure and infer project intent')
    folders_parser.add_argument('--disk', help='Analyze specific disk')
    folders_parser.add_argument('--min-files', type=int, default=3, help='Minimum files per folder')

    review_parser = subparsers.add_parser('review', help='Review proposed migration structure')
    review_parser.add_argument('--category', help='Review specific category')
    review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts')
@@ -669,8 +902,14 @@ def main():
        tool.profile_content(disk=args.disk, update_db=args.update, limit=args.limit)
    elif args.command == 'extract':
        tool.extract_content(kind=args.kind, limit=args.limit)
    elif args.command == 'parse':
        tool.parse_files(kind=args.kind, limit=args.limit, update_db=args.update)
    elif args.command == 'enrich':
        tool.enrich_files(limit=args.limit, llm_endpoint=args.llm_endpoint, use_local=args.local)
    elif args.command == 'classify':
-        tool.classify_files(disk=args.disk, update_db=args.update)
+        tool.classify_files(disk=args.disk, update_db=args.update, resume=not args.no_resume)
    elif args.command == 'analyze-folders':
        tool.analyze_folders(disk=args.disk, min_files=args.min_files)
    elif args.command == 'review':
        tool.review_migration(category=args.category, show_build=args.show_build)
    elif args.command == 'report':
app/parsers/code_parser.py (new file, 44 lines)
@@ -0,0 +1,44 @@
from pathlib import Path
from typing import Dict
import re


class CodeParser:
    def __init__(self):
        self.patterns = {
            'python': {'imports': r'^import |^from .+ import', 'class': r'^class \w+', 'function': r'^def \w+'},
            'javascript': {'imports': r'^import |^require\(', 'class': r'^class \w+', 'function': r'^function \w+|^const \w+ = '},
            'java': {'package': r'^package ', 'imports': r'^import ', 'class': r'^public class \w+'},
            'go': {'package': r'^package ', 'imports': r'^import ', 'function': r'^func \w+'}
        }

    def parse(self, file_path: Path) -> Dict:
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()

            language = self._detect_language(file_path, text)
            structure = self._extract_structure(text, language)

            return {
                'text': text,
                'language': language,
                'line_count': len(text.split('\n')),
                'structure': structure,
                'quality': 'high'
            }
        except Exception as e:
            return {'error': str(e)}

    def _detect_language(self, file_path: Path, text: str) -> str:
        lang_map = {'.py': 'python', '.js': 'javascript', '.ts': 'typescript', '.java': 'java', '.go': 'go'}
        return lang_map.get(file_path.suffix.lower(), 'unknown')

    def _extract_structure(self, text: str, language: str) -> Dict:
        patterns = self.patterns.get(language, {})
        structure = {'type': 'code', 'language': language}

        for key, pattern in patterns.items():
            matches = re.findall(pattern, text, re.MULTILINE)
            structure[key] = len(matches)

        return structure
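A quick sketch of CodeParser on a single source file; the path points at a file added in this commit:

from pathlib import Path
from parsers.code_parser import CodeParser

parser = CodeParser()
result = parser.parse(Path('app/analysis/folder_analyzer.py'))
if 'error' not in result:
    # structure holds regex-based counts, e.g. {'type': 'code', 'language': 'python', 'imports': ..., 'class': ..., 'function': ...}
    print(result['language'], result['line_count'], result['structure'])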
app/parsers/media_parser.py (new file, 42 lines)
@@ -0,0 +1,42 @@
from pathlib import Path
from typing import Dict


class MediaParser:
    def parse_audio(self, file_path: Path) -> Dict:
        return {
            'text': '[Audio transcription pending]',
            'needs_transcription': True,
            'transcription_service': 'whisper',
            'structure': {'type': 'audio'},
            'quality': 'pending'
        }

    def parse_video(self, file_path: Path) -> Dict:
        return {
            'text': '[Video transcription pending]',
            'needs_transcription': True,
            'needs_scene_detection': True,
            'transcription_service': 'whisper',
            'structure': {'type': 'video'},
            'quality': 'pending'
        }

    def parse_image(self, file_path: Path) -> Dict:
        try:
            from PIL import Image

            with Image.open(file_path) as img:
                width, height = img.size
                mode = img.mode

            return {
                'text': '[Image caption/OCR pending]',
                'needs_ocr': True,
                'needs_caption': True,
                'dimensions': f'{width}x{height}',
                'mode': mode,
                'structure': {'type': 'image', 'width': width, 'height': height},
                'quality': 'pending'
            }
        except Exception as e:
            return {'error': str(e)}
app/parsers/pdf_parser.py (new file, 31 lines)
@@ -0,0 +1,31 @@
from pathlib import Path
from typing import Dict, List


class PDFParser:
    def parse(self, file_path: Path) -> Dict:
        try:
            import PyPDF2

            pages = []
            with open(file_path, 'rb') as f:
                pdf = PyPDF2.PdfReader(f)
                page_count = len(pdf.pages)

                for i, page in enumerate(pdf.pages[:50]):
                    text = page.extract_text()
                    pages.append({'page': i + 1, 'text': text, 'char_count': len(text)})

            full_text = '\n\n'.join([p['text'] for p in pages])
            has_text_layer = sum(p['char_count'] for p in pages) > 100

            return {
                'text': full_text,
                'page_count': page_count,
                'pages_extracted': len(pages),
                'has_text_layer': has_text_layer,
                'needs_ocr': not has_text_layer,
                'structure': {'type': 'document', 'pages': pages[:5]},
                'quality': 'high' if has_text_layer else 'needs_ocr'
            }
        except Exception as e:
            return {'error': str(e), 'needs_ocr': True}
app/parsers/text_parser.py (new file, 26 lines)
@@ -0,0 +1,26 @@
from pathlib import Path
from typing import Dict, Optional
import chardet


class TextParser:
    def parse(self, file_path: Path) -> Dict:
        try:
            with open(file_path, 'rb') as f:
                raw_data = f.read(1024 * 1024)

            encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
            text = raw_data.decode(encoding, errors='ignore')

            lines = text.split('\n')

            return {
                'text': text,
                'encoding': encoding,
                'line_count': len(lines),
                'char_count': len(text),
                'word_count': len(text.split()),
                'structure': {'type': 'plain_text'},
                'quality': 'high' if encoding == 'utf-8' else 'medium'
            }
        except Exception as e:
            return {'error': str(e)}
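All parsers share one contract: parse(path) returns a dict with 'text', a 'structure' dict and a 'quality' flag, or an 'error' key on failure, which is what parse_files() in app/main.py relies on. A minimal dispatch sketch (the sample path is hypothetical):

from pathlib import Path
from parsers.text_parser import TextParser
from parsers.pdf_parser import PDFParser

path = Path('/media/mike/DISK1/notes/meeting.txt')  # hypothetical file
parser = PDFParser() if path.suffix.lower() == '.pdf' else TextParser()
result = parser.parse(path)
if 'error' in result:
    print('skip:', result['error'])
else:
    print(result['quality'], result.get('word_count'), len(result['text']))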
@@ -232,6 +232,37 @@ services:
    networks:
      - defrag-network

  flyway:
    image: flyway/flyway:latest
    container_name: flyway
    volumes:
      - ./sql/migration:/flyway/sql:ro
    environment:
      FLYWAY_URL: jdbc:postgresql://192.168.1.159:5432/disk_reorganizer_db
      FLYWAY_USER: disk_reorg_user
      FLYWAY_PASSWORD: heel-goed-wachtwoord
      FLYWAY_SCHEMAS: public
      FLYWAY_LOCATIONS: filesystem:./sql
      FLYWAY_CONNECT_RETRIES: "60"
    command: migrate
    restart: "no"

  pg_backup:
    image: postgres:16
    container_name: pg_backup
    environment:
      PGPASSWORD: heel-goed-wachtwoord
    volumes:
      - ./:/backup
    command:
      - bash
      - -lc
      - >
        pg_dump -h 192.168.1.159 -p 5432 -U disk_reorg_user -d disk_reorganizer_db
        --format=custom --no-owner --no-privileges
        -f /backup/backup_$(date +%F_%H%M)_disk_reorganizer_db.dump
    restart: "no"

networks:
  defrag-network:
    driver: bridge
flyway.conf (new file, 7 lines)
@@ -0,0 +1,7 @@
flyway.url=jdbc:postgresql://192.168.1.159:5432/disk_reorganizer_db
flyway.user=disk_org_user
flyway.password=heel-goed-wachtwoord

flyway.locations=filesystem:sql/migration
flyway.schemas=public
@@ -37,3 +37,5 @@ pytest-cov>=4.0.0
black>=23.0.0
mypy>=1.0.0
flake8>=6.0.0

chardet
sql/init.sql (deleted, 176 lines)
@@ -1,176 +0,0 @@
-- sql/init.sql
-- Initialize PostgreSQL database for Project Defrag

-- Enable useful extensions
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS "pgcrypto";

-- Files table
CREATE TABLE IF NOT EXISTS files (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    path TEXT NOT NULL,
    size BIGINT NOT NULL,
    modified_time TIMESTAMP WITH TIME ZONE,
    created_time TIMESTAMP WITH TIME ZONE,
    file_hash VARCHAR(64), -- SHA-256 hash
    checksum VARCHAR(64), -- Alias for file_hash (legacy compatibility)
    category VARCHAR(50),
    disk_label VARCHAR(50),
    last_verified TIMESTAMP WITH TIME ZONE,
    status VARCHAR(20) DEFAULT 'indexed',
    duplicate_of TEXT, -- Path to canonical file if this is a duplicate

    -- Metadata
    metadata JSONB DEFAULT '{}',

    -- Audit fields
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,

    -- Constraints
    CONSTRAINT unique_file_path UNIQUE(path)
);

-- Operations table (audit log)
CREATE TABLE IF NOT EXISTS operations (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    operation_type VARCHAR(50) NOT NULL,
    source_path TEXT,
    target_path TEXT,
    status VARCHAR(20) NOT NULL,

    -- Legacy compatibility fields
    executed INTEGER DEFAULT 0,
    verified INTEGER DEFAULT 0,
    error TEXT,

    -- File reference
    file_id UUID REFERENCES files(id) ON DELETE SET NULL,

    -- Performance metrics
    duration_ms INTEGER,
    bytes_processed BIGINT,

    -- Error information
    error_message TEXT,
    error_details JSONB,

    -- Context
    session_id VARCHAR(100),
    user_agent TEXT,

    -- Audit fields
    started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    completed_at TIMESTAMP WITH TIME ZONE,
    executed_at TIMESTAMP WITH TIME ZONE,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

-- Deduplication hash store
CREATE TABLE IF NOT EXISTS deduplication_store (
    hash VARCHAR(64) PRIMARY KEY,
    canonical_path TEXT NOT NULL,
    reference_count INTEGER DEFAULT 1,
    first_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

-- Migration plan table
CREATE TABLE IF NOT EXISTS migration_plans (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    name VARCHAR(100) NOT NULL,
    source_disk VARCHAR(50) NOT NULL,
    target_disk VARCHAR(50) NOT NULL,
    plan_json JSONB NOT NULL,

    -- Statistics
    total_files INTEGER DEFAULT 0,
    total_size BIGINT DEFAULT 0,
    estimated_duration INTEGER, -- in seconds

    -- Status
    status VARCHAR(20) DEFAULT 'draft',

    -- Audit
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    executed_at TIMESTAMP WITH TIME ZONE,
    completed_at TIMESTAMP WITH TIME ZONE
);

-- Indexes for performance
CREATE INDEX IF NOT EXISTS idx_files_path ON files (path);
CREATE INDEX IF NOT EXISTS idx_files_hash ON files (file_hash);
CREATE INDEX IF NOT EXISTS idx_files_disk ON files (disk_label);
CREATE INDEX IF NOT EXISTS idx_files_category ON files (category);
CREATE INDEX IF NOT EXISTS idx_files_status ON files (status);
create index on files (checksum);
create index on files (checksum,path);

CREATE INDEX IF NOT EXISTS idx_operations_status ON operations(status);
CREATE INDEX IF NOT EXISTS idx_operations_created ON operations(created_at);
CREATE INDEX IF NOT EXISTS idx_operations_file_id ON operations(file_id);

CREATE INDEX IF NOT EXISTS idx_dedup_canonical ON deduplication_store(canonical_path);

-- Functions for updating timestamps
CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$$ language 'plpgsql';

-- Triggers for automatic updated_at
CREATE TRIGGER update_files_updated_at BEFORE UPDATE ON files
    FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();

-- View for operational dashboard
CREATE OR REPLACE VIEW operational_dashboard AS
SELECT
    o.status,
    COUNT(*) as operation_count,
    SUM(o.bytes_processed) as total_bytes,
    AVG(o.duration_ms) as avg_duration_ms,
    MIN(o.started_at) as earliest_operation,
    MAX(o.completed_at) as latest_operation
FROM operations o
WHERE o.started_at > CURRENT_TIMESTAMP - INTERVAL '24 hours'
GROUP BY o.status;

-- View for disk usage statistics
CREATE OR REPLACE VIEW disk_usage_stats AS
SELECT
    disk_label,
    COUNT(*) as file_count,
    SUM(size) as total_size,
    AVG(size) as avg_file_size,
    MIN(created_time) as oldest_file,
    MAX(modified_time) as newest_file
FROM files
GROUP BY disk_label;

-- Insert default configuration
INSERT INTO migration_plans (name, source_disk, target_disk, plan_json, status)
VALUES (
    'Default Migration Plan',
    'disk_d',
    'disk_e',
    '{"strategy": "hardlink", "verify_copies": true, "preserve_timestamps": true}'::jsonb,
    'draft'
) ON CONFLICT DO NOTHING;

-- Create read-only user for monitoring
DO $$
BEGIN
    IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'monitor_user') THEN
        CREATE USER monitor_user WITH PASSWORD 'monitor_password';
    END IF;
END
$$;

GRANT CONNECT ON DATABASE disk_reorganizer_db TO monitor_user;
GRANT USAGE ON SCHEMA public TO monitor_user;
GRANT SELECT ON ALL TABLES IN SCHEMA public TO monitor_user;
GRANT SELECT ON operational_dashboard TO monitor_user;
GRANT SELECT ON disk_usage_stats TO monitor_user;
sql/migration/V001__init.sql (new file, 188 lines)
@@ -0,0 +1,188 @@
-- sql/migration/V001__init.sql
-- Initialize PostgreSQL database for Project Defrag

-- Enable useful extensions
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS "pgcrypto";

-- future tables/sequences created by your owner role (pick the role that creates them)
ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public
    GRANT ALL PRIVILEGES ON TABLES TO disk_reorg_user;

ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public
    GRANT ALL PRIVILEGES ON SEQUENCES TO disk_reorg_user;
ALTER DATABASE disk_reorganizer_db OWNER TO disk_reorg_user;

-- Files table
CREATE TABLE IF NOT EXISTS files
(
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    path TEXT NOT NULL,
    size BIGINT NOT NULL,
    modified_time TIMESTAMP WITH TIME ZONE,
    created_time TIMESTAMP WITH TIME ZONE,
    file_hash VARCHAR(64), -- SHA-256 hash
    checksum VARCHAR(64), -- Alias for file_hash (legacy compatibility)
    category VARCHAR(50),
    disk_label VARCHAR(50),
    last_verified TIMESTAMP WITH TIME ZONE,
    status VARCHAR(20) DEFAULT 'indexed',
    duplicate_of TEXT, -- Path to canonical file if this is a duplicate

    -- Metadata
    metadata JSONB DEFAULT '{}',

    -- Audit fields
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,

    -- Constraints
    CONSTRAINT unique_file_path UNIQUE (path)
);

-- Operations table (audit log)
CREATE TABLE IF NOT EXISTS operations
(
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    operation_type VARCHAR(50) NOT NULL,
    source_path TEXT,
    target_path TEXT,
    status VARCHAR(20) NOT NULL,

    -- Legacy compatibility fields
    executed INTEGER DEFAULT 0,
    verified INTEGER DEFAULT 0,
    error TEXT,

    -- File reference
    file_id UUID REFERENCES files (id) ON DELETE SET NULL,

    -- Performance metrics
    duration_ms INTEGER,
    bytes_processed BIGINT,

    -- Error information
    error_message TEXT,
    error_details JSONB,

    -- Context
    session_id VARCHAR(100),
    user_agent TEXT,

    -- Audit fields
    started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    completed_at TIMESTAMP WITH TIME ZONE,
    executed_at TIMESTAMP WITH TIME ZONE,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

-- Deduplication hash store
CREATE TABLE IF NOT EXISTS deduplication_store
(
    hash VARCHAR(64) PRIMARY KEY,
    canonical_path TEXT NOT NULL,
    reference_count INTEGER DEFAULT 1,
    first_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

-- Migration plan table
CREATE TABLE IF NOT EXISTS migration_plans
(
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    name VARCHAR(100) NOT NULL,
    source_disk VARCHAR(50) NOT NULL,
    target_disk VARCHAR(50) NOT NULL,
    plan_json JSONB NOT NULL,

    -- Statistics
    total_files INTEGER DEFAULT 0,
    total_size BIGINT DEFAULT 0,
    estimated_duration INTEGER, -- in seconds

    -- Status
    status VARCHAR(20) DEFAULT 'draft',

    -- Audit
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    executed_at TIMESTAMP WITH TIME ZONE,
    completed_at TIMESTAMP WITH TIME ZONE
);

-- Indexes for performance
CREATE INDEX IF NOT EXISTS idx_files_path ON files (path);
CREATE INDEX IF NOT EXISTS idx_files_hash ON files (file_hash);
CREATE INDEX IF NOT EXISTS idx_files_disk ON files (disk_label);
CREATE INDEX IF NOT EXISTS idx_files_category ON files (category);
CREATE INDEX IF NOT EXISTS idx_files_status ON files (status);
create index on files (checksum);
create index on files (checksum, path);

CREATE INDEX IF NOT EXISTS idx_operations_status ON operations (status);
CREATE INDEX IF NOT EXISTS idx_operations_created ON operations (created_at);
CREATE INDEX IF NOT EXISTS idx_operations_file_id ON operations (file_id);

CREATE INDEX IF NOT EXISTS idx_dedup_canonical ON deduplication_store (canonical_path);

-- Functions for updating timestamps
CREATE OR REPLACE FUNCTION update_updated_at_column()
    RETURNS TRIGGER AS
$$
BEGIN
    NEW.updated_at = CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$$ language 'plpgsql';

-- Triggers for automatic updated_at
CREATE TRIGGER update_files_updated_at
    BEFORE UPDATE
    ON files
    FOR EACH ROW
EXECUTE FUNCTION update_updated_at_column();

-- View for operational dashboard
CREATE OR REPLACE VIEW operational_dashboard AS
SELECT o.status,
       COUNT(*) as operation_count,
       SUM(o.bytes_processed) as total_bytes,
       AVG(o.duration_ms) as avg_duration_ms,
       MIN(o.started_at) as earliest_operation,
       MAX(o.completed_at) as latest_operation
FROM operations o
WHERE o.started_at > CURRENT_TIMESTAMP - INTERVAL '24 hours'
GROUP BY o.status;

-- View for disk usage statistics
CREATE OR REPLACE VIEW disk_usage_stats AS
SELECT disk_label,
       COUNT(*) as file_count,
       SUM(size) as total_size,
       AVG(size) as avg_file_size,
       MIN(created_time) as oldest_file,
       MAX(modified_time) as newest_file
FROM files
GROUP BY disk_label;

-- Insert default configuration
INSERT INTO migration_plans (name, source_disk, target_disk, plan_json, status)
VALUES ('Default Migration Plan',
        'disk_d',
        'disk_e',
        '{"strategy": "hardlink", "verify_copies": true, "preserve_timestamps": true}'::jsonb,
        'draft')
ON CONFLICT DO NOTHING;

-- Create read-only user for monitoring
DO
$$
BEGIN
    IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'monitor_user') THEN
        CREATE USER monitor_user WITH PASSWORD 'monitor_password';
    END IF;
END
$$;

GRANT CONNECT ON DATABASE disk_reorganizer_db TO monitor_user;
GRANT USAGE ON SCHEMA public TO monitor_user;
GRANT SELECT ON ALL TABLES IN SCHEMA public TO monitor_user;
GRANT SELECT ON operational_dashboard TO monitor_user;
GRANT SELECT ON disk_usage_stats TO monitor_user;
sql/migration/V002__add_extracted_text.sql (new file, 11 lines)
@@ -0,0 +1,11 @@
-- Add extracted text and enrichment columns
ALTER TABLE files ADD COLUMN IF NOT EXISTS extracted_text TEXT;
ALTER TABLE files ADD COLUMN IF NOT EXISTS text_quality VARCHAR(20);
ALTER TABLE files ADD COLUMN IF NOT EXISTS enrichment JSONB;

-- Add indexes for text search
CREATE INDEX IF NOT EXISTS idx_files_extracted_text ON files USING gin(to_tsvector('english', extracted_text));
CREATE INDEX IF NOT EXISTS idx_files_enrichment ON files USING gin(enrichment);

-- Add full text search capability
CREATE INDEX IF NOT EXISTS idx_files_fts ON files USING gin(to_tsvector('english', COALESCE(extracted_text, '')));
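A small sketch of querying the new full-text index from Python, assuming an open cursor from the same psycopg2-style connection the app already uses; the search term is arbitrary:

# Rank files whose extracted_text matches a term; the expression mirrors idx_files_fts so the GIN index can be used.
cursor.execute("""
    SELECT path,
           ts_rank(to_tsvector('english', COALESCE(extracted_text, '')),
                   plainto_tsquery('english', %s)) AS rank
    FROM files
    WHERE to_tsvector('english', COALESCE(extracted_text, '')) @@ plainto_tsquery('english', %s)
    ORDER BY rank DESC
    LIMIT 20
""", ('kubernetes deployment', 'kubernetes deployment'))
for path, rank in cursor.fetchall():
    print(f'{rank:.3f}  {path}')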
sql/migration/V003__add_folder_support.sql (new file, 41 lines)
@@ -0,0 +1,41 @@
CREATE TABLE IF NOT EXISTS folders
(
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    path TEXT NOT NULL UNIQUE,
    parent_path TEXT,
    disk_label VARCHAR(50),

    file_count INT DEFAULT 0,
    total_size BIGINT DEFAULT 0,

    project_type VARCHAR(50),
    intent TEXT,
    summary TEXT,

    has_readme BOOLEAN DEFAULT FALSE,
    has_git BOOLEAN DEFAULT FALSE,
    has_manifest BOOLEAN DEFAULT FALSE,
    manifest_types TEXT[],
    dominant_file_types JSONB,

    structure JSONB,

    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_folders_path ON folders (path);
CREATE INDEX IF NOT EXISTS idx_folders_parent ON folders (parent_path);
CREATE INDEX IF NOT EXISTS idx_folders_disk ON folders (disk_label);
CREATE INDEX IF NOT EXISTS idx_folders_project_type ON folders (project_type);

CREATE TABLE IF NOT EXISTS processing_checkpoints
(
    task_name VARCHAR(100) PRIMARY KEY,
    last_processed_id TEXT,
    last_processed_path TEXT,
    processed_count INT DEFAULT 0,
    total_count INT,
    started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);
@@ -19,37 +19,9 @@ CREATE DATABASE disk_reorganizer_db
CREATE USER disk_reorg_user WITH PASSWORD 'heel-goed-wachtwoord';

-- Create files table
-CREATE TABLE IF NOT EXISTS files (
-    path TEXT PRIMARY KEY,
-    size BIGINT NOT NULL,
-    modified_time DOUBLE PRECISION NOT NULL,
-    disk_label TEXT NOT NULL,
-    checksum TEXT,
-    status TEXT DEFAULT 'indexed',
-    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
-);

-- Create index on disk column for faster queries
-CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label);
-CREATE INDEX IF NOT EXISTS idx_files_status ON files(status);

--- Create operations table
-CREATE TABLE IF NOT EXISTS operations (
-    id SERIAL PRIMARY KEY,
-    source_path TEXT NOT NULL,
-    target_path TEXT NOT NULL,
-    operation_type TEXT NOT NULL,
-    executed INTEGER DEFAULT 0,
-    verified INTEGER DEFAULT 0,
-    error TEXT,
-    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-    executed_at TIMESTAMP
-);

--- Create index on operations for faster lookups
-CREATE INDEX IF NOT EXISTS idx_operations_executed ON operations(executed);
-CREATE INDEX IF NOT EXISTS idx_operations_source ON operations(source_path);

-- Grant privileges to disk_reorg_user
GRANT CONNECT ON DATABASE disk_reorganizer_db TO disk_reorg_user;
@@ -66,7 +38,8 @@ ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public
-- Create function to update updated_at timestamp
CREATE OR REPLACE FUNCTION update_updated_at_column()
-RETURNS TRIGGER AS $$
+    RETURNS TRIGGER AS
+$$
BEGIN
    NEW.updated_at = CURRENT_TIMESTAMP;
    RETURN NEW;
@@ -75,7 +48,8 @@ $$ LANGUAGE plpgsql;
-- Create trigger for files table
CREATE TRIGGER update_files_updated_at
-    BEFORE UPDATE ON files
+    BEFORE UPDATE
+    ON files
    FOR EACH ROW
EXECUTE FUNCTION update_updated_at_column();