From e9eb7ea5d9138838860aa5c50bfdc308986626f1 Mon Sep 17 00:00:00 2001 From: mike Date: Sat, 13 Dec 2025 11:35:33 +0100 Subject: [PATCH] fly wa --- app/analysis/folder_analyzer.py | 110 +++++++++ app/enrichment/enricher.py | 59 +++++ app/enrichment/llm_client.py | 54 +++++ app/main.py | 251 ++++++++++++++++++++- app/parsers/code_parser.py | 44 ++++ app/parsers/media_parser.py | 42 ++++ app/parsers/pdf_parser.py | 31 +++ app/parsers/text_parser.py | 26 +++ docker-compose.yml | 31 +++ flyway.conf | 7 + requirements.txt | 2 + sql/init.sql | 176 --------------- sql/migration/V001__init.sql | 188 +++++++++++++++ sql/migration/V002__add_extracted_text.sql | 11 + sql/migration/V003__add_folder_support.sql | 41 ++++ sql/setup_database.sql | 42 +--- 16 files changed, 899 insertions(+), 216 deletions(-) create mode 100644 app/analysis/folder_analyzer.py create mode 100644 app/enrichment/enricher.py create mode 100644 app/enrichment/llm_client.py create mode 100644 app/parsers/code_parser.py create mode 100644 app/parsers/media_parser.py create mode 100644 app/parsers/pdf_parser.py create mode 100644 app/parsers/text_parser.py create mode 100644 flyway.conf delete mode 100644 sql/init.sql create mode 100644 sql/migration/V001__init.sql create mode 100644 sql/migration/V002__add_extracted_text.sql create mode 100644 sql/migration/V003__add_folder_support.sql diff --git a/app/analysis/folder_analyzer.py b/app/analysis/folder_analyzer.py new file mode 100644 index 0000000..74d4e1c --- /dev/null +++ b/app/analysis/folder_analyzer.py @@ -0,0 +1,110 @@ +from pathlib import Path +from typing import Dict, Set, List +from collections import Counter + +class FolderAnalyzer: + def __init__(self): + self.manifest_files = { + 'java': ['pom.xml', 'build.gradle', 'build.gradle.kts'], + 'javascript': ['package.json', 'yarn.lock', 'package-lock.json'], + 'python': ['pyproject.toml', 'setup.py', 'requirements.txt', 'Pipfile'], + 'go': ['go.mod', 'go.sum'], + 'rust': ['Cargo.toml', 'Cargo.lock'], + 'docker': ['Dockerfile', 'docker-compose.yml', 'docker-compose.yaml'], + 'k8s': ['helm', 'kustomization.yaml', 'deployment.yaml'] + } + + self.intent_keywords = { + 'infrastructure': ['infra', 'deploy', 'k8s', 'docker', 'terraform', 'ansible'], + 'application': ['app', 'service', 'api', 'server', 'client'], + 'data': ['data', 'dataset', 'models', 'training', 'ml'], + 'documentation': ['docs', 'documentation', 'wiki', 'readme'], + 'testing': ['test', 'tests', 'spec', 'e2e', 'integration'], + 'build': ['build', 'dist', 'target', 'out', 'bin'], + 'config': ['config', 'conf', 'settings', 'env'] + } + + def analyze_folder(self, folder_path: Path, files: List[Dict]) -> Dict: + files_list = [Path(f['path']) for f in files] + + has_readme = any('readme' in f.name.lower() for f in files_list) + has_git = any('.git' in str(f) for f in files_list) + + manifest_types = self._detect_manifests(files_list) + has_manifest = len(manifest_types) > 0 + + file_types = Counter(f.suffix.lower() for f in files_list if f.suffix) + dominant_types = dict(file_types.most_common(10)) + + intent = self._infer_intent(folder_path.name.lower(), files_list) + project_type = self._infer_project_type(manifest_types, dominant_types) + + structure = { + 'depth': len(folder_path.parts), + 'has_src': any('src' in str(f) for f in files_list[:20]), + 'has_tests': any('test' in str(f) for f in files_list[:20]), + 'has_docs': any('doc' in str(f) for f in files_list[:20]) + } + + return { + 'has_readme': has_readme, + 'has_git': has_git, + 'has_manifest': 
has_manifest, + 'manifest_types': manifest_types, + 'dominant_file_types': dominant_types, + 'project_type': project_type, + 'intent': intent, + 'structure': structure + } + + def _detect_manifests(self, files: List[Path]) -> List[str]: + detected = [] + file_names = {f.name for f in files} + + for tech, manifests in self.manifest_files.items(): + if any(m in file_names for m in manifests): + detected.append(tech) + + return detected + + def _infer_intent(self, folder_name: str, files: List[Path]) -> str: + file_str = ' '.join(str(f) for f in files[:50]) + + for intent, keywords in self.intent_keywords.items(): + if any(kw in folder_name or kw in file_str.lower() for kw in keywords): + return intent + + return 'unknown' + + def _infer_project_type(self, manifests: List[str], file_types: Dict) -> str: + if manifests: + return manifests[0] + + if '.py' in file_types and file_types.get('.py', 0) > 5: + return 'python' + if '.js' in file_types or '.ts' in file_types: + return 'javascript' + if '.java' in file_types: + return 'java' + if '.go' in file_types: + return 'go' + + return 'mixed' + + def generate_summary(self, folder_analysis: Dict, readme_text: str = None) -> str: + parts = [] + + if folder_analysis.get('project_type'): + parts.append(f"{folder_analysis['project_type']} project") + + if folder_analysis.get('intent'): + parts.append(f"for {folder_analysis['intent']}") + + if folder_analysis.get('manifest_types'): + parts.append(f"using {', '.join(folder_analysis['manifest_types'])}") + + if readme_text: + first_para = readme_text.split('\n\n')[0][:200] + parts.append(f"Description: {first_para}") + + return ' '.join(parts) if parts else 'Mixed content folder' diff --git a/app/enrichment/enricher.py b/app/enrichment/enricher.py new file mode 100644 index 0000000..9185ce0 --- /dev/null +++ b/app/enrichment/enricher.py @@ -0,0 +1,59 @@ +from typing import Dict +import re + +class ContentEnricher: + def __init__(self, llm_client=None): + self.llm_client = llm_client + self.pii_patterns = { + 'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', + 'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', + 'ssn': r'\b\d{3}-\d{2}-\d{4}\b', + 'credit_card': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b' + } + + def enrich(self, text: str, use_llm: bool = False) -> Dict: + enrichment = { + 'summary': self._basic_summary(text), + 'word_count': len(text.split()), + 'has_pii': self._detect_pii(text), + 'quality': self._assess_quality(text), + 'topics': self._extract_basic_topics(text) + } + + if use_llm and self.llm_client: + llm_result = self.llm_client.classify_content(text) + if llm_result.get('success'): + enrichment['llm_classification'] = llm_result['text'] + + return enrichment + + def _basic_summary(self, text: str) -> str: + sentences = re.split(r'[.!?]+', text) + return ' '.join(sentences[:3])[:200] + + def _detect_pii(self, text: str) -> Dict: + detected = {} + for pii_type, pattern in self.pii_patterns.items(): + matches = re.findall(pattern, text) + if matches: + detected[pii_type] = len(matches) + return detected + + def _assess_quality(self, text: str) -> str: + if len(text.strip()) < 10: + return 'low' + + special_char_ratio = sum(1 for c in text if not c.isalnum() and not c.isspace()) / len(text) + if special_char_ratio > 0.3: + return 'low' + + return 'high' if len(text.split()) > 50 else 'medium' + + def _extract_basic_topics(self, text: str) -> list: + words = re.findall(r'\b[A-Z][a-z]+\b', text) + word_freq = {} + for word in words: + if len(word) > 3: + word_freq[word] = 
word_freq.get(word, 0) + 1 + + return sorted(word_freq, key=word_freq.get, reverse=True)[:10] diff --git a/app/enrichment/llm_client.py b/app/enrichment/llm_client.py new file mode 100644 index 0000000..526b4a4 --- /dev/null +++ b/app/enrichment/llm_client.py @@ -0,0 +1,54 @@ +import requests +import json +from typing import Dict, Optional + +class LLMClient: + def __init__(self, endpoint: str = 'http://192.168.1.74:1234', model: str = 'local'): + self.endpoint = endpoint + self.model = model + self.local_ollama = 'http://localhost:11434' + + def summarize(self, text: str, max_length: int = 200) -> Dict: + prompt = f"Summarize the following in {max_length} chars or less:\n\n{text[:2000]}" + return self._query(prompt) + + def extract_topics(self, text: str) -> Dict: + prompt = f"Extract 5-10 key topics/tags from this text. Return as comma-separated list:\n\n{text[:2000]}" + return self._query(prompt) + + def classify_content(self, text: str) -> Dict: + prompt = f"Classify this content. Return: category, topics, has_pii (yes/no), quality (high/medium/low):\n\n{text[:1000]}" + return self._query(prompt) + + def _query(self, prompt: str, use_local: bool = False) -> Dict: + try: + endpoint = self.local_ollama if use_local else self.endpoint + + if use_local: + response = requests.post( + f'{endpoint}/api/generate', + json={'model': 'llama3.2', 'prompt': prompt, 'stream': False}, + timeout=30 + ) + else: + response = requests.post( + f'{endpoint}/v1/chat/completions', + json={ + 'model': self.model, + 'messages': [{'role': 'user', 'content': prompt}], + 'max_tokens': 500 + }, + timeout=30 + ) + + if response.status_code == 200: + data = response.json() + if use_local: + return {'success': True, 'text': data.get('response', '')} + else: + return {'success': True, 'text': data['choices'][0]['message']['content']} + else: + return {'success': False, 'error': f'HTTP {response.status_code}'} + + except Exception as e: + return {'success': False, 'error': str(e)} diff --git a/app/main.py b/app/main.py index 545017c..5ac1937 100644 --- a/app/main.py +++ b/app/main.py @@ -27,7 +27,7 @@ class DiskReorganizer: def __init__(self, db_config: Dict=None): if db_config is None: - db_config = {'host': os.getenv('DB_HOST', '192.168.1.159'), 'port': int(os.getenv('DB_PORT', 5432)), 'database': os.getenv('DB_NAME', 'disk_reorganizer_db'), 'user': os.getenv('DB_USER', 'disk_reorg_user'), 'password': os.getenv('DB_PASSWORD', 'heel-goed-wachtwoord')} + db_config = {'host': os.getenv('DB_HOST', '192.168.1.159'), 'port': int(os.getenv('DB_PORT', 5432)), 'database': os.getenv('DB_NAME', 'disk_reorganizer_db'), 'user': os.getenv('DB_USER', 'auction'), 'password': os.getenv('DB_PASSWORD', 'heel-goed-wachtwoord')} self.db_config = db_config self.init_database() @@ -522,23 +522,126 @@ class DiskReorganizer: cursor.close() conn.close() - def classify_files(self, disk: Optional[str]=None, update_db: bool=False): + def parse_files(self, kind: Optional[str] = None, limit: int = 100, update_db: bool = False): + from parsers.text_parser import TextParser + from parsers.code_parser import CodeParser + from parsers.pdf_parser import PDFParser + + parsers = {'text': TextParser(), 'code': CodeParser(), 'pdf': PDFParser()} + disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'} + + conn = self.get_connection() + cursor = conn.cursor() + + try: + query = "SELECT path, size, disk_label FROM files WHERE 1=1" + params = [] + if kind: + suffix_map = {'text': "('.txt', '.md', '.log', '.json')", 
'code': "('.py', '.js', '.java', '.go')", 'pdf': "('.pdf',)"} + if kind in suffix_map: + query += f" AND RIGHT(path, 4) IN {suffix_map[kind]} OR RIGHT(path, 3) IN {suffix_map[kind]}" + query += f" LIMIT {limit}" + + cursor.execute(query, params) + files = cursor.fetchall() + + print(f"\n=== PARSING FILES ===\nProcessing {len(files)} files\n") + + parsed_count = 0 + for path, size, disk_label in files: + mount_point = disk_mount_map.get(disk_label, disk_label) + full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path) + + if not full_path.exists() or int(size) > 10 * 1024 * 1024: + continue + + file_kind = 'pdf' if path.endswith('.pdf') else 'code' if any(path.endswith(e) for e in ['.py', '.js', '.java']) else 'text' + parser = parsers.get(file_kind) + if not parser: + continue + + result = parser.parse(full_path) + if 'error' not in result: + text = result.get('text', '') + quality = result.get('quality', 'unknown') + print(f"{path[:60]} | {file_kind} | {len(text):,} chars") + + if update_db and text: + cursor.execute("UPDATE files SET extracted_text = %s, text_quality = %s WHERE path = %s", (text[:50000], quality, path)) + parsed_count += 1 + if parsed_count % 10 == 0: + conn.commit() + + if update_db: + conn.commit() + print(f"\nParsed {parsed_count} files") + + finally: + cursor.close() + conn.close() + + def enrich_files(self, limit: int = 10, llm_endpoint: str = None, use_local: bool = False): + from enrichment.enricher import ContentEnricher + + enricher = ContentEnricher() + conn = self.get_connection() + cursor = conn.cursor() + + try: + cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL LIMIT {limit}") + files = cursor.fetchall() + + print(f"\n=== ENRICHING CONTENT ===\nProcessing {len(files)} files\n") + + for path, text in files: + enrichment = enricher.enrich(text[:5000], use_llm=False) + print(f"{path[:60]}") + print(f" Quality: {enrichment.get('quality')} | Words: {enrichment.get('word_count'):,}") + print(f" PII: {list(enrichment.get('has_pii', {}).keys())}") + print(f" Topics: {', '.join(enrichment.get('topics', [])[:5])}\n") + + cursor.execute("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", (json.dumps(enrichment), path)) + + conn.commit() + print(f"Enriched {len(files)} files") + + finally: + cursor.close() + conn.close() + + def classify_files(self, disk: Optional[str]=None, update_db: bool=False, resume: bool=True): from classification.classifier import FileClassifier classifier = FileClassifier() conn = self.get_connection() cursor = conn.cursor() try: + task_name = f"classify_{disk or 'all'}" + skip_count = 0 + + if resume and update_db: + cursor.execute('SELECT last_processed_path, processed_count FROM processing_checkpoints WHERE task_name = %s', (task_name,)) + checkpoint = cursor.fetchone() + if checkpoint: + last_path, skip_count = checkpoint + logger.info(f'Resuming from checkpoint: {skip_count:,} files already processed') + if disk: - cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s', (disk,)) + cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s ORDER BY path', (disk,)) else: - cursor.execute('SELECT path, size, disk_label FROM files') + cursor.execute('SELECT path, size, disk_label FROM files ORDER BY path') files = cursor.fetchall() total = len(files) logger.info(f'Classifying {total:,} files...') + categories = {} build_artifacts = 0 batch = [] + processed = 0 + for idx, (path, size, disk_label) in enumerate(files, 1): + if 
idx <= skip_count: + continue + labels, category, is_build = classifier.classify_path(path, int(size)) if is_build: build_artifacts += 1 @@ -546,18 +649,40 @@ class DiskReorganizer: categories[category] = {'count': 0, 'size': 0} categories[category]['count'] += 1 categories[category]['size'] += int(size) + if update_db: labels_str = ','.join(labels) batch.append((category, labels_str, path)) + if len(batch) >= 1000: cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch]) + cursor.execute(''' + INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at) + VALUES (%s, %s, %s, CURRENT_TIMESTAMP) + ON CONFLICT (task_name) DO UPDATE SET + last_processed_path = EXCLUDED.last_processed_path, + processed_count = EXCLUDED.processed_count, + updated_at = CURRENT_TIMESTAMP + ''', (task_name, path, idx)) conn.commit() batch.clear() + + processed += 1 if idx % 1000 == 0: - print(f'\rClassified: {idx:,}/{total:,}', end='', flush=True) + print(f'\rClassified: {idx:,}/{total:,} ({100*idx/total:.1f}%)', end='', flush=True) + if update_db and batch: cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch]) + cursor.execute(''' + INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at) + VALUES (%s, %s, %s, CURRENT_TIMESTAMP) + ON CONFLICT (task_name) DO UPDATE SET + last_processed_path = EXCLUDED.last_processed_path, + processed_count = EXCLUDED.processed_count, + updated_at = CURRENT_TIMESTAMP + ''', (task_name, files[-1][0] if files else '', total)) conn.commit() + print() print(f'\n=== CLASSIFICATION SUMMARY ===') print(f'Total files: {total:,}') @@ -570,6 +695,99 @@ class DiskReorganizer: cursor.close() conn.close() + def analyze_folders(self, disk: Optional[str]=None, min_files: int=3): + from analysis.folder_analyzer import FolderAnalyzer + analyzer = FolderAnalyzer() + conn = self.get_connection() + cursor = conn.cursor() + try: + query = ''' + SELECT DISTINCT SUBSTRING(path FROM 1 FOR POSITION('/' IN path || '/') - 1) as folder, disk_label + FROM files + WHERE 1=1 + ''' + params = [] + if disk: + query += ' AND disk_label = %s' + params.append(disk) + + cursor.execute(query, params) + potential_folders = cursor.fetchall() + + logger.info(f'Found {len(potential_folders)} potential folders to analyze') + + processed = 0 + for folder_name, disk_label in potential_folders: + cursor.execute(''' + SELECT path, size FROM files + WHERE disk_label = %s AND path LIKE %s + ''', (disk_label, f'{folder_name}%')) + + files = cursor.fetchall() + if len(files) < min_files: + continue + + files_list = [{'path': f[0], 'size': int(f[1])} for f in files] + folder_path = Path(folder_name) + + analysis = analyzer.analyze_folder(folder_path, files_list) + + readme_text = None + for file_dict in files_list: + if 'readme' in file_dict['path'].lower(): + readme_text = f"Found README at {file_dict['path']}" + break + + summary = analyzer.generate_summary(analysis, readme_text) + + cursor.execute(''' + INSERT INTO folders (path, disk_label, file_count, total_size, project_type, intent, summary, + has_readme, has_git, has_manifest, manifest_types, dominant_file_types, structure) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ON CONFLICT (path) DO UPDATE SET + file_count = EXCLUDED.file_count, + total_size = EXCLUDED.total_size, + project_type = EXCLUDED.project_type, + intent = EXCLUDED.intent, + summary = EXCLUDED.summary, + has_readme = 
EXCLUDED.has_readme, + has_git = EXCLUDED.has_git, + has_manifest = EXCLUDED.has_manifest, + manifest_types = EXCLUDED.manifest_types, + dominant_file_types = EXCLUDED.dominant_file_types, + structure = EXCLUDED.structure, + updated_at = CURRENT_TIMESTAMP + ''', ( + str(folder_path), disk_label, len(files_list), sum(f['size'] for f in files_list), + analysis.get('project_type'), analysis.get('intent'), summary, + analysis.get('has_readme'), analysis.get('has_git'), analysis.get('has_manifest'), + analysis.get('manifest_types'), json.dumps(analysis.get('dominant_file_types', {})), + json.dumps(analysis.get('structure', {})) + )) + + processed += 1 + if processed % 100 == 0: + conn.commit() + print(f'\rAnalyzed: {processed} folders', end='', flush=True) + + conn.commit() + print() + logger.info(f'Completed folder analysis: {processed} folders') + + cursor.execute(''' + SELECT project_type, COUNT(*), SUM(file_count), SUM(total_size) + FROM folders + GROUP BY project_type + ''') + print(f'\n=== FOLDER ANALYSIS SUMMARY ===') + for row in cursor.fetchall(): + proj_type, count, files, size = row + print(f'{proj_type:20}: {count:6,} folders, {files:8,} files, {self.format_size(int(size or 0))}') + + finally: + cursor.close() + conn.close() + def review_migration(self, category: Optional[str]=None, show_build: bool=False): from classification.classifier import FileClassifier classifier = FileClassifier() @@ -640,9 +858,24 @@ def main(): extract_parser = subparsers.add_parser('extract', help='Extract content from files') extract_parser.add_argument('--kind', help='Extract specific kind (pdf, image, audio, video)') extract_parser.add_argument('--limit', type=int, default=10, help='Limit extraction batch') + + parse_parser = subparsers.add_parser('parse', help='Parse files to extract text') + parse_parser.add_argument('--kind', help='Parse specific kind (text, code, pdf)') + parse_parser.add_argument('--limit', type=int, default=100, help='Limit parse batch') + parse_parser.add_argument('--update', action='store_true', help='Save extracted text to database') + + enrich_parser = subparsers.add_parser('enrich', help='Enrich content with LLM analysis') + enrich_parser.add_argument('--limit', type=int, default=10, help='Limit enrichment batch') + enrich_parser.add_argument('--llm-endpoint', default='http://192.168.1.74:1234', help='LLM endpoint') + enrich_parser.add_argument('--local', action='store_true', help='Use local Ollama') + classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization') classify_parser.add_argument('--disk', help='Classify specific disk') classify_parser.add_argument('--update', action='store_true', help='Update database with classifications') + classify_parser.add_argument('--no-resume', action='store_true', help='Start from scratch instead of resuming') + folders_parser = subparsers.add_parser('analyze-folders', help='Analyze folder structure and infer project intent') + folders_parser.add_argument('--disk', help='Analyze specific disk') + folders_parser.add_argument('--min-files', type=int, default=3, help='Minimum files per folder') review_parser = subparsers.add_parser('review', help='Review proposed migration structure') review_parser.add_argument('--category', help='Review specific category') review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts') @@ -669,8 +902,14 @@ def main(): tool.profile_content(disk=args.disk, update_db=args.update, limit=args.limit) elif args.command == 'extract': 
tool.extract_content(kind=args.kind, limit=args.limit) + elif args.command == 'parse': + tool.parse_files(kind=args.kind, limit=args.limit, update_db=args.update) + elif args.command == 'enrich': + tool.enrich_files(limit=args.limit, llm_endpoint=args.llm_endpoint, use_local=args.local) elif args.command == 'classify': - tool.classify_files(disk=args.disk, update_db=args.update) + tool.classify_files(disk=args.disk, update_db=args.update, resume=not args.no_resume) + elif args.command == 'analyze-folders': + tool.analyze_folders(disk=args.disk, min_files=args.min_files) elif args.command == 'review': tool.review_migration(category=args.category, show_build=args.show_build) elif args.command == 'report': diff --git a/app/parsers/code_parser.py b/app/parsers/code_parser.py new file mode 100644 index 0000000..8a27ae0 --- /dev/null +++ b/app/parsers/code_parser.py @@ -0,0 +1,44 @@ +from pathlib import Path +from typing import Dict +import re + +class CodeParser: + def __init__(self): + self.patterns = { + 'python': {'imports': r'^import |^from .+ import', 'class': r'^class \w+', 'function': r'^def \w+'}, + 'javascript': {'imports': r'^import |^require\(', 'class': r'^class \w+', 'function': r'^function \w+|^const \w+ = '}, + 'java': {'package': r'^package ', 'imports': r'^import ', 'class': r'^public class \w+'}, + 'go': {'package': r'^package ', 'imports': r'^import ', 'function': r'^func \w+'} + } + + def parse(self, file_path: Path) -> Dict: + try: + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + text = f.read() + + language = self._detect_language(file_path, text) + structure = self._extract_structure(text, language) + + return { + 'text': text, + 'language': language, + 'line_count': len(text.split('\n')), + 'structure': structure, + 'quality': 'high' + } + except Exception as e: + return {'error': str(e)} + + def _detect_language(self, file_path: Path, text: str) -> str: + lang_map = {'.py': 'python', '.js': 'javascript', '.ts': 'typescript', '.java': 'java', '.go': 'go'} + return lang_map.get(file_path.suffix.lower(), 'unknown') + + def _extract_structure(self, text: str, language: str) -> Dict: + patterns = self.patterns.get(language, {}) + structure = {'type': 'code', 'language': language} + + for key, pattern in patterns.items(): + matches = re.findall(pattern, text, re.MULTILINE) + structure[key] = len(matches) + + return structure diff --git a/app/parsers/media_parser.py b/app/parsers/media_parser.py new file mode 100644 index 0000000..1384d59 --- /dev/null +++ b/app/parsers/media_parser.py @@ -0,0 +1,42 @@ +from pathlib import Path +from typing import Dict + +class MediaParser: + def parse_audio(self, file_path: Path) -> Dict: + return { + 'text': '[Audio transcription pending]', + 'needs_transcription': True, + 'transcription_service': 'whisper', + 'structure': {'type': 'audio'}, + 'quality': 'pending' + } + + def parse_video(self, file_path: Path) -> Dict: + return { + 'text': '[Video transcription pending]', + 'needs_transcription': True, + 'needs_scene_detection': True, + 'transcription_service': 'whisper', + 'structure': {'type': 'video'}, + 'quality': 'pending' + } + + def parse_image(self, file_path: Path) -> Dict: + try: + from PIL import Image + + with Image.open(file_path) as img: + width, height = img.size + mode = img.mode + + return { + 'text': '[Image caption/OCR pending]', + 'needs_ocr': True, + 'needs_caption': True, + 'dimensions': f'{width}x{height}', + 'mode': mode, + 'structure': {'type': 'image', 'width': width, 'height': height}, + 
'quality': 'pending' + } + except Exception as e: + return {'error': str(e)} diff --git a/app/parsers/pdf_parser.py b/app/parsers/pdf_parser.py new file mode 100644 index 0000000..55f1113 --- /dev/null +++ b/app/parsers/pdf_parser.py @@ -0,0 +1,31 @@ +from pathlib import Path +from typing import Dict, List + +class PDFParser: + def parse(self, file_path: Path) -> Dict: + try: + import PyPDF2 + + pages = [] + with open(file_path, 'rb') as f: + pdf = PyPDF2.PdfReader(f) + page_count = len(pdf.pages) + + for i, page in enumerate(pdf.pages[:50]): + text = page.extract_text() + pages.append({'page': i + 1, 'text': text, 'char_count': len(text)}) + + full_text = '\n\n'.join([p['text'] for p in pages]) + has_text_layer = sum(p['char_count'] for p in pages) > 100 + + return { + 'text': full_text, + 'page_count': page_count, + 'pages_extracted': len(pages), + 'has_text_layer': has_text_layer, + 'needs_ocr': not has_text_layer, + 'structure': {'type': 'document', 'pages': pages[:5]}, + 'quality': 'high' if has_text_layer else 'needs_ocr' + } + except Exception as e: + return {'error': str(e), 'needs_ocr': True} diff --git a/app/parsers/text_parser.py b/app/parsers/text_parser.py new file mode 100644 index 0000000..7a17589 --- /dev/null +++ b/app/parsers/text_parser.py @@ -0,0 +1,26 @@ +from pathlib import Path +from typing import Dict, Optional +import chardet + +class TextParser: + def parse(self, file_path: Path) -> Dict: + try: + with open(file_path, 'rb') as f: + raw_data = f.read(1024 * 1024) + + encoding = chardet.detect(raw_data)['encoding'] or 'utf-8' + text = raw_data.decode(encoding, errors='ignore') + + lines = text.split('\n') + + return { + 'text': text, + 'encoding': encoding, + 'line_count': len(lines), + 'char_count': len(text), + 'word_count': len(text.split()), + 'structure': {'type': 'plain_text'}, + 'quality': 'high' if encoding == 'utf-8' else 'medium' + } + except Exception as e: + return {'error': str(e)} diff --git a/docker-compose.yml b/docker-compose.yml index 87b8696..7754528 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -232,6 +232,37 @@ services: networks: - defrag-network + flyway: + image: flyway/flyway:latest + container_name: flyway + volumes: + - ./sql/migration:/flyway/sql:ro + environment: + FLYWAY_URL: jdbc:postgresql://192.168.1.159:5432/disk_reorganizer_db + FLYWAY_USER: disk_reorg_user + FLYWAY_PASSWORD: heel-goed-wachtwoord + FLYWAY_SCHEMAS: public + FLYWAY_LOCATIONS: filesystem:./sql + FLYWAY_CONNECT_RETRIES: "60" + command: migrate + restart: "no" + + pg_backup: + image: postgres:16 + container_name: pg_backup + environment: + PGPASSWORD: heel-goed-wachtwoord + volumes: + - ./:/backup + command: + - bash + - -lc + - > + pg_dump -h 192.168.1.159 -p 5432 -U disk_reorg_user -d disk_reorganizer_db + --format=custom --no-owner --no-privileges + -f /backup/backup_$(date +%F_%H%M)_disk_reorganizer_db.dump + restart: "no" + networks: defrag-network: driver: bridge diff --git a/flyway.conf b/flyway.conf new file mode 100644 index 0000000..6d73888 --- /dev/null +++ b/flyway.conf @@ -0,0 +1,7 @@ +flyway.url=jdbc:postgresql://192.168.1.159:5432/disk_reorganizer_db +flyway.user=disk_org_user +flyway.password=heel-goed-wachtwoord + +flyway.locations=filesystem:sql/migration +flyway.schemas=public + diff --git a/requirements.txt b/requirements.txt index f838a8a..dc2b5b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,3 +37,5 @@ pytest-cov>=4.0.0 black>=23.0.0 mypy>=1.0.0 flake8>=6.0.0 + +chardet \ No newline at end of file diff --git 
a/sql/init.sql b/sql/init.sql deleted file mode 100644 index 58266c7..0000000 --- a/sql/init.sql +++ /dev/null @@ -1,176 +0,0 @@ --- sql/init.sql --- Initialize PostgreSQL database for Project Defrag - --- Enable useful extensions -CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; -CREATE EXTENSION IF NOT EXISTS "pgcrypto"; - --- Files table -CREATE TABLE IF NOT EXISTS files ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - path TEXT NOT NULL, - size BIGINT NOT NULL, - modified_time TIMESTAMP WITH TIME ZONE, - created_time TIMESTAMP WITH TIME ZONE, - file_hash VARCHAR(64), -- SHA-256 hash - checksum VARCHAR(64), -- Alias for file_hash (legacy compatibility) - category VARCHAR(50), - disk_label VARCHAR(50), - last_verified TIMESTAMP WITH TIME ZONE, - status VARCHAR(20) DEFAULT 'indexed', - duplicate_of TEXT, -- Path to canonical file if this is a duplicate - - -- Metadata - metadata JSONB DEFAULT '{}', - - -- Audit fields - created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, - - -- Constraints - CONSTRAINT unique_file_path UNIQUE(path) -); - --- Operations table (audit log) -CREATE TABLE IF NOT EXISTS operations ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - operation_type VARCHAR(50) NOT NULL, - source_path TEXT, - target_path TEXT, - status VARCHAR(20) NOT NULL, - - -- Legacy compatibility fields - executed INTEGER DEFAULT 0, - verified INTEGER DEFAULT 0, - error TEXT, - - -- File reference - file_id UUID REFERENCES files(id) ON DELETE SET NULL, - - -- Performance metrics - duration_ms INTEGER, - bytes_processed BIGINT, - - -- Error information - error_message TEXT, - error_details JSONB, - - -- Context - session_id VARCHAR(100), - user_agent TEXT, - - -- Audit fields - started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, - completed_at TIMESTAMP WITH TIME ZONE, - executed_at TIMESTAMP WITH TIME ZONE, - created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP -); - --- Deduplication hash store -CREATE TABLE IF NOT EXISTS deduplication_store ( - hash VARCHAR(64) PRIMARY KEY, - canonical_path TEXT NOT NULL, - reference_count INTEGER DEFAULT 1, - first_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, - last_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP -); - --- Migration plan table -CREATE TABLE IF NOT EXISTS migration_plans ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - name VARCHAR(100) NOT NULL, - source_disk VARCHAR(50) NOT NULL, - target_disk VARCHAR(50) NOT NULL, - plan_json JSONB NOT NULL, - - -- Statistics - total_files INTEGER DEFAULT 0, - total_size BIGINT DEFAULT 0, - estimated_duration INTEGER, -- in seconds - - -- Status - status VARCHAR(20) DEFAULT 'draft', - - -- Audit - created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, - executed_at TIMESTAMP WITH TIME ZONE, - completed_at TIMESTAMP WITH TIME ZONE -); - --- Indexes for performance -CREATE INDEX IF NOT EXISTS idx_files_path ON files (path); -CREATE INDEX IF NOT EXISTS idx_files_hash ON files (file_hash); -CREATE INDEX IF NOT EXISTS idx_files_disk ON files (disk_label); -CREATE INDEX IF NOT EXISTS idx_files_category ON files (category); -CREATE INDEX IF NOT EXISTS idx_files_status ON files (status); -create index on files (checksum); -create index on files (checksum,path); - -CREATE INDEX IF NOT EXISTS idx_operations_status ON operations(status); -CREATE INDEX IF NOT EXISTS idx_operations_created ON operations(created_at); -CREATE INDEX IF NOT EXISTS idx_operations_file_id ON operations(file_id); - 
-CREATE INDEX IF NOT EXISTS idx_dedup_canonical ON deduplication_store(canonical_path); - --- Functions for updating timestamps -CREATE OR REPLACE FUNCTION update_updated_at_column() -RETURNS TRIGGER AS $$ -BEGIN - NEW.updated_at = CURRENT_TIMESTAMP; - RETURN NEW; -END; -$$ language 'plpgsql'; - --- Triggers for automatic updated_at -CREATE TRIGGER update_files_updated_at BEFORE UPDATE ON files - FOR EACH ROW EXECUTE FUNCTION update_updated_at_column(); - --- View for operational dashboard -CREATE OR REPLACE VIEW operational_dashboard AS -SELECT - o.status, - COUNT(*) as operation_count, - SUM(o.bytes_processed) as total_bytes, - AVG(o.duration_ms) as avg_duration_ms, - MIN(o.started_at) as earliest_operation, - MAX(o.completed_at) as latest_operation -FROM operations o -WHERE o.started_at > CURRENT_TIMESTAMP - INTERVAL '24 hours' -GROUP BY o.status; - --- View for disk usage statistics -CREATE OR REPLACE VIEW disk_usage_stats AS -SELECT - disk_label, - COUNT(*) as file_count, - SUM(size) as total_size, - AVG(size) as avg_file_size, - MIN(created_time) as oldest_file, - MAX(modified_time) as newest_file -FROM files -GROUP BY disk_label; - --- Insert default configuration -INSERT INTO migration_plans (name, source_disk, target_disk, plan_json, status) -VALUES ( - 'Default Migration Plan', - 'disk_d', - 'disk_e', - '{"strategy": "hardlink", "verify_copies": true, "preserve_timestamps": true}'::jsonb, - 'draft' -) ON CONFLICT DO NOTHING; - --- Create read-only user for monitoring -DO $$ -BEGIN - IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'monitor_user') THEN - CREATE USER monitor_user WITH PASSWORD 'monitor_password'; - END IF; -END -$$; - -GRANT CONNECT ON DATABASE disk_reorganizer_db TO monitor_user; -GRANT USAGE ON SCHEMA public TO monitor_user; -GRANT SELECT ON ALL TABLES IN SCHEMA public TO monitor_user; -GRANT SELECT ON operational_dashboard TO monitor_user; -GRANT SELECT ON disk_usage_stats TO monitor_user; \ No newline at end of file diff --git a/sql/migration/V001__init.sql b/sql/migration/V001__init.sql new file mode 100644 index 0000000..ee1366f --- /dev/null +++ b/sql/migration/V001__init.sql @@ -0,0 +1,188 @@ +-- sql/init.sql +-- Initialize PostgreSQL database for Project Defrag + +-- Enable useful extensions +CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; +CREATE EXTENSION IF NOT EXISTS "pgcrypto"; +-- future tables/sequences created by your owner role (pick the role that creates them) +ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public + GRANT ALL PRIVILEGES ON TABLES TO disk_reorg_user; + +ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public + GRANT ALL PRIVILEGES ON SEQUENCES TO disk_reorg_user; +ALTER DATABASE disk_reorganizer_db OWNER TO disk_reorg_user; +-- Files table +CREATE TABLE IF NOT EXISTS files +( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + path TEXT NOT NULL, + size BIGINT NOT NULL, + modified_time TIMESTAMP WITH TIME ZONE, + created_time TIMESTAMP WITH TIME ZONE, + file_hash VARCHAR(64), -- SHA-256 hash + checksum VARCHAR(64), -- Alias for file_hash (legacy compatibility) + category VARCHAR(50), + disk_label VARCHAR(50), + last_verified TIMESTAMP WITH TIME ZONE, + status VARCHAR(20) DEFAULT 'indexed', + duplicate_of TEXT, -- Path to canonical file if this is a duplicate + + -- Metadata + metadata JSONB DEFAULT '{}', + + -- Audit fields + created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + + -- Constraints + CONSTRAINT unique_file_path UNIQUE (path) 
+); + +-- Operations table (audit log) +CREATE TABLE IF NOT EXISTS operations +( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + operation_type VARCHAR(50) NOT NULL, + source_path TEXT, + target_path TEXT, + status VARCHAR(20) NOT NULL, + + -- Legacy compatibility fields + executed INTEGER DEFAULT 0, + verified INTEGER DEFAULT 0, + error TEXT, + + -- File reference + file_id UUID REFERENCES files (id) ON DELETE SET NULL, + + -- Performance metrics + duration_ms INTEGER, + bytes_processed BIGINT, + + -- Error information + error_message TEXT, + error_details JSONB, + + -- Context + session_id VARCHAR(100), + user_agent TEXT, + + -- Audit fields + started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + completed_at TIMESTAMP WITH TIME ZONE, + executed_at TIMESTAMP WITH TIME ZONE, + created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP +); + +-- Deduplication hash store +CREATE TABLE IF NOT EXISTS deduplication_store +( + hash VARCHAR(64) PRIMARY KEY, + canonical_path TEXT NOT NULL, + reference_count INTEGER DEFAULT 1, + first_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + last_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP +); + +-- Migration plan table +CREATE TABLE IF NOT EXISTS migration_plans +( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + name VARCHAR(100) NOT NULL, + source_disk VARCHAR(50) NOT NULL, + target_disk VARCHAR(50) NOT NULL, + plan_json JSONB NOT NULL, + + -- Statistics + total_files INTEGER DEFAULT 0, + total_size BIGINT DEFAULT 0, + estimated_duration INTEGER, -- in seconds + + -- Status + status VARCHAR(20) DEFAULT 'draft', + + -- Audit + created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + executed_at TIMESTAMP WITH TIME ZONE, + completed_at TIMESTAMP WITH TIME ZONE +); + +-- Indexes for performance +CREATE INDEX IF NOT EXISTS idx_files_path ON files (path); +CREATE INDEX IF NOT EXISTS idx_files_hash ON files (file_hash); +CREATE INDEX IF NOT EXISTS idx_files_disk ON files (disk_label); +CREATE INDEX IF NOT EXISTS idx_files_category ON files (category); +CREATE INDEX IF NOT EXISTS idx_files_status ON files (status); +create index on files (checksum); +create index on files (checksum, path); + +CREATE INDEX IF NOT EXISTS idx_operations_status ON operations (status); +CREATE INDEX IF NOT EXISTS idx_operations_created ON operations (created_at); +CREATE INDEX IF NOT EXISTS idx_operations_file_id ON operations (file_id); + +CREATE INDEX IF NOT EXISTS idx_dedup_canonical ON deduplication_store (canonical_path); + +-- Functions for updating timestamps +CREATE OR REPLACE FUNCTION update_updated_at_column() + RETURNS TRIGGER AS +$$ +BEGIN + NEW.updated_at = CURRENT_TIMESTAMP; + RETURN NEW; +END; +$$ language 'plpgsql'; + +-- Triggers for automatic updated_at +CREATE TRIGGER update_files_updated_at + BEFORE UPDATE + ON files + FOR EACH ROW +EXECUTE FUNCTION update_updated_at_column(); + +-- View for operational dashboard +CREATE OR REPLACE VIEW operational_dashboard AS +SELECT o.status, + COUNT(*) as operation_count, + SUM(o.bytes_processed) as total_bytes, + AVG(o.duration_ms) as avg_duration_ms, + MIN(o.started_at) as earliest_operation, + MAX(o.completed_at) as latest_operation +FROM operations o +WHERE o.started_at > CURRENT_TIMESTAMP - INTERVAL '24 hours' +GROUP BY o.status; + +-- View for disk usage statistics +CREATE OR REPLACE VIEW disk_usage_stats AS +SELECT disk_label, + COUNT(*) as file_count, + SUM(size) as total_size, + AVG(size) as avg_file_size, + MIN(created_time) as oldest_file, + MAX(modified_time) as 
newest_file +FROM files +GROUP BY disk_label; + +-- Insert default configuration +INSERT INTO migration_plans (name, source_disk, target_disk, plan_json, status) +VALUES ('Default Migration Plan', + 'disk_d', + 'disk_e', + '{"strategy": "hardlink", "verify_copies": true, "preserve_timestamps": true}'::jsonb, + 'draft') +ON CONFLICT DO NOTHING; + +-- Create read-only user for monitoring +DO +$$ + BEGIN + IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'monitor_user') THEN + CREATE USER monitor_user WITH PASSWORD 'monitor_password'; + END IF; + END +$$; + +GRANT CONNECT ON DATABASE disk_reorganizer_db TO monitor_user; +GRANT USAGE ON SCHEMA public TO monitor_user; +GRANT SELECT ON ALL TABLES IN SCHEMA public TO monitor_user; +GRANT SELECT ON operational_dashboard TO monitor_user; +GRANT SELECT ON disk_usage_stats TO monitor_user; \ No newline at end of file diff --git a/sql/migration/V002__add_extracted_text.sql b/sql/migration/V002__add_extracted_text.sql new file mode 100644 index 0000000..a3e03ad --- /dev/null +++ b/sql/migration/V002__add_extracted_text.sql @@ -0,0 +1,11 @@ +-- Add extracted text and enrichment columns +ALTER TABLE files ADD COLUMN IF NOT EXISTS extracted_text TEXT; +ALTER TABLE files ADD COLUMN IF NOT EXISTS text_quality VARCHAR(20); +ALTER TABLE files ADD COLUMN IF NOT EXISTS enrichment JSONB; + +-- Add indexes for text search +CREATE INDEX IF NOT EXISTS idx_files_extracted_text ON files USING gin(to_tsvector('english', extracted_text)); +CREATE INDEX IF NOT EXISTS idx_files_enrichment ON files USING gin(enrichment); + +-- Add full text search capability +CREATE INDEX IF NOT EXISTS idx_files_fts ON files USING gin(to_tsvector('english', COALESCE(extracted_text, ''))); diff --git a/sql/migration/V003__add_folder_support.sql b/sql/migration/V003__add_folder_support.sql new file mode 100644 index 0000000..210bcb0 --- /dev/null +++ b/sql/migration/V003__add_folder_support.sql @@ -0,0 +1,41 @@ +CREATE TABLE IF NOT EXISTS folders +( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + path TEXT NOT NULL UNIQUE, + parent_path TEXT, + disk_label VARCHAR(50), + + file_count INT DEFAULT 0, + total_size BIGINT DEFAULT 0, + + project_type VARCHAR(50), + intent TEXT, + summary TEXT, + + has_readme BOOLEAN DEFAULT FALSE, + has_git BOOLEAN DEFAULT FALSE, + has_manifest BOOLEAN DEFAULT FALSE, + manifest_types TEXT[], + dominant_file_types JSONB, + + structure JSONB, + + created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP +); + +CREATE INDEX IF NOT EXISTS idx_folders_path ON folders (path); +CREATE INDEX IF NOT EXISTS idx_folders_parent ON folders (parent_path); +CREATE INDEX IF NOT EXISTS idx_folders_disk ON folders (disk_label); +CREATE INDEX IF NOT EXISTS idx_folders_project_type ON folders (project_type); + +CREATE TABLE IF NOT EXISTS processing_checkpoints +( + task_name VARCHAR(100) PRIMARY KEY, + last_processed_id TEXT, + last_processed_path TEXT, + processed_count INT DEFAULT 0, + total_count INT, + started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP +); diff --git a/sql/setup_database.sql b/sql/setup_database.sql index 0745998..27b7992 100644 --- a/sql/setup_database.sql +++ b/sql/setup_database.sql @@ -19,54 +19,27 @@ CREATE DATABASE disk_reorganizer_db CREATE USER disk_reorg_user WITH PASSWORD 'heel-goed-wachtwoord'; -- Create files table -CREATE TABLE IF NOT EXISTS files ( - path TEXT PRIMARY KEY, - size BIGINT 
NOT NULL, - modified_time DOUBLE PRECISION NOT NULL, - disk_label TEXT NOT NULL, - checksum TEXT, - status TEXT DEFAULT 'indexed', - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP -); -- Create index on disk column for faster queries -CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label); -CREATE INDEX IF NOT EXISTS idx_files_status ON files(status); --- Create operations table -CREATE TABLE IF NOT EXISTS operations ( - id SERIAL PRIMARY KEY, - source_path TEXT NOT NULL, - target_path TEXT NOT NULL, - operation_type TEXT NOT NULL, - executed INTEGER DEFAULT 0, - verified INTEGER DEFAULT 0, - error TEXT, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - executed_at TIMESTAMP -); - --- Create index on operations for faster lookups -CREATE INDEX IF NOT EXISTS idx_operations_executed ON operations(executed); -CREATE INDEX IF NOT EXISTS idx_operations_source ON operations(source_path); -- Grant privileges to disk_reorg_user GRANT CONNECT ON DATABASE disk_reorganizer_db TO disk_reorg_user; GRANT USAGE ON SCHEMA public TO disk_reorg_user; -GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO disk_reorg_user; +GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO disk_reorg_user; GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO disk_reorg_user; -- future tables/sequences created by your owner role (pick the role that creates them) ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public - GRANT ALL PRIVILEGES ON TABLES TO disk_reorg_user; + GRANT ALL PRIVILEGES ON TABLES TO disk_reorg_user; ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public - GRANT ALL PRIVILEGES ON SEQUENCES TO disk_reorg_user; + GRANT ALL PRIVILEGES ON SEQUENCES TO disk_reorg_user; -- Create function to update updated_at timestamp CREATE OR REPLACE FUNCTION update_updated_at_column() -RETURNS TRIGGER AS $$ + RETURNS TRIGGER AS +$$ BEGIN NEW.updated_at = CURRENT_TIMESTAMP; RETURN NEW; @@ -75,9 +48,10 @@ $$ LANGUAGE plpgsql; -- Create trigger for files table CREATE TRIGGER update_files_updated_at - BEFORE UPDATE ON files + BEFORE UPDATE + ON files FOR EACH ROW - EXECUTE FUNCTION update_updated_at_column(); +EXECUTE FUNCTION update_updated_at_column(); -- Display success message \echo 'Database setup completed successfully!'
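
Note on the suffix filter added to parse_files in app/main.py: the generated clause compares RIGHT(path, 4) and RIGHT(path, 3) against the same literal tuples, so five-character suffixes such as '.json' and '.java' never match, the single-element tuple "('.pdf',)" is not valid SQL, and the un-parenthesised OR escapes any further conditions appended to the WHERE clause. Below is a minimal sketch of a parameterised alternative built on LIKE; psycopg2-style %s placeholders are assumed (matching the rest of the module), and the helper name is illustrative, not part of the patch:

    from typing import Optional

    # Suffixes per kind, mirroring the map in parse_files.
    SUFFIXES = {
        'text': ('.txt', '.md', '.log', '.json'),
        'code': ('.py', '.js', '.java', '.go'),
        'pdf': ('.pdf',),
    }

    def build_parse_query(kind: Optional[str], limit: int):
        """Return (sql, params) selecting candidate files for parsing."""
        sql = "SELECT path, size, disk_label FROM files WHERE 1=1"
        params: list = []
        if kind in SUFFIXES:
            # One placeholder per suffix; the OR group is parenthesised so it stays
            # ANDed with anything else added to the WHERE clause.
            placeholders = " OR ".join("path LIKE %s" for _ in SUFFIXES[kind])
            sql += f" AND ({placeholders})"
            params.extend(f"%{suffix}" for suffix in SUFFIXES[kind])
        sql += " LIMIT %s"
        params.append(limit)
        return sql, params

    # Usage: cursor.execute(*build_parse_query(kind, limit))

Matching on path LIKE '%.ext' sidesteps the fixed-width RIGHT() comparison and keeps the filter correct for suffixes of any length. Separately, flyway.conf sets flyway.user=disk_org_user while docker-compose.yml and the V001 grants use disk_reorg_user; if that difference is unintentional, the standalone Flyway run will fail to authenticate.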