diff --git a/app/classification/__init__.py b/app/classification/__init__.py
index 04fee6d..4b32f89 100644
--- a/app/classification/__init__.py
+++ b/app/classification/__init__.py
@@ -1,17 +1,3 @@
-"""Classification package exports"""
-from .rules import RuleBasedClassifier
-from .ml import create_ml_classifier, train_from_database, MLClassifier, DummyMLClassifier
-from .engine import ClassificationEngine
-from ._protocols import ClassificationRule, IClassifier, IRuleEngine
+from .classifier import FileClassifier
-__all__ = [
-    'RuleBasedClassifier',
-    'MLClassifier',
-    'DummyMLClassifier',
-    'create_ml_classifier',
-    'train_from_database',
-    'ClassificationEngine',
-    'ClassificationRule',
-    'IClassifier',
-    'IRuleEngine',
-]
+__all__ = ['FileClassifier']
diff --git a/app/classification/classifier.py b/app/classification/classifier.py
new file mode 100644
index 0000000..f870c3f
--- /dev/null
+++ b/app/classification/classifier.py
@@ -0,0 +1,124 @@
+from pathlib import Path
+from typing import List, Set, Dict, Tuple
+import re
+
+class FileClassifier:
+    def __init__(self):
+        self.build_patterns = {
+            'node_modules', '__pycache__', '.pytest_cache', 'target', 'build', 'dist',
+            '.gradle', 'bin', 'obj', '.next', '.nuxt', 'vendor', '.venv', 'venv',
+            'site-packages', 'bower_components', 'jspm_packages'
+        }
+
+        self.artifact_patterns = {
+            'java': {'.jar', '.war', '.ear', '.class'},
+            'python': {'.pyc', '.pyo', '.whl', '.egg'},
+            'node': {'node_modules'},
+            'go': {'vendor', 'pkg'},
+            'rust': {'target'},
+            'docker': {'.dockerignore', 'Dockerfile'}
+        }
+
+        self.category_keywords = {
+            'apps': {'app', 'application', 'service', 'api', 'server', 'client'},
+            'infra': {'infrastructure', 'devops', 'docker', 'kubernetes', 'terraform', 'ansible', 'gitea', 'jenkins'},
+            'dev': {'project', 'workspace', 'repo', 'src', 'code', 'dev'},
+            'cache': {'cache', 'temp', 'tmp', '.cache'},
+            'databases': {'postgres', 'mysql', 'redis', 'mongo', 'db', 'database'},
+            'backups': {'backup', 'bak', 'snapshot', 'archive'},
+            'user': {'documents', 'pictures', 'videos', 'downloads', 'desktop', 'music'},
+            'artifacts': {'build', 'dist', 'release', 'output'},
+            'temp': {'tmp', 'temp', 'staging', 'processing'}
+        }
+
+        self.media_extensions = {
+            'video': {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv', '.webm'},
+            'audio': {'.mp3', '.flac', '.wav', '.ogg', '.m4a', '.aac'},
+            'image': {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'},
+            'document': {'.pdf', '.doc', '.docx', '.txt', '.md', '.odt'},
+            'spreadsheet': {'.xls', '.xlsx', '.csv', '.ods'},
+            'presentation': {'.ppt', '.pptx', '.odp'}
+        }
+
+        self.code_extensions = {
+            '.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h',
+            '.cs', '.rb', '.php', '.swift', '.kt', '.scala', '.clj', '.r'
+        }
+
+    def classify_path(self, path: str, size: int = 0) -> Tuple[Set[str], str, bool]:
+        p = Path(path)
+        labels = set()
+        primary_category = 'misc'
+        is_build_artifact = False
+
+        parts = p.parts
+        name_lower = p.name.lower()
+
+        for part in parts:
+            part_lower = part.lower()
+            if part_lower in self.build_patterns:
+                is_build_artifact = True
+                labels.add('build-artifact')
+                break
+
+        if is_build_artifact:
+            for artifact_type, patterns in self.artifact_patterns.items():
+                if any(part.lower() in patterns for part in parts) or p.suffix in patterns:
+                    primary_category = f'artifacts/{artifact_type}'
+                    labels.add('artifact')
+                    return labels, primary_category, is_build_artifact
+
+        if '.git' in parts:
+            labels.add('vcs')
+            primary_category = 'infra/git-infrastructure'
+            return labels, primary_category, False
+
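+        # Fallthrough heuristics: directory/filename keywords, media extensions and
+        # code extensions refine the category; each loop keeps its first match.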
+        for category, keywords in self.category_keywords.items():
+            if any(kw in name_lower or any(kw in part.lower() for part in parts) for kw in keywords):
+                labels.add(category)
+                primary_category = category
+                break
+
+        for media_type, extensions in self.media_extensions.items():
+            if p.suffix.lower() in extensions:
+                labels.add(media_type)
+                labels.add('media')
+                primary_category = f'user/{media_type}'
+                break
+
+        if p.suffix.lower() in self.code_extensions:
+            labels.add('code')
+            if primary_category == 'misc':
+                primary_category = 'dev'
+
+        if size > 100 * 1024 * 1024:
+            labels.add('large-file')
+
+        if any(kw in name_lower for kw in ['test', 'spec', 'mock']):
+            labels.add('test')
+
+        if any(kw in name_lower for kw in ['config', 'settings', 'env']):
+            labels.add('config')
+
+        return labels, primary_category, is_build_artifact
+
+    def suggest_target_path(self, source_path: str, category: str, labels: Set[str]) -> str:
+        p = Path(source_path)
+
+        if 'build-artifact' in labels:
+            return f'trash/build-artifacts/{source_path}'
+
+        if category.startswith('artifacts/'):
+            artifact_type = category.split('/')[-1]
+            return f'artifacts/{artifact_type}/{p.name}'
+
+        if category.startswith('user/'):
+            media_type = category.split('/')[-1]
+            return f'user/{media_type}/{p.name}'
+
+        parts = [part for part in p.parts if part not in self.build_patterns]
+        if len(parts) > 3:
+            project_name = parts[0] if parts else 'misc'
+            return f'{category}/{project_name}/{"/".join(parts[1:])}'
+
+        return f'{category}/{source_path}'
diff --git a/app/content/extractors.py b/app/content/extractors.py
new file mode 100644
index 0000000..bb55212
--- /dev/null
+++ b/app/content/extractors.py
@@ -0,0 +1,104 @@
+from pathlib import Path
+from typing import Dict, Optional
+import json
+
+class ContentExtractor:
+    def __init__(self):
+        self.extractors = {
+            'pdf_text': self._extract_pdf,
+            'ocr+caption': self._extract_image,
+            'transcribe': self._extract_audio,
+            'transcribe+scenes': self._extract_video,
+            'office_text': self._extract_document,
+            'read': self._extract_text,
+            'read+syntax': self._extract_code
+        }
+
+    def extract(self, file_path: Path, extractor_type: str) -> Dict:
+        extractor = self.extractors.get(extractor_type)
+        if not extractor:
+            return {'error': f'Unknown extractor: {extractor_type}'}
+
+        try:
+            return extractor(file_path)
+        except Exception as e:
+            return {'error': str(e)}
+
+    def _extract_text(self, file_path: Path) -> Dict:
+        try:
+            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                content = f.read(1024 * 1024)
+            return {
+                'text': content,
+                'char_count': len(content),
+                'needs_llm': False
+            }
+        except Exception as e:
+            return {'error': str(e)}
+
+    def _extract_code(self, file_path: Path) -> Dict:
+        result = self._extract_text(file_path)
+        if 'error' not in result:
+            result['type'] = 'code'
+            result['needs_llm'] = True
+        return result
+
+    def _extract_pdf(self, file_path: Path) -> Dict:
+        try:
+            import PyPDF2
+            text_parts = []
+            with open(file_path, 'rb') as f:
+                pdf = PyPDF2.PdfReader(f)
+                for page in pdf.pages[:10]:
+                    text_parts.append(page.extract_text())
+
+            text = '\n'.join(text_parts)
+            return {
+                'text': text,
+                'pages_extracted': len(text_parts),
+                'needs_llm': len(text.strip()) > 100,
+                'type': 'document'
+            }
+        except Exception as e:
+            return {'error': str(e), 'needs_ocr': True}
+
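+    # The image/audio/video "extractors" below do no work themselves; they only
+    # describe the OCR/transcription pipeline a downstream worker is expected to run.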
+    def _extract_image(self, file_path: Path) -> Dict:
+        return {
+            'type': 'image',
+            'needs_ocr': True,
+            'needs_caption': True,
+            'needs_llm': True,
+            'pipeline': ['ocr', 'caption', 'embedding'],
+            'status': 'pending'
+        }
+
+    def _extract_audio(self, file_path: Path) -> Dict:
+        return {
+            'type': 'audio',
+            'needs_transcription': True,
+            'needs_llm': True,
+            'pipeline': ['transcribe', 'summarize'],
+            'status': 'pending'
+        }
+
+    def _extract_video(self, file_path: Path) -> Dict:
+        return {
+            'type': 'video',
+            'needs_transcription': True,
+            'needs_scene_detection': True,
+            'needs_llm': True,
+            'pipeline': ['transcribe', 'scenes', 'summarize'],
+            'status': 'pending'
+        }
+
+    def _extract_document(self, file_path: Path) -> Dict:
+        try:
+            import textract
+            text = textract.process(str(file_path)).decode('utf-8')
+            return {
+                'text': text,
+                'type': 'document',
+                'needs_llm': len(text.strip()) > 100
+            }
+        except:
+            return {'error': 'textract failed', 'needs_llm': True}
diff --git a/app/content/profiler.py b/app/content/profiler.py
new file mode 100644
index 0000000..75e0776
--- /dev/null
+++ b/app/content/profiler.py
@@ -0,0 +1,155 @@
+from pathlib import Path
+from typing import Dict, Optional, Tuple
+import mimetypes
+import magic
+import json
+from datetime import datetime
+
+class ContentProfiler:
+    def __init__(self):
+        self.mime_detector = magic.Magic(mime=True)
+
+        self.kind_mapping = {
+            'text': ['text/plain', 'text/html', 'text/css', 'text/javascript', 'text/markdown'],
+            'code': ['application/x-python', 'application/javascript', 'text/x-java', 'text/x-c'],
+            'pdf': ['application/pdf'],
+            'image': ['image/jpeg', 'image/png', 'image/gif', 'image/webp', 'image/svg+xml'],
+            'audio': ['audio/mpeg', 'audio/wav', 'audio/ogg', 'audio/flac'],
+            'video': ['video/mp4', 'video/x-matroska', 'video/avi', 'video/webm'],
+            'archive': ['application/zip', 'application/x-tar', 'application/gzip', 'application/x-7z-compressed'],
+            'document': ['application/msword', 'application/vnd.openxmlformats-officedocument'],
+            'spreadsheet': ['application/vnd.ms-excel', 'text/csv']
+        }
+
+        self.text_exts = {'.txt', '.md', '.rst', '.log', '.json', '.xml', '.yaml', '.yml', '.toml', '.ini', '.cfg'}
+        self.code_exts = {'.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h', '.cs', '.rb', '.php'}
+        self.processable_kinds = {'text', 'code', 'pdf', 'image', 'audio', 'video', 'document'}
+
+    def profile_file(self, file_path: Path) -> Dict:
+        try:
+            stat = file_path.stat()
+            size = stat.st_size
+            mtime = datetime.fromtimestamp(stat.st_mtime)
+
+            mime_type = self._detect_mime(file_path)
+            kind = self._determine_kind(file_path, mime_type)
+
+            profile = {
+                'path': str(file_path),
+                'size': size,
+                'mtime': mtime.isoformat(),
+                'mime': mime_type,
+                'kind': kind,
+                'processable': kind in self.processable_kinds,
+                'extractor': self._suggest_extractor(kind, mime_type),
+                'hints': self._extract_hints(file_path, kind, mime_type, size)
+            }
+
+            return profile
+
+        except Exception as e:
+            return {
+                'path': str(file_path),
+                'error': str(e),
+                'processable': False
+            }
+
+    def _detect_mime(self, file_path: Path) -> str:
+        try:
+            return self.mime_detector.from_file(str(file_path))
+        except:
+            guess = mimetypes.guess_type(str(file_path))[0]
+            return guess or 'application/octet-stream'
+
+    def _determine_kind(self, file_path: Path, mime_type: str) -> str:
+        for kind, mimes in self.kind_mapping.items():
+            if any(mime in mime_type for mime in mimes):
+                return kind
+
+        suffix = file_path.suffix.lower()
+        if suffix in self.text_exts:
+            return 'text'
+        if suffix in self.code_exts:
+            return 'code'
+
+        return 'unknown'
+
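+    # Maps a detected kind to an extractor name; these names must stay in sync
+    # with the keys registered in ContentExtractor.extractors.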
+    def _suggest_extractor(self, kind: str, mime_type: str) -> Optional[str]:
+        extractors = {
+            'pdf': 'pdf_text',
+            'image': 'ocr+caption',
+            'audio': 'transcribe',
+            'video': 'transcribe+scenes',
+            'document': 'office_text',
+            'text': 'read',
+            'code': 'read+syntax'
+        }
+        return extractors.get(kind)
+
+    def _extract_hints(self, file_path: Path, kind: str, mime_type: str, size: int) -> Dict:
+        hints = {}
+
+        if kind == 'text' or kind == 'code':
+            hints['language'] = self._guess_language(file_path)
+            if size < 1024 * 1024:
+                hints['lines'] = self._count_lines(file_path)
+
+        if kind == 'pdf':
+            hints['page_count'] = self._get_pdf_pages(file_path)
+
+        if kind in ['audio', 'video']:
+            hints['duration'] = self._get_media_duration(file_path)
+
+        if kind == 'image':
+            hints['has_exif'] = self._has_exif(file_path)
+            hints['dimensions'] = self._get_image_dimensions(file_path)
+
+        return hints
+
+    def _guess_language(self, file_path: Path) -> Optional[str]:
+        lang_map = {
+            '.py': 'python', '.js': 'javascript', '.ts': 'typescript',
+            '.java': 'java', '.go': 'go', '.rs': 'rust', '.c': 'c',
+            '.cpp': 'cpp', '.cs': 'csharp', '.rb': 'ruby', '.php': 'php'
+        }
+        return lang_map.get(file_path.suffix.lower())
+
+    def _count_lines(self, file_path: Path) -> Optional[int]:
+        try:
+            with open(file_path, 'rb') as f:
+                return sum(1 for _ in f)
+        except:
+            return None
+
+    def _get_pdf_pages(self, file_path: Path) -> Optional[int]:
+        try:
+            import PyPDF2
+            with open(file_path, 'rb') as f:
+                pdf = PyPDF2.PdfReader(f)
+                return len(pdf.pages)
+        except:
+            return None
+
+    def _get_media_duration(self, file_path: Path) -> Optional[float]:
+        try:
+            import ffmpeg
+            probe = ffmpeg.probe(str(file_path))
+            return float(probe['format']['duration'])
+        except:
+            return None
+
+    def _has_exif(self, file_path: Path) -> bool:
+        try:
+            from PIL import Image
+            img = Image.open(file_path)
+            return hasattr(img, '_getexif') and img._getexif() is not None
+        except:
+            return False
+
+    def _get_image_dimensions(self, file_path: Path) -> Optional[Tuple[int, int]]:
+        try:
+            from PIL import Image
+            with Image.open(file_path) as img:
+                return img.size
+        except:
+            return None
diff --git a/app/main.py b/app/main.py
index 0044d4b..7c60bb0 100644
--- a/app/main.py
+++ b/app/main.py
@@ -703,9 +703,252 @@ class DiskReorganizer:
             cursor.close()
             conn.close()
 
+    def profile_content(self, disk: Optional[str] = None, update_db: bool = False, limit: Optional[int] = None):
+        from content.profiler import ContentProfiler
+
+        profiler = ContentProfiler()
+        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
+
+        conn = self.get_connection()
+        cursor = conn.cursor()
+
+        try:
+            query = "SELECT path, size, disk_label FROM files WHERE 1=1"
+            params = []
+            if disk:
+                query += " AND disk_label = %s"
+                params.append(disk)
+            if limit:
+                query += f" LIMIT {limit}"
+
+            cursor.execute(query, params)
+            files = cursor.fetchall()
+            total = len(files)
+            logger.info(f"Profiling {total:,} files...")
+
+            kind_stats = {}
+            processable = 0
+            batch = []
+
+            for idx, (path, size, disk_label) in enumerate(files, 1):
+                mount_point = disk_mount_map.get(disk_label, disk_label)
+                full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
+
+                if not full_path.exists():
+                    continue
+
+                profile = profiler.profile_file(full_path)
+
+                if 'error' not in profile:
+                    kind = profile['kind']
+                    if kind not in kind_stats:
+                        kind_stats[kind] = {'count': 0, 'processable': 0}
+                    kind_stats[kind]['count'] += 1
+                    if profile['processable']:
+                        kind_stats[kind]['processable'] += 1
+                        processable += 1
+
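+                    # Profiles are queued here and flushed in 500-row batches via
+                    # jsonb_set below, so large scans do not commit row-by-row.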
+                    if update_db:
+                        profile_json = json.dumps(profile)
+                        batch.append((kind, profile_json, path))
+
+                        if len(batch) >= 500:
+                            cursor.executemany(
+                                "UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s",
+                                [(pj, p) for k, pj, p in batch]
+                            )
+                            conn.commit()
+                            batch.clear()
+
+                if idx % 100 == 0:
+                    print(f"\rProfiled: {idx:,}/{total:,}", end='', flush=True)
+
+            if update_db and batch:
+                cursor.executemany(
+                    "UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s",
+                    [(pj, p) for k, pj, p in batch]
+                )
+                conn.commit()
+
+            print()
+            print(f"\n=== CONTENT PROFILE SUMMARY ===")
+            print(f"Total files: {total:,}")
+            print(f"Processable: {processable:,}\n")
+            print(f"{'Kind':<15} {'Total':<10} {'Processable':<12} {'Extractor'}")
+            print("-" * 60)
+            for kind in sorted(kind_stats.keys()):
+                stats = kind_stats[kind]
+                extractor = profiler._suggest_extractor(kind, '')
+                print(f"{kind:<15} {stats['count']:<10,} {stats['processable']:<12,} {extractor or 'none'}")
+
+        finally:
+            cursor.close()
+            conn.close()
+
+    def extract_content(self, kind: Optional[str] = None, limit: int = 10):
+        from content.profiler import ContentProfiler
+        from content.extractors import ContentExtractor
+
+        profiler = ContentProfiler()
+        extractor = ContentExtractor()
+        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
+
+        conn = self.get_connection()
+        cursor = conn.cursor()
+
+        try:
+            query = "SELECT path, size, disk_label, metadata FROM files WHERE metadata->'profile'->>'processable' = 'true'"
+            params = []
+            if kind:
+                query += " AND metadata->'profile'->>'kind' = %s"
+                params.append(kind)
+            query += f" LIMIT {limit}"
+
+            cursor.execute(query, params)
+            files = cursor.fetchall()
+
+            print(f"\n=== EXTRACTING CONTENT ===")
+            print(f"Processing {len(files)} files\n")
+
+            for path, size, disk_label, metadata in files:
+                mount_point = disk_mount_map.get(disk_label, disk_label)
+                full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
+
+                if not full_path.exists():
+                    continue
+
+                profile = metadata.get('profile', {}) if metadata else {}
+                extractor_type = profile.get('extractor')
+
+                if not extractor_type:
+                    continue
+
+                print(f"Extracting: {path}")
+                print(f" Type: {profile.get('kind')} | Extractor: {extractor_type}")
+
+                result = extractor.extract(full_path, extractor_type)
+
+                if 'text' in result:
+                    preview = result['text'][:200]
+                    print(f" Preview: {preview}...")
+                elif 'pipeline' in result:
+                    print(f" Pipeline: {' → '.join(result['pipeline'])}")
+                    print(f" Status: {result.get('status', 'pending')}")
+
+                print()
+
+        finally:
+            cursor.close()
+            conn.close()
+
+    def classify_files(self, disk: Optional[str] = None, update_db: bool = False):
+        from classification.classifier import FileClassifier
+
+        classifier = FileClassifier()
+        conn = self.get_connection()
+        cursor = conn.cursor()
+
+        try:
+            if disk:
+                cursor.execute("SELECT path, size, disk_label FROM files WHERE disk_label = %s", (disk,))
+            else:
+                cursor.execute("SELECT path, size, disk_label FROM files")
+
+            files = cursor.fetchall()
+            total = len(files)
+            logger.info(f"Classifying {total:,} files...")
+
+            categories = {}
+            build_artifacts = 0
+            batch = []
+
+            for idx, (path, size, disk_label) in enumerate(files, 1):
+                labels, category, is_build = classifier.classify_path(path, int(size))
+
+                if is_build:
+                    build_artifacts += 1
+
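+                # Aggregate per-category counts and sizes for the summary; note that
+                # only the primary category is persisted by the UPDATE below, the
+                # label set stays in memory.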
+                if category not in categories:
+                    categories[category] = {'count': 0, 'size': 0}
+                categories[category]['count'] += 1
+                categories[category]['size'] += int(size)
+
+                if update_db:
+                    labels_str = ','.join(labels)
+                    batch.append((category, labels_str, path))
+
+                    if len(batch) >= 1000:
+                        cursor.executemany("UPDATE files SET category = %s WHERE path = %s", [(cat, p) for cat, lbl, p in batch])
+                        conn.commit()
+                        batch.clear()
+
+                if idx % 1000 == 0:
+                    print(f"\rClassified: {idx:,}/{total:,}", end='', flush=True)
+
+            if update_db and batch:
+                cursor.executemany("UPDATE files SET category = %s WHERE path = %s", [(cat, lbl, p) for cat, lbl, p in batch] if False else [(cat, p) for cat, lbl, p in batch])
+                conn.commit()
+
+            print()
+            print(f"\n=== CLASSIFICATION SUMMARY ===")
+            print(f"Total files: {total:,}")
+            print(f"Build artifacts: {build_artifacts:,}")
+            print(f"\nCategories:")
+            for category in sorted(categories.keys()):
+                info = categories[category]
+                print(f" {category:30}: {info['count']:8,} files, {self.format_size(info['size'])}")
+
+        finally:
+            cursor.close()
+            conn.close()
+
+    def review_migration(self, category: Optional[str] = None, show_build: bool = False):
+        from classification.classifier import FileClassifier
+
+        classifier = FileClassifier()
+        conn = self.get_connection()
+        cursor = conn.cursor()
+
+        try:
+            query = "SELECT path, size, category FROM files WHERE 1=1"
+            params = []
+
+            if category:
+                query += " AND category = %s"
+                params.append(category)
+
+            if not show_build:
+                query += " AND (metadata->>'labels' IS NULL OR metadata->>'labels' NOT LIKE '%build-artifact%')"
+
+            query += " ORDER BY category, size DESC LIMIT 100"
+
+            cursor.execute(query, params)
+            files = cursor.fetchall()
+
+            if not files:
+                print("No files found matching criteria")
+                return
+
+            print(f"\n=== MIGRATION PREVIEW ===")
+            print(f"Showing {len(files)} files\n")
+
+            current_category = None
+            for path, size, cat in files:
+                if cat != current_category:
+                    current_category = cat
+                    print(f"\n{cat}:")
+
+                labels, suggested_cat, is_build = classifier.classify_path(path, int(size))
+                target = classifier.suggest_target_path(path, suggested_cat, labels)
+                print(f" {path}")
+                print(f" → {target} ({self.format_size(int(size))})")
+
+        finally:
+            cursor.close()
+            conn.close()
+
     @staticmethod
     def format_size(size: int) -> str:
-        """Format bytes to human readable string"""
         for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
             if size < 1024:
                 return f"{size:.1f}{unit}"
@@ -744,6 +987,27 @@ def main():
     merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
     merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')
 
+    # Profile command
+    profile_parser = subparsers.add_parser('profile', help='Create content profiles (inventory + triage)')
+    profile_parser.add_argument('--disk', help='Profile specific disk')
+    profile_parser.add_argument('--update', action='store_true', help='Update database with profiles')
+    profile_parser.add_argument('--limit', type=int, help='Limit number of files')
+
+    # Extract command
+    extract_parser = subparsers.add_parser('extract', help='Extract content from files')
+    extract_parser.add_argument('--kind', help='Extract specific kind (pdf, image, audio, video)')
+    extract_parser.add_argument('--limit', type=int, default=10, help='Limit extraction batch')
+
+    # Classify command
+    classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization')
+    classify_parser.add_argument('--disk', help='Classify specific disk')
+    classify_parser.add_argument('--update', action='store_true', help='Update database with classifications')
+
+    # Review command
+    review_parser = subparsers.add_parser('review', help='Review proposed migration structure')
+    review_parser.add_argument('--category', help='Review specific category')
+    review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts')
+
     # Report command
     report_parser = subparsers.add_parser('report', help='Show current status')
     report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
@@ -772,6 +1036,18 @@ def main():
     elif args.command == 'execute':
         tool.execute_migration(args.plan_file, dry_run=args.dry_run)
 
+    elif args.command == 'profile':
+        tool.profile_content(disk=args.disk, update_db=args.update, limit=args.limit)
+
+    elif args.command == 'extract':
+        tool.extract_content(kind=args.kind, limit=args.limit)
+
+    elif args.command == 'classify':
+        tool.classify_files(disk=args.disk, update_db=args.update)
+
+    elif args.command == 'review':
+        tool.review_migration(category=args.category, show_build=args.show_build)
+
     elif args.command == 'report':
         tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)
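
Reviewer note: a minimal usage sketch of the new classes, assuming the imports resolve as they do in app/main.py (run from the app/ directory) and that python-magic is installed; the example path and mount point are illustrative, not part of the patch.

    from pathlib import Path

    from classification.classifier import FileClassifier
    from content.profiler import ContentProfiler
    from content.extractors import ContentExtractor

    classifier = FileClassifier()
    profiler = ContentProfiler()
    extractor = ContentExtractor()

    # Classify a relative path as stored in the files table (hypothetical path):
    # node_modules marks it as a build artifact under 'artifacts/node', and
    # suggest_target_path() then routes it to trash/build-artifacts/.
    rel_path = 'projects/api/node_modules/lodash/index.js'
    labels, category, is_build = classifier.classify_path(rel_path, size=4096)
    target = classifier.suggest_target_path(rel_path, category, labels)

    # Profile a file on a mounted disk and run whichever extractor it suggests
    # (hypothetical mount; PyPDF2/Pillow/ffmpeg are only needed for those kinds).
    profile = profiler.profile_file(Path('/media/mike/DISK1/docs/report.pdf'))
    if profile.get('processable') and profile.get('extractor'):
        result = extractor.extract(Path(profile['path']), profile['extractor'])

The CLI equivalents added in main.py are the profile, extract, classify and review subcommands (e.g. profile --disk DISK1 --update, then classify --update, then review).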