base

2025-12-13 03:56:14 +01:00
parent 7c5df059df
commit 75034d5e51
5 changed files with 662 additions and 17 deletions
--- a/app/content/extractors.py
+++ b/app/content/extractors.py
@@ -0,0 +1,146 @@
+from pathlib import Path
+from typing import Dict, Optional
+import json
+
+class ContentExtractor:
+    """Dispatch files to per-type extraction routines by extractor name.
+
+    Every routine returns a plain dict; failures are reported through an
+    ``'error'`` key instead of being raised.
+    """
+
+    # Upper bound on characters read by _extract_text.
+    MAX_TEXT_CHARS = 1024 * 1024
+    # Upper bound on pages read by _extract_pdf.
+    MAX_PDF_PAGES = 10
+
+    def __init__(self):
+        # Extractor name -> bound handler method.
+        self.extractors = {
+            'pdf_text': self._extract_pdf,
+            'ocr+caption': self._extract_image,
+            'transcribe': self._extract_audio,
+            'transcribe+scenes': self._extract_video,
+            'office_text': self._extract_document,
+            'read': self._extract_text,
+            'read+syntax': self._extract_code
+        }
+
+    def extract(self, file_path: Path, extractor_type: str) -> Dict:
+        """Run the extractor registered under *extractor_type*.
+
+        Returns the handler's result dict, or ``{'error': ...}`` when the
+        name is unknown or the handler raises.
+        """
+        extractor = self.extractors.get(extractor_type)
+        if not extractor:
+            return {'error': f'Unknown extractor: {extractor_type}'}
+
+        try:
+            return extractor(file_path)
+        except Exception as e:
+            return {'error': str(e)}
+
+    def _extract_text(self, file_path: Path) -> Dict:
+        """Read up to MAX_TEXT_CHARS characters of UTF-8 text.
+
+        Undecodable bytes are dropped (``errors='ignore'``).
+        """
+        try:
+            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                content = f.read(self.MAX_TEXT_CHARS)
+            return {
+                'text': content,
+                'char_count': len(content),
+                'needs_llm': False
+            }
+        except Exception as e:
+            return {'error': str(e)}
+
+    def _extract_code(self, file_path: Path) -> Dict:
+        """Read the file as text, then mark it as code needing an LLM pass."""
+        result = self._extract_text(file_path)
+        if 'error' not in result:
+            result['type'] = 'code'
+            result['needs_llm'] = True
+        return result
+
+    def _extract_pdf(self, file_path: Path) -> Dict:
+        """Extract text from the first MAX_PDF_PAGES pages using PyPDF2.
+
+        Any failure yields ``{'error': ..., 'needs_ocr': True}``.
+        """
+        try:
+            import PyPDF2
+            with open(file_path, 'rb') as f:
+                pdf = PyPDF2.PdfReader(f)
+                # Defensive: coerce an empty/None page result to '' so the
+                # join below cannot raise on a page without text.
+                text_parts = [
+                    page.extract_text() or ''
+                    for page in pdf.pages[:self.MAX_PDF_PAGES]
+                ]
+
+            text = '\n'.join(text_parts)
+            return {
+                'text': text,
+                'pages_extracted': len(text_parts),
+                'needs_llm': len(text.strip()) > 100,
+                'type': 'document'
+            }
+        except Exception as e:
+            return {'error': str(e), 'needs_ocr': True}
+
+    def _extract_image(self, file_path: Path) -> Dict:
+        """Return a pending work order for images; nothing is read here."""
+        return {
+            'type': 'image',
+            'needs_ocr': True,
+            'needs_caption': True,
+            'needs_llm': True,
+            'pipeline': ['ocr', 'caption', 'embedding'],
+            'status': 'pending'
+        }
+
+    def _extract_audio(self, file_path: Path) -> Dict:
+        """Return a pending work order for audio; nothing is read here."""
+        return {
+            'type': 'audio',
+            'needs_transcription': True,
+            'needs_llm': True,
+            'pipeline': ['transcribe', 'summarize'],
+            'status': 'pending'
+        }
+
+    def _extract_video(self, file_path: Path) -> Dict:
+        """Return a pending work order for video; nothing is read here."""
+        return {
+            'type': 'video',
+            'needs_transcription': True,
+            'needs_scene_detection': True,
+            'needs_llm': True,
+            'pipeline': ['transcribe', 'scenes', 'summarize'],
+            'status': 'pending'
+        }
+
+    def _extract_document(self, file_path: Path) -> Dict:
+        """Extract document text using textract.
+
+        Any failure yields an ``'error'`` result with ``needs_llm`` set.
+        """
+        try:
+            import textract
+            text = textract.process(str(file_path)).decode('utf-8')
+            return {
+                'text': text,
+                'type': 'document',
+                'needs_llm': len(text.strip()) > 100
+            }
+        except Exception as e:
+            # Exception (not bare except) lets KeyboardInterrupt/SystemExit
+            # propagate; 'detail' preserves the underlying cause.
+            return {
+                'error': 'textract failed',
+                'detail': str(e),
+                'needs_llm': True
+            }
--- a/app/content/profiler.py
+++ b/app/content/profiler.py
@@ -0,0 +1,183 @@
+from pathlib import Path
+from typing import Dict, Optional, Tuple
+import mimetypes
+import magic
+import json
+from datetime import datetime
+
+class ContentProfiler:
+    """Build a metadata profile (size, MIME type, kind, hints) for a file."""
+
+    def __init__(self):
+        self.mime_detector = magic.Magic(mime=True)
+
+        # kind -> MIME strings, matched by substring and in insertion order
+        # by _determine_kind.
+        self.kind_mapping = {
+            'text': ['text/plain', 'text/html', 'text/css', 'text/javascript', 'text/markdown'],
+            'code': ['application/x-python', 'application/javascript', 'text/x-java', 'text/x-c'],
+            'pdf': ['application/pdf'],
+            'image': ['image/jpeg', 'image/png', 'image/gif', 'image/webp', 'image/svg+xml'],
+            'audio': ['audio/mpeg', 'audio/wav', 'audio/ogg', 'audio/flac'],
+            'video': ['video/mp4', 'video/x-matroska', 'video/avi', 'video/webm'],
+            'archive': ['application/zip', 'application/x-tar', 'application/gzip', 'application/x-7z-compressed'],
+            'document': ['application/msword', 'application/vnd.openxmlformats-officedocument'],
+            'spreadsheet': ['application/vnd.ms-excel', 'text/csv']
+        }
+
+        self.text_exts = {'.txt', '.md', '.rst', '.log', '.json', '.xml', '.yaml', '.yml', '.toml', '.ini', '.cfg'}
+        self.code_exts = {'.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h', '.cs', '.rb', '.php'}
+        self.processable_kinds = {'text', 'code', 'pdf', 'image', 'audio', 'video', 'document'}
+
+    def profile_file(self, file_path: Path) -> Dict:
+        """Return a profile dict for *file_path*.
+
+        On success the dict holds path, size, mtime (ISO format), mime,
+        kind, processable, extractor and hints.  If anything raises, a
+        reduced dict with path, error and ``processable: False`` is returned.
+        """
+        try:
+            stat = file_path.stat()
+            size = stat.st_size
+            mtime = datetime.fromtimestamp(stat.st_mtime)
+
+            mime_type = self._detect_mime(file_path)
+            kind = self._determine_kind(file_path, mime_type)
+
+            return {
+                'path': str(file_path),
+                'size': size,
+                'mtime': mtime.isoformat(),
+                'mime': mime_type,
+                'kind': kind,
+                'processable': kind in self.processable_kinds,
+                'extractor': self._suggest_extractor(kind, mime_type),
+                'hints': self._extract_hints(file_path, kind, mime_type, size)
+            }
+        except Exception as e:
+            return {
+                'path': str(file_path),
+                'error': str(e),
+                'processable': False
+            }
+
+    def _detect_mime(self, file_path: Path) -> str:
+        """Detect the MIME type by content, else by filename.
+
+        Falls back to ``mimetypes`` and finally ``application/octet-stream``.
+        """
+        try:
+            return self.mime_detector.from_file(str(file_path))
+        except Exception:
+            guess = mimetypes.guess_type(str(file_path))[0]
+            return guess or 'application/octet-stream'
+
+    def _determine_kind(self, file_path: Path, mime_type: str) -> str:
+        """Classify the file by MIME type, falling back to its extension."""
+        suffix = file_path.suffix.lower()
+
+        for kind, mimes in self.kind_mapping.items():
+            if any(mime in mime_type for mime in mimes):
+                # A generic text MIME on a known source extension is still
+                # code; otherwise code_exts is never consulted for it.
+                if kind == 'text' and suffix in self.code_exts:
+                    return 'code'
+                return kind
+
+        if suffix in self.text_exts:
+            return 'text'
+        if suffix in self.code_exts:
+            return 'code'
+
+        return 'unknown'
+
+    def _suggest_extractor(self, kind: str, mime_type: str) -> Optional[str]:
+        """Map a kind to an extractor name, or None if there is none."""
+        extractors = {
+            'pdf': 'pdf_text',
+            'image': 'ocr+caption',
+            'audio': 'transcribe',
+            'video': 'transcribe+scenes',
+            'document': 'office_text',
+            'text': 'read',
+            'code': 'read+syntax'
+        }
+        return extractors.get(kind)
+
+    def _extract_hints(self, file_path: Path, kind: str, mime_type: str, size: int) -> Dict:
+        """Collect the kind-specific hints included in a profile."""
+        hints = {}
+
+        if kind in ('text', 'code'):
+            hints['language'] = self._guess_language(file_path)
+            # Only count lines for files under 1 MiB.
+            if size < 1024 * 1024:
+                hints['lines'] = self._count_lines(file_path)
+
+        if kind == 'pdf':
+            hints['page_count'] = self._get_pdf_pages(file_path)
+
+        if kind in ['audio', 'video']:
+            hints['duration'] = self._get_media_duration(file_path)
+
+        if kind == 'image':
+            hints['has_exif'] = self._has_exif(file_path)
+            hints['dimensions'] = self._get_image_dimensions(file_path)
+
+        return hints
+
+    def _guess_language(self, file_path: Path) -> Optional[str]:
+        """Return a language name for the file extension, or None."""
+        lang_map = {
+            '.py': 'python', '.js': 'javascript', '.ts': 'typescript',
+            '.java': 'java', '.go': 'go', '.rs': 'rust', '.c': 'c',
+            '.cpp': 'cpp', '.cs': 'csharp', '.rb': 'ruby', '.php': 'php'
+        }
+        return lang_map.get(file_path.suffix.lower())
+
+    def _count_lines(self, file_path: Path) -> Optional[int]:
+        """Count lines by iterating the file in binary mode; None on error."""
+        try:
+            with open(file_path, 'rb') as f:
+                return sum(1 for _ in f)
+        except Exception:
+            return None
+
+    def _get_pdf_pages(self, file_path: Path) -> Optional[int]:
+        """Return the PDF page count via PyPDF2, or None on any failure."""
+        try:
+            import PyPDF2
+            with open(file_path, 'rb') as f:
+                pdf = PyPDF2.PdfReader(f)
+                return len(pdf.pages)
+        except Exception:
+            return None
+
+    def _get_media_duration(self, file_path: Path) -> Optional[float]:
+        """Return the duration reported by ffmpeg.probe, or None on failure."""
+        try:
+            import ffmpeg
+            probe = ffmpeg.probe(str(file_path))
+            return float(probe['format']['duration'])
+        except Exception:
+            return None
+
+    def _has_exif(self, file_path: Path) -> bool:
+        """Return True if Pillow reports EXIF data; False on any failure."""
+        try:
+            from PIL import Image
+            # Context manager closes the file handle that a bare
+            # Image.open() would leave open.
+            with Image.open(file_path) as img:
+                return hasattr(img, '_getexif') and img._getexif() is not None
+        except Exception:
+            return False
+
+    def _get_image_dimensions(self, file_path: Path) -> Optional[Tuple[int, int]]:
+        """Return the image size from Pillow, or None on any failure."""
+        try:
+            from PIL import Image
+            with Image.open(file_path) as img:
+                return img.size
+        except Exception:
+            return None