initial

2025-12-13 11:56:06 +01:00
commit 2b2c575385
57 changed files with 6505 additions and 0 deletions
--- a/app/content/extractors.py
+++ b/app/content/extractors.py
@@ -0,0 +1,62 @@
+from pathlib import Path
+from typing import Dict, Optional
+import json
+
+class ContentExtractor:
+    """Route a file to the handler registered for an extractor type; results and failures are dicts."""
+
+    def __init__(self):
+        self.extractors = {'pdf_text': self._extract_pdf, 'ocr+caption': self._extract_image, 'transcribe': self._extract_audio, 'transcribe+scenes': self._extract_video, 'office_text': self._extract_document, 'read': self._extract_text, 'read+syntax': self._extract_code}
+
+    def extract(self, file_path: Path, extractor_type: str) -> Dict:
+        """Run the handler for ``extractor_type``; an unknown type or any ``Exception`` becomes ``{'error': ...}``."""
+        extractor = self.extractors.get(extractor_type)
+        if extractor is None:
+            return {'error': f'Unknown extractor: {extractor_type}'}
+        try:
+            return extractor(file_path)
+        except Exception as e:
+            return {'error': str(e)}
+
+    def _extract_text(self, file_path: Path) -> Dict:  # also called directly by _extract_code, hence its own error dict
+        try:
+            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:  # undecodable bytes are dropped
+                content = f.read(1024 * 1024)  # text mode: the cap counts characters, not bytes
+            return {'text': content, 'char_count': len(content), 'needs_llm': False}
+        except Exception as e:
+            return {'error': str(e)}
+
+    def _extract_code(self, file_path: Path) -> Dict:  # plain-text read; a successful result is re-tagged as code
+        result = self._extract_text(file_path)
+        if 'error' not in result:
+            result.update(type='code', needs_llm=True)
+        return result
+
+    def _extract_pdf(self, file_path: Path) -> Dict:
+        """Extract text from at most the first 10 pages via PyPDF2; any failure is returned with ``needs_ocr``."""
+        try:
+            import PyPDF2
+            with open(file_path, 'rb') as f:
+                text_parts = [page.extract_text() or '' for page in PyPDF2.PdfReader(f).pages[:10]]  # None -> ''
+            text = '\n'.join(text_parts)
+            return {'text': text, 'pages_extracted': len(text_parts), 'needs_llm': len(text.strip()) > 100, 'type': 'document'}
+        except Exception as e:
+            return {'error': str(e), 'needs_ocr': True}
+
+    def _extract_image(self, file_path: Path) -> Dict:  # the file is not read; only a pending plan is returned
+        return {'type': 'image', 'needs_ocr': True, 'needs_caption': True, 'needs_llm': True, 'pipeline': ['ocr', 'caption', 'embedding'], 'status': 'pending'}
+
+    def _extract_audio(self, file_path: Path) -> Dict:  # the file is not read; only a pending plan is returned
+        return {'type': 'audio', 'needs_transcription': True, 'needs_llm': True, 'pipeline': ['transcribe', 'summarize'], 'status': 'pending'}
+
+    def _extract_video(self, file_path: Path) -> Dict:  # the file is not read; only a pending plan is returned
+        return {'type': 'video', 'needs_transcription': True, 'needs_scene_detection': True, 'needs_llm': True, 'pipeline': ['transcribe', 'scenes', 'summarize'], 'status': 'pending'}
+
+    def _extract_document(self, file_path: Path) -> Dict:
+        """Extract text via ``textract``; any failure yields the fixed error 'textract failed' plus a ``detail``."""
+        try:
+            import textract
+            text = textract.process(str(file_path)).decode('utf-8')
+            return {'text': text, 'type': 'document', 'needs_llm': len(text.strip()) > 100}
+        except Exception as e:  # Exception, not a bare except: KeyboardInterrupt/SystemExit must propagate
+            return {'error': 'textract failed', 'detail': str(e), 'needs_llm': True}