remove_doc

2025-12-13 04:23:04 +01:00
parent 75034d5e51
commit 9759001f4c
9 changed files with 741 additions and 1641 deletions
--- a/app/content/extractors.py
+++ b/app/content/extractors.py
@@ -3,22 +3,14 @@ from typing import Dict, Optional
 import json

 class ContentExtractor:
+
    def __init__(self):
-        self.extractors = {
-            'pdf_text': self._extract_pdf,
-            'ocr+caption': self._extract_image,
-            'transcribe': self._extract_audio,
-            'transcribe+scenes': self._extract_video,
-            'office_text': self._extract_document,
-            'read': self._extract_text,
-            'read+syntax': self._extract_code
-        }
+        self.extractors = {'pdf_text': self._extract_pdf, 'ocr+caption': self._extract_image, 'transcribe': self._extract_audio, 'transcribe+scenes': self._extract_video, 'office_text': self._extract_document, 'read': self._extract_text, 'read+syntax': self._extract_code}

    def extract(self, file_path: Path, extractor_type: str) -> Dict:
        extractor = self.extractors.get(extractor_type)
        if not extractor:
            return {'error': f'Unknown extractor: {extractor_type}'}
-
        try:
            return extractor(file_path)
        except Exception as e:
@@ -28,11 +20,7 @@ class ContentExtractor:
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read(1024 * 1024)
-            return {
-                'text': content,
-                'char_count': len(content),
-                'needs_llm': False
-            }
+            return {'text': content, 'char_count': len(content), 'needs_llm': False}
        except Exception as e:
            return {'error': str(e)}

@@ -51,54 +39,24 @@ class ContentExtractor:
                pdf = PyPDF2.PdfReader(f)
                for page in pdf.pages[:10]:
                    text_parts.append(page.extract_text())
-
            text = '\n'.join(text_parts)
-            return {
-                'text': text,
-                'pages_extracted': len(text_parts),
-                'needs_llm': len(text.strip()) > 100,
-                'type': 'document'
-            }
+            return {'text': text, 'pages_extracted': len(text_parts), 'needs_llm': len(text.strip()) > 100, 'type': 'document'}
        except Exception as e:
            return {'error': str(e), 'needs_ocr': True}

    def _extract_image(self, file_path: Path) -> Dict:
-        return {
-            'type': 'image',
-            'needs_ocr': True,
-            'needs_caption': True,
-            'needs_llm': True,
-            'pipeline': ['ocr', 'caption', 'embedding'],
-            'status': 'pending'
-        }
+        return {'type': 'image', 'needs_ocr': True, 'needs_caption': True, 'needs_llm': True, 'pipeline': ['ocr', 'caption', 'embedding'], 'status': 'pending'}

    def _extract_audio(self, file_path: Path) -> Dict:
-        return {
-            'type': 'audio',
-            'needs_transcription': True,
-            'needs_llm': True,
-            'pipeline': ['transcribe', 'summarize'],
-            'status': 'pending'
-        }
+        return {'type': 'audio', 'needs_transcription': True, 'needs_llm': True, 'pipeline': ['transcribe', 'summarize'], 'status': 'pending'}

    def _extract_video(self, file_path: Path) -> Dict:
-        return {
-            'type': 'video',
-            'needs_transcription': True,
-            'needs_scene_detection': True,
-            'needs_llm': True,
-            'pipeline': ['transcribe', 'scenes', 'summarize'],
-            'status': 'pending'
-        }
+        return {'type': 'video', 'needs_transcription': True, 'needs_scene_detection': True, 'needs_llm': True, 'pipeline': ['transcribe', 'scenes', 'summarize'], 'status': 'pending'}

    def _extract_document(self, file_path: Path) -> Dict:
        try:
            import textract
            text = textract.process(str(file_path)).decode('utf-8')
-            return {
-                'text': text,
-                'type': 'document',
-                'needs_llm': len(text.strip()) > 100
-            }
+            return {'text': text, 'type': 'document', 'needs_llm': len(text.strip()) > 100}
        except:
            return {'error': 'textract failed', 'needs_llm': True}