clean up code

2025-12-13 13:57:13 +01:00
parent f6aa2b7b76
commit 1583df8f57
9 changed files with 622 additions and 138 deletions
--- a/app/parsers/audio_parser.py
+++ b/app/parsers/audio_parser.py
@@ -0,0 +1,62 @@
+from pathlib import Path
+from typing import Dict
+import logging
+
+logger = logging.getLogger(__name__)
+
+class AudioParser:
+    """Transcribe audio files with Whisper and read basic audio metadata.
+
+    Failures are reported as a dict carrying an ``'error'`` key rather than
+    being raised to the caller.
+    """
+
+    def __init__(self, whisper_model: str = 'base'):
+        """Remember the Whisper model name; the model is loaded on first use.
+
+        Args:
+            whisper_model: Name passed to ``whisper.load_model``.
+        """
+        self.supported_formats = {'.mp3', '.wav', '.flac', '.m4a', '.ogg', '.wma', '.aac'}
+        self.whisper_model = whisper_model
+        # Cache for the loaded model and the name it was loaded under.
+        self._model = None
+        self._model_name = None
+
+    def parse(self, file_path: Path) -> Dict:
+        """Transcribe ``file_path`` if its suffix is a supported audio format.
+
+        Args:
+            file_path: Path of the audio file.
+
+        Returns:
+            The transcription dict from ``_transcribe_with_whisper``, or a
+            dict with ``'error'`` and an empty ``'text'`` on failure.
+        """
+        if file_path.suffix.lower() not in self.supported_formats:
+            # Carry 'text' like every other error result of this method.
+            return {'error': f'Unsupported format: {file_path.suffix}', 'text': ''}
+
+        try:
+            return self._transcribe_with_whisper(file_path)
+        except Exception as e:
+            logger.error('Audio parse failed for %s: %s', file_path, e)
+            return {'error': str(e), 'text': ''}
+
+    def _load_model(self):
+        """Return the Whisper model for ``self.whisper_model``, loading it once.
+
+        The model is reloaded only when ``self.whisper_model`` has changed
+        since the last load. Raises ``ImportError`` if whisper is missing.
+        """
+        import whisper
+
+        # getattr keeps this working for instances built without __init__.
+        cached = getattr(self, '_model', None)
+        if cached is None or getattr(self, '_model_name', None) != self.whisper_model:
+            cached = whisper.load_model(self.whisper_model)
+            self._model = cached
+            self._model_name = self.whisper_model
+        return cached
+
+    def _transcribe_with_whisper(self, file_path: Path) -> Dict:
+        """Run Whisper on ``file_path`` and package the result.
+
+        Returns:
+            On success a dict with ``text``, ``quality``, ``method``,
+            ``language``, ``segments`` (a count) and ``metadata``. On failure
+            a dict with ``'error'`` and an empty ``'text'``; a missing whisper
+            package additionally yields a ``'needs'`` install hint.
+        """
+        try:
+            model = self._load_model()
+            result = model.transcribe(str(file_path))
+
+            segments = result.get('segments') or []
+            duration = result.get('duration')
+            if duration is None and segments and isinstance(segments[-1], dict):
+                # Whisper's documented result keys are text/segments/language,
+                # so approximate the duration by the end of the last segment.
+                duration = segments[-1].get('end')
+
+            return {
+                'text': result['text'].strip(),
+                'quality': 'good',
+                'method': f'whisper-{self.whisper_model}',
+                'language': result.get('language', 'unknown'),
+                'segments': len(segments),
+                'metadata': {
+                    'duration': duration,
+                    'language': result.get('language')
+                }
+            }
+        except ImportError:
+            logger.warning('Whisper not installed')
+            return {'error': 'Whisper not installed', 'text': '', 'needs': 'pip install openai-whisper'}
+        except Exception as e:
+            # Log here: this handler swallows the error before parse() sees it.
+            logger.error('Whisper transcription failed for %s: %s', file_path, e)
+            return {'error': str(e), 'text': ''}
+
+    def extract_metadata(self, file_path: Path) -> Dict:
+        """Read duration, bitrate, sample rate and channel count via mutagen.
+
+        Returns:
+            A dict with ``duration``, ``bitrate``, ``sample_rate`` and
+            ``channels`` (``None`` for any attribute the stream info lacks),
+            or a dict with ``'error'`` on failure.
+        """
+        try:
+            import mutagen
+            audio = mutagen.File(str(file_path))
+            if audio is None:
+                return {'error': 'Could not read audio file'}
+
+            info = audio.info
+            return {
+                'duration': getattr(info, 'length', None),
+                'bitrate': getattr(info, 'bitrate', None),
+                'sample_rate': getattr(info, 'sample_rate', None),
+                'channels': getattr(info, 'channels', None)
+            }
+        except ImportError:
+            return {'error': 'mutagen not installed', 'needs': 'pip install mutagen'}
+        except Exception as e:
+            return {'error': str(e)}
--- a/app/parsers/document_parser.py
+++ b/app/parsers/document_parser.py
@@ -0,0 +1,60 @@
+from pathlib import Path
+from typing import Dict
+import logging
+import subprocess
+
+logger = logging.getLogger(__name__)
+
+class DocumentParser:
+    """Extract plain text from word-processor documents.
+
+    ``.docx`` and ``.odt`` files are read with Python libraries (python-docx
+    and odfpy); every other supported suffix is handed to the external
+    ``antiword`` command. Failures are reported as a dict carrying an
+    ``'error'`` key rather than being raised to the caller.
+    """
+
+    def __init__(self):
+        """Set up the set of accepted file suffixes."""
+        self.supported_formats = {'.doc', '.docx', '.odt', '.rtf'}
+
+    def parse(self, file_path: Path) -> Dict:
+        """Extract text from ``file_path`` using the parser for its suffix.
+
+        Args:
+            file_path: Path of the document.
+
+        Returns:
+            A dict with ``text``, ``quality`` and ``method`` on success, or a
+            dict with ``'error'`` and an empty ``'text'`` on failure.
+        """
+        suffix = file_path.suffix.lower()
+        if suffix not in self.supported_formats:
+            # Carry 'text' like every other error result of this method.
+            return {'error': f'Unsupported format: {file_path.suffix}', 'text': ''}
+
+        try:
+            if suffix in {'.docx', '.odt'}:
+                return self._parse_with_python(file_path)
+            # NOTE(review): '.rtf' is routed to antiword as well; confirm that
+            # antiword actually handles RTF input.
+            return self._parse_with_external(file_path)
+        except Exception as e:
+            logger.error('Document parse failed for %s: %s', file_path, e)
+            return {'error': str(e), 'text': ''}
+
+    def _parse_with_python(self, file_path: Path) -> Dict:
+        """Read a ``.docx`` or ``.odt`` file, joining non-blank paragraphs.
+
+        Returns:
+            A dict with ``text``, ``quality`` and ``method``; or a dict with
+            ``'error'`` and an empty ``'text'`` when the library is missing,
+            reading fails, or the suffix is neither ``.docx`` nor ``.odt``.
+        """
+        try:
+            suffix = file_path.suffix.lower()
+            if suffix == '.docx':
+                import docx
+                doc = docx.Document(str(file_path))
+                text = '\n'.join(para.text for para in doc.paragraphs if para.text.strip())
+                return {'text': text, 'quality': 'good', 'method': 'python-docx'}
+            if suffix == '.odt':
+                from odf import text as odf_text, teletype
+                from odf.opendocument import load
+                doc = load(str(file_path))
+                # Extract each paragraph once, then drop the blank ones.
+                extracted = (teletype.extractText(p) for p in doc.getElementsByType(odf_text.P))
+                text = '\n'.join(chunk for chunk in extracted if chunk.strip())
+                return {'text': text, 'quality': 'good', 'method': 'odfpy'}
+        except ImportError as ie:
+            logger.warning('Missing library for %s: %s', file_path.suffix, ie)
+            return {'error': f'Missing library: {ie}', 'text': '', 'needs': 'python-docx or odfpy'}
+        except Exception as e:
+            # Log here: this handler swallows the error before parse() sees it.
+            logger.error('Document read failed for %s: %s', file_path, e)
+            return {'error': str(e), 'text': ''}
+
+        # Any other suffix: return an error dict instead of an implicit None.
+        return {'error': f'Unsupported format: {file_path.suffix}', 'text': ''}
+
+    def _parse_with_external(self, file_path: Path) -> Dict:
+        """Run ``antiword`` on ``file_path`` (30 second timeout).
+
+        Returns:
+            A dict with the tool's stdout as ``text`` when it exits with 0;
+            otherwise a dict with ``'error'`` and an empty ``'text'``. A
+            non-zero exit also reports the tool's stderr under ``'details'``.
+        """
+        try:
+            result = subprocess.run(
+                ['antiword', str(file_path)],
+                capture_output=True,
+                text=True,
+                timeout=30
+            )
+        except FileNotFoundError:
+            return {'error': 'antiword not installed', 'text': '', 'needs': 'sudo apt install antiword'}
+        except Exception as e:
+            logger.error('antiword could not be run for %s: %s', file_path, e)
+            return {'error': str(e), 'text': ''}
+
+        if result.returncode == 0:
+            return {'text': result.stdout, 'quality': 'good', 'method': 'antiword'}
+
+        # Keep the tool's own diagnostics so the failure can be investigated.
+        details = (result.stderr or '').strip()
+        logger.error('antiword failed for %s: %s', file_path, details)
+        return {'error': 'antiword failed', 'text': '', 'needs': 'antiword tool', 'details': details}
--- a/app/parsers/image_parser.py
+++ b/app/parsers/image_parser.py
@@ -0,0 +1,63 @@
+from pathlib import Path
+from typing import Dict
+import logging
+
+logger = logging.getLogger(__name__)
+
+class ImageParser:
+    """Extract text from images with Tesseract OCR and read image metadata.
+
+    Failures are reported as a dict carrying an ``'error'`` key rather than
+    being raised to the caller.
+    """
+
+    def __init__(self):
+        """Set up the set of accepted file suffixes."""
+        self.supported_formats = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'}
+
+    def parse(self, file_path: Path) -> Dict:
+        """OCR ``file_path`` if its suffix is a supported image format.
+
+        Args:
+            file_path: Path of the image.
+
+        Returns:
+            The OCR result dict from ``_parse_with_ocr``, or a dict with
+            ``'error'`` and an empty ``'text'`` on failure.
+        """
+        if file_path.suffix.lower() not in self.supported_formats:
+            # Carry 'text' like every other error result of this method.
+            return {'error': f'Unsupported format: {file_path.suffix}', 'text': ''}
+
+        try:
+            return self._parse_with_ocr(file_path)
+        except Exception as e:
+            logger.error('Image parse failed for %s: %s', file_path, e)
+            return {'error': str(e), 'text': ''}
+
+    @staticmethod
+    def _average_confidence(conf_values) -> float:
+        """Average the non-negative confidence values in ``conf_values``.
+
+        Entries may be ints, floats or numeric strings. Negative entries (the
+        -1 placeholder) and non-numeric entries are ignored. Returns 0 when
+        nothing usable remains.
+        """
+        scores = []
+        for raw in conf_values:
+            try:
+                score = float(raw)
+            except (TypeError, ValueError):
+                continue
+            if score >= 0:
+                scores.append(score)
+        return sum(scores) / len(scores) if scores else 0
+
+    def _parse_with_ocr(self, file_path: Path) -> Dict:
+        """Run Tesseract on ``file_path`` and grade the result by confidence.
+
+        Returns:
+            On success a dict with ``text``, ``quality`` ('good' above 80,
+            'medium' above 60, otherwise 'low'), ``confidence``, ``method``
+            and ``metadata`` (width, height, format). On failure a dict with
+            ``'error'`` and an empty ``'text'``.
+        """
+        try:
+            from PIL import Image
+            import pytesseract
+
+            # The context manager closes the file handle Image.open holds.
+            with Image.open(str(file_path)) as img:
+                text = pytesseract.image_to_string(img)
+                data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
+                metadata = {
+                    'width': img.width,
+                    'height': img.height,
+                    'format': img.format
+                }
+
+            avg_confidence = self._average_confidence(data['conf'])
+            if avg_confidence > 80:
+                quality = 'good'
+            elif avg_confidence > 60:
+                quality = 'medium'
+            else:
+                quality = 'low'
+
+            return {
+                'text': text.strip(),
+                'quality': quality,
+                'confidence': avg_confidence,
+                'method': 'tesseract',
+                'metadata': metadata
+            }
+        except ImportError as ie:
+            logger.warning('Missing library for OCR: %s', ie)
+            return {'error': f'Missing library: {ie}', 'text': '', 'needs': 'pytesseract and tesseract-ocr'}
+        except Exception as e:
+            # Log here: this handler swallows the error before parse() sees it.
+            logger.error('OCR failed for %s: %s', file_path, e)
+            return {'error': str(e), 'text': ''}
+
+    def extract_metadata(self, file_path: Path) -> Dict:
+        """Read width, height, format and mode of the image at ``file_path``.
+
+        Returns:
+            A dict with ``width``, ``height``, ``format`` and ``mode``, or a
+            dict with ``'error'`` on failure.
+        """
+        try:
+            from PIL import Image
+            # The context manager closes the file handle Image.open holds.
+            with Image.open(str(file_path)) as img:
+                return {
+                    'width': img.width,
+                    'height': img.height,
+                    'format': img.format,
+                    'mode': img.mode
+                }
+        except Exception as e:
+            return {'error': str(e)}