initial

2025-12-13 11:56:06 +01:00
commit 2b2c575385
57 changed files with 6505 additions and 0 deletions
--- a/app/parsers/code_parser.py
+++ b/app/parsers/code_parser.py
@@ -0,0 +1,78 @@
+from pathlib import Path
+from typing import Dict
+import re
+
+class CodeParser:
+    """Regex-based summariser for source-code files.
+
+    ``parse`` reads a file as text, maps its extension to a language name and
+    counts constructs (imports, classes, functions, ...) using one regular
+    expression per construct.
+    """
+
+    def __init__(self):
+        # Every pattern is anchored with ``^`` and applied with re.MULTILINE,
+        # so only constructs that begin at the start of a line are counted.
+        javascript = {'imports': r'^import |^require\(', 'class': r'^class \w+', 'function': r'^function \w+|^const \w+ = '}
+        self.patterns = {
+            'python': {'imports': r'^import |^from .+ import', 'class': r'^class \w+', 'function': r'^def \w+'},
+            'javascript': javascript,
+            # '.ts' files are detected as 'typescript'; reuse the JavaScript
+            # patterns (as a separate copy) so they get structure counts too.
+            'typescript': dict(javascript),
+            'java': {'package': r'^package ', 'imports': r'^import ', 'class': r'^public class \w+'},
+            'go': {'package': r'^package ', 'imports': r'^import ', 'function': r'^func \w+'}
+        }
+
+    def parse(self, file_path: Path) -> Dict:
+        """Read *file_path* and return its text with basic structure metadata.
+
+        On success the result has the keys ``text``, ``language``,
+        ``line_count``, ``structure`` and ``quality``.  On any failure the
+        result is ``{'error': <message>}``; exceptions are not propagated.
+        """
+        try:
+            # Bytes that are not valid UTF-8 are dropped instead of failing.
+            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                text = f.read()
+
+            language = self._detect_language(file_path, text)
+            structure = self._extract_structure(text, language)
+
+            # Count newline-terminated lines plus a final unterminated one, so
+            # 'a\nb\n' is 2 lines (split('\n') reports 3) and '' is 0 lines.
+            line_count = text.count('\n')
+            if text and not text.endswith('\n'):
+                line_count += 1
+
+            return {
+                'text': text,
+                'language': language,
+                'line_count': line_count,
+                'structure': structure,
+                'quality': 'high'
+            }
+        except Exception as e:
+            return {'error': str(e)}
+
+    def _detect_language(self, file_path: Path, text: str) -> str:
+        """Map the file extension to a language name, or ``'unknown'``.
+
+        *text* is accepted but currently not inspected.
+        """
+        lang_map = {'.py': 'python', '.js': 'javascript', '.ts': 'typescript', '.java': 'java', '.go': 'go'}
+        # Wrapping in Path() lets a plain string path work as well.
+        return lang_map.get(Path(file_path).suffix.lower(), 'unknown')
+
+    def _extract_structure(self, text: str, language: str) -> Dict:
+        """Count matches of each of *language*'s patterns in *text*.
+
+        A language without patterns yields only the ``type`` and ``language`` keys.
+        """
+        patterns = self.patterns.get(language, {})
+        structure = {'type': 'code', 'language': language}
+
+        for key, pattern in patterns.items():
+            structure[key] = len(re.findall(pattern, text, re.MULTILINE))
+
+        return structure
--- a/app/parsers/media_parser.py
+++ b/app/parsers/media_parser.py
@@ -0,0 +1,58 @@
+from pathlib import Path
+from typing import Dict
+
+class MediaParser:
+    """Build placeholder parse results for audio, video and image files."""
+
+    def parse_audio(self, file_path: Path) -> Dict:
+        """Return a fixed placeholder flagging the file for transcription.
+
+        *file_path* is accepted but not read.
+        """
+        return dict(
+            text='[Audio transcription pending]',
+            needs_transcription=True,
+            transcription_service='whisper',
+            structure=dict(type='audio'),
+            quality='pending',
+        )
+
+    def parse_video(self, file_path: Path) -> Dict:
+        """Return a fixed placeholder requesting transcription and scene detection.
+
+        *file_path* is accepted but not read.
+        """
+        return dict(
+            text='[Video transcription pending]',
+            needs_transcription=True,
+            needs_scene_detection=True,
+            transcription_service='whisper',
+            structure=dict(type='video'),
+            quality='pending',
+        )
+
+    def parse_image(self, file_path: Path) -> Dict:
+        """Read the image's size and mode and flag it for OCR and captioning.
+
+        Any failure, including a failed Pillow import, is returned as
+        ``{'error': <message>}`` rather than raised.
+        """
+        try:
+            from PIL import Image
+
+            with Image.open(file_path) as picture:
+                width, height = picture.size
+                mode = picture.mode
+
+            shape = {'type': 'image', 'width': width, 'height': height}
+            return {
+                'text': '[Image caption/OCR pending]',
+                'needs_ocr': True,
+                'needs_caption': True,
+                'dimensions': f'{width}x{height}',
+                'mode': mode,
+                'structure': shape,
+                'quality': 'pending'
+            }
+        except Exception as exc:
+            return {'error': str(exc)}
--- a/app/parsers/pdf_parser.py
+++ b/app/parsers/pdf_parser.py
@@ -0,0 +1,50 @@
+from pathlib import Path
+from typing import Dict, List
+
+class PDFParser:
+    """Extract text from PDF files with PyPDF2 and flag files that need OCR."""
+
+    # Limits that used to be inline literals; override them on a subclass or
+    # instance to tune the parser.
+    MAX_PAGES = 50          # at most this many leading pages are read
+    MIN_TEXT_CHARS = 100    # extracted chars that must be exceeded to assume a text layer
+    PREVIEW_PAGES = 5       # per-page entries kept in the 'structure' field
+
+    def parse(self, file_path: Path) -> Dict:
+        """Extract text from up to ``MAX_PAGES`` pages of *file_path*.
+
+        On success the result has the keys ``text``, ``page_count`` (pages in
+        the document), ``pages_extracted`` (pages actually read),
+        ``has_text_layer``, ``needs_ocr``, ``structure`` and ``quality``.
+        On any failure, including a failed PyPDF2 import, the result is
+        ``{'error': <message>, 'needs_ocr': True}``; nothing is raised.
+        """
+        try:
+            import PyPDF2
+
+            pages: List[Dict] = []
+            with open(file_path, 'rb') as f:
+                pdf = PyPDF2.PdfReader(f)
+                page_count = len(pdf.pages)
+
+                for i, page in enumerate(pdf.pages[:self.MAX_PAGES]):
+                    # Normalise a None/empty result to '' so len() and the
+                    # join below cannot fail on a page without text.
+                    text = page.extract_text() or ''
+                    pages.append({'page': i + 1, 'text': text, 'char_count': len(text)})
+
+            full_text = '\n\n'.join(p['text'] for p in pages)
+            extracted_chars = sum(p['char_count'] for p in pages)
+            has_text_layer = extracted_chars > self.MIN_TEXT_CHARS
+
+            return {
+                'text': full_text,
+                'page_count': page_count,
+                'pages_extracted': len(pages),
+                'has_text_layer': has_text_layer,
+                'needs_ocr': not has_text_layer,
+                'structure': {'type': 'document', 'pages': pages[:self.PREVIEW_PAGES]},
+                'quality': 'high' if has_text_layer else 'needs_ocr'
+            }
+        except Exception as e:
+            return {'error': str(e), 'needs_ocr': True}
--- a/app/parsers/text_parser.py
+++ b/app/parsers/text_parser.py
@@ -0,0 +1,61 @@
+from pathlib import Path
+from typing import Dict, Optional
+import chardet
+
+class TextParser:
+    """Decode a text file using a chardet-detected encoding and report basic counts."""
+
+    # Only this many leading bytes of a file are read and analysed.
+    MAX_BYTES = 1024 * 1024
+    # Lower-cased encoding labels rated 'high': ASCII and BOM-prefixed UTF-8
+    # are treated the same as plain 'utf-8'.
+    _UTF8_COMPATIBLE = frozenset({'utf-8', 'utf-8-sig', 'ascii'})
+
+    def parse(self, file_path: Path) -> Dict:
+        """Read up to ``MAX_BYTES`` of *file_path*, decode it and count its contents.
+
+        On success the result has the keys ``text``, ``encoding``,
+        ``line_count``, ``char_count``, ``word_count``, ``structure``,
+        ``quality`` and ``truncated`` (True when the file is longer than
+        ``MAX_BYTES``, in which case every count covers only the part read).
+        On any failure the result is ``{'error': <message>}``.
+        """
+        try:
+            with open(file_path, 'rb') as f:
+                # Read one byte past the cap purely to learn whether more data exists.
+                raw_data = f.read(self.MAX_BYTES + 1)
+            truncated = len(raw_data) > self.MAX_BYTES
+            raw_data = raw_data[:self.MAX_BYTES]
+
+            encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
+            try:
+                text = raw_data.decode(encoding, errors='ignore')
+            except LookupError:
+                # The detected label is not guaranteed to name a codec Python
+                # knows; fall back to UTF-8 instead of failing the whole parse.
+                encoding = 'utf-8'
+                text = raw_data.decode(encoding, errors='ignore')
+
+            return {
+                'text': text,
+                'encoding': encoding,
+                'line_count': self._count_lines(text),
+                'char_count': len(text),
+                'word_count': len(text.split()),
+                'structure': {'type': 'plain_text'},
+                'quality': self._quality_for(encoding),
+                'truncated': truncated
+            }
+        except Exception as e:
+            return {'error': str(e)}
+
+    @staticmethod
+    def _count_lines(text: str) -> int:
+        """Count '\\n'-terminated lines plus a final unterminated one ('' is 0 lines)."""
+        trailing = 1 if text and not text.endswith('\n') else 0
+        return text.count('\n') + trailing
+
+    @staticmethod
+    def _quality_for(encoding: str) -> str:
+        """Rate UTF-8-compatible encodings 'high' and everything else 'medium'."""
+        return 'high' if encoding.lower() in TextParser._UTF8_COMPATIBLE else 'medium'