This commit is contained in:
mike
2025-12-13 11:56:06 +01:00
commit 2b2c575385
57 changed files with 6505 additions and 0 deletions

View File

@@ -0,0 +1,44 @@
from pathlib import Path
from typing import Dict
import re
class CodeParser:
    """Regex-based parser that summarises the structure of a source file.

    The parser does not build a syntax tree. For each supported language it
    counts the lines that match a small set of line-anchored regular
    expressions (imports, classes, functions, ...). Because every pattern is
    anchored with ``^``, only constructs that start in the first column of a
    line are counted; indented definitions such as methods are not.
    """

    def __init__(self):
        # TypeScript shares the JavaScript heuristics: '.ts' files are
        # detected as 'typescript', so they need a pattern table too,
        # otherwise their structure summary is always empty.
        javascript_patterns = {
            'imports': r'^import |^require\(',
            'class': r'^class \w+',
            'function': r'^function \w+|^const \w+ = ',
        }
        # language -> {construct name -> regex applied with re.MULTILINE}
        self.patterns = {
            'python': {'imports': r'^import |^from .+ import', 'class': r'^class \w+', 'function': r'^def \w+'},
            'javascript': javascript_patterns,
            'typescript': dict(javascript_patterns),
            'java': {'package': r'^package ', 'imports': r'^import ', 'class': r'^public class \w+'},
            'go': {'package': r'^package ', 'imports': r'^import ', 'function': r'^func \w+'}
        }

    def parse(self, file_path: Path) -> Dict:
        """Read *file_path* as UTF-8 text and return a summary dict.

        Undecodable bytes are dropped (``errors='ignore'``).

        Returns:
            On success, a dict with the keys ``text``, ``language``,
            ``line_count``, ``structure`` and ``quality`` (always ``'high'``).
            On any exception, ``{'error': <message>}``.
        """
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()
            language = self._detect_language(file_path, text)
            structure = self._extract_structure(text, language)
            return {
                'text': text,
                'language': language,
                'line_count': self._count_lines(text),
                'structure': structure,
                'quality': 'high'
            }
        except Exception as e:
            return {'error': str(e)}

    @staticmethod
    def _count_lines(text: str) -> int:
        """Return the number of lines in *text*.

        A trailing newline terminates the last line rather than starting a
        new one, so ``'a\\nb\\n'`` has 2 lines and the empty string has 0.
        """
        count = text.count('\n')
        if text and not text.endswith('\n'):
            # Final line without a terminating newline still counts.
            count += 1
        return count

    def _detect_language(self, file_path: Path, text: str) -> str:
        """Map the file extension (case-insensitive) to a language name.

        *text* is accepted for interface compatibility but is not inspected.
        Returns ``'unknown'`` for extensions that are not in the map.
        """
        lang_map = {'.py': 'python', '.js': 'javascript', '.ts': 'typescript', '.java': 'java', '.go': 'go'}
        # Path(...) lets plain string paths work as well as Path objects.
        return lang_map.get(Path(file_path).suffix.lower(), 'unknown')

    def _extract_structure(self, text: str, language: str) -> Dict:
        """Count pattern matches in *text* for the given *language*.

        Returns a dict with ``type`` and ``language`` plus one integer count
        per construct configured for that language. Languages without a
        pattern table yield only the two fixed keys.
        """
        patterns = self.patterns.get(language, {})
        structure = {'type': 'code', 'language': language}
        for key, pattern in patterns.items():
            structure[key] = len(re.findall(pattern, text, re.MULTILINE))
        return structure

View File

@@ -0,0 +1,42 @@
from pathlib import Path
from typing import Dict
class MediaParser:
    """Produce placeholder parse results for audio, video and image files.

    No transcription, captioning or OCR happens here; each method returns a
    dict whose flags mark the work as still pending.
    """

    def parse_audio(self, file_path: Path) -> Dict:
        """Return the pending-transcription placeholder for an audio file.

        *file_path* is not read.
        """
        result: Dict = {'text': '[Audio transcription pending]'}
        result['needs_transcription'] = True
        result['transcription_service'] = 'whisper'
        result['structure'] = {'type': 'audio'}
        result['quality'] = 'pending'
        return result

    def parse_video(self, file_path: Path) -> Dict:
        """Return the pending-transcription placeholder for a video file.

        Same shape as the audio placeholder plus a scene-detection flag.
        *file_path* is not read.
        """
        result: Dict = {'text': '[Video transcription pending]'}
        result['needs_transcription'] = True
        result['needs_scene_detection'] = True
        result['transcription_service'] = 'whisper'
        result['structure'] = {'type': 'video'}
        result['quality'] = 'pending'
        return result

    def parse_image(self, file_path: Path) -> Dict:
        """Open the image with Pillow and report its size and colour mode.

        Returns a placeholder dict flagged for OCR and captioning, or
        ``{'error': <message>}`` if anything fails (including Pillow not
        being importable).
        """
        try:
            # Imported lazily so a missing Pillow becomes an error result.
            from PIL import Image
            with Image.open(file_path) as picture:
                size = picture.size
                colour_mode = picture.mode
            px_wide, px_high = size
            layout = {'type': 'image', 'width': px_wide, 'height': px_high}
            return {
                'text': '[Image caption/OCR pending]',
                'needs_ocr': True,
                'needs_caption': True,
                'dimensions': f'{px_wide}x{px_high}',
                'mode': colour_mode,
                'structure': layout,
                'quality': 'pending'
            }
        except Exception as exc:
            return {'error': str(exc)}

31
app/parsers/pdf_parser.py Normal file
View File

@@ -0,0 +1,31 @@
from pathlib import Path
from typing import Dict, List
class PDFParser:
    """Extract text from the leading pages of a PDF using PyPDF2.

    The limits below are class attributes so a subclass (or an instance) can
    override them without changing ``parse``'s signature.
    """

    # Maximum number of pages whose text is extracted per document.
    MAX_PAGES = 50
    # Number of leading page records included in the 'structure' summary.
    PREVIEW_PAGES = 5
    # The PDF is considered to have a text layer when the extracted pages
    # contain more than this many characters in total.
    MIN_TEXT_CHARS = 100

    def parse(self, file_path: Path) -> Dict:
        """Parse *file_path* and return its text plus extraction metadata.

        Returns:
            On success, a dict with ``text`` (page texts joined by blank
            lines), ``page_count`` (all pages in the document),
            ``pages_extracted``, ``has_text_layer``, ``needs_ocr``,
            ``structure`` and ``quality``.
            On any exception (including PyPDF2 not being importable),
            ``{'error': <message>, 'needs_ocr': True}``.
        """
        try:
            # Imported lazily so a missing PyPDF2 becomes an error result.
            import PyPDF2
            pages = []
            with open(file_path, 'rb') as f:
                pdf = PyPDF2.PdfReader(f)
                page_count = len(pdf.pages)
                for number, page in enumerate(pdf.pages[:self.MAX_PAGES], start=1):
                    # Defensive: treat a None/empty extraction result as an
                    # empty page so one blank page cannot fail the whole
                    # document through len(None).
                    text = page.extract_text() or ''
                    pages.append({'page': number, 'text': text, 'char_count': len(text)})
            full_text = '\n\n'.join(p['text'] for p in pages)
            total_chars = sum(p['char_count'] for p in pages)
            has_text_layer = total_chars > self.MIN_TEXT_CHARS
            return {
                'text': full_text,
                'page_count': page_count,
                'pages_extracted': len(pages),
                'has_text_layer': has_text_layer,
                'needs_ocr': not has_text_layer,
                'structure': {'type': 'document', 'pages': pages[:self.PREVIEW_PAGES]},
                'quality': 'high' if has_text_layer else 'needs_ocr'
            }
        except Exception as e:
            return {'error': str(e), 'needs_ocr': True}

View File

@@ -0,0 +1,26 @@
from pathlib import Path
from typing import Dict, Optional
import chardet
class TextParser:
    """Decode a plain-text file using chardet's encoding guess.

    Only the first ``MAX_BYTES`` bytes of the file are read, so for larger
    files the returned text and all counts describe that prefix only.
    """

    # Read cap in bytes (1 MiB).
    MAX_BYTES = 1024 * 1024
    # Detected encodings rated 'high': UTF-8 itself, UTF-8 with a BOM, and
    # ASCII (every ASCII byte sequence is also valid UTF-8).
    _UTF8_COMPATIBLE = frozenset({'ascii', 'utf-8', 'utf-8-sig'})

    def parse(self, file_path: Path) -> Dict:
        """Read, decode and measure the text in *file_path*.

        Returns:
            On success, a dict with ``text``, ``encoding``, ``line_count``,
            ``char_count``, ``word_count``, ``structure`` and ``quality``
            (``'high'`` for UTF-8-compatible encodings, else ``'medium'``).
            On any exception, ``{'error': <message>}``.
        """
        try:
            with open(file_path, 'rb') as f:
                raw_data = f.read(self.MAX_BYTES)
            # chardet yields None when it cannot decide (e.g. empty input).
            encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
            try:
                text = raw_data.decode(encoding, errors='ignore')
            except LookupError:
                # The detected name has no Python codec; fall back to UTF-8
                # instead of failing the whole parse.
                encoding = 'utf-8'
                text = raw_data.decode(encoding, errors='ignore')
            # A trailing newline ends the last line; it does not start
            # another one. Empty text has zero lines.
            line_count = text.count('\n')
            if text and not text.endswith('\n'):
                line_count += 1
            # Compare case-insensitively and tolerate '_' vs '-' spellings.
            normalized = encoding.lower().replace('_', '-')
            return {
                'text': text,
                'encoding': encoding,
                'line_count': line_count,
                'char_count': len(text),
                'word_count': len(text.split()),
                'structure': {'type': 'plain_text'},
                'quality': 'high' if normalized in self._UTF8_COMPATIBLE else 'medium'
            }
        except Exception as e:
            return {'error': str(e)}