initial
This commit is contained in:
3
app/content/__init__.py
Normal file
3
app/content/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from .profiler import ContentProfiler
|
||||
from .extractors import ContentExtractor
|
||||
__all__ = ['ContentProfiler', 'ContentExtractor']
|
||||
62
app/content/extractors.py
Normal file
62
app/content/extractors.py
Normal file
@@ -0,0 +1,62 @@
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional
|
||||
import json
|
||||
|
||||
class ContentExtractor:
    """Run the extraction routine registered for an extractor key.

    Each routine takes a file path and returns a plain dict. Text-like
    routines return the extracted text plus a ``needs_llm`` flag; the
    image/audio/video routines do not read the file at all and only return
    a descriptor of the processing still pending. Failures are reported as
    a dict with an ``'error'`` key rather than raised.
    """

    # Maximum number of characters read from a text or code file.
    MAX_TEXT_CHARS = 1024 * 1024
    # Number of leading PDF pages whose text is extracted.
    MAX_PDF_PAGES = 10

    def __init__(self):
        # Extractor key -> bound method implementing it.
        self.extractors = {
            'pdf_text': self._extract_pdf,
            'ocr+caption': self._extract_image,
            'transcribe': self._extract_audio,
            'transcribe+scenes': self._extract_video,
            'office_text': self._extract_document,
            'read': self._extract_text,
            'read+syntax': self._extract_code,
        }

    def extract(self, file_path: Path, extractor_type: str) -> Dict:
        """Extract content from ``file_path`` with the named extractor.

        Args:
            file_path: File to process.
            extractor_type: Key into ``self.extractors``.

        Returns:
            The extractor's result dict, or ``{'error': ...}`` when the key
            is not registered or the extractor raises.
        """
        extractor = self.extractors.get(extractor_type)
        if not extractor:
            return {'error': f'Unknown extractor: {extractor_type}'}
        try:
            return extractor(file_path)
        except Exception as e:
            # Boundary handler: callers get an error dict, never an exception.
            return {'error': str(e)}

    def _extract_text(self, file_path: Path) -> Dict:
        """Read up to ``MAX_TEXT_CHARS`` characters of the file as UTF-8.

        Undecodable bytes are dropped (``errors='ignore'``).
        """
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read(self.MAX_TEXT_CHARS)
            return {'text': content, 'char_count': len(content), 'needs_llm': False}
        except Exception as e:
            return {'error': str(e)}

    def _extract_code(self, file_path: Path) -> Dict:
        """Read the file as text and, on success, tag the result as code."""
        result = self._extract_text(file_path)
        if 'error' not in result:
            result['type'] = 'code'
            result['needs_llm'] = True
        return result

    def _extract_pdf(self, file_path: Path) -> Dict:
        """Extract text from the first ``MAX_PDF_PAGES`` pages of a PDF.

        ``needs_llm`` is set when more than 100 non-blank characters were
        extracted. Any failure (including PyPDF2 being unavailable) yields
        an error dict flagged ``needs_ocr``.
        """
        try:
            import PyPDF2

            with open(file_path, 'rb') as f:
                pdf = PyPDF2.PdfReader(f)
                # `or ''` keeps the join below safe should a page yield None
                # instead of a string.
                text_parts = [
                    page.extract_text() or ''
                    for page in pdf.pages[:self.MAX_PDF_PAGES]
                ]
            text = '\n'.join(text_parts)
            return {
                'text': text,
                'pages_extracted': len(text_parts),
                'needs_llm': len(text.strip()) > 100,
                'type': 'document',
            }
        except Exception as e:
            return {'error': str(e), 'needs_ocr': True}

    def _extract_image(self, file_path: Path) -> Dict:
        """Return the pending-work descriptor for an image (file is not read)."""
        return {
            'type': 'image',
            'needs_ocr': True,
            'needs_caption': True,
            'needs_llm': True,
            'pipeline': ['ocr', 'caption', 'embedding'],
            'status': 'pending',
        }

    def _extract_audio(self, file_path: Path) -> Dict:
        """Return the pending-work descriptor for audio (file is not read)."""
        return {
            'type': 'audio',
            'needs_transcription': True,
            'needs_llm': True,
            'pipeline': ['transcribe', 'summarize'],
            'status': 'pending',
        }

    def _extract_video(self, file_path: Path) -> Dict:
        """Return the pending-work descriptor for video (file is not read)."""
        return {
            'type': 'video',
            'needs_transcription': True,
            'needs_scene_detection': True,
            'needs_llm': True,
            'pipeline': ['transcribe', 'scenes', 'summarize'],
            'status': 'pending',
        }

    def _extract_document(self, file_path: Path) -> Dict:
        """Extract text from an office document via ``textract``.

        On any failure the result keeps the fixed ``'textract failed'``
        error string and ``needs_llm=True``, and additionally carries the
        underlying reason under ``'detail'``.
        """
        try:
            import textract

            text = textract.process(str(file_path)).decode('utf-8')
        except Exception as e:
            # `Exception` (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate.
            return {'error': 'textract failed', 'needs_llm': True, 'detail': str(e)}
        return {'text': text, 'type': 'document', 'needs_llm': len(text.strip()) > 100}
|
||||
108
app/content/profiler.py
Normal file
108
app/content/profiler.py
Normal file
@@ -0,0 +1,108 @@
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional, Tuple
|
||||
import mimetypes
|
||||
import magic
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
class ContentProfiler:
    """Build a metadata profile (size, mtime, MIME type, kind, hints) for a file.

    The MIME type comes from ``python-magic`` with an extension-based
    ``mimetypes`` fallback. The MIME type is mapped to a coarse "kind",
    which in turn selects a suggested extractor key and a set of
    kind-specific hints.
    """

    # Kind -> suggested extractor key; kinds not listed get no extractor.
    _EXTRACTOR_BY_KIND = {
        'pdf': 'pdf_text',
        'image': 'ocr+caption',
        'audio': 'transcribe',
        'video': 'transcribe+scenes',
        'document': 'office_text',
        'text': 'read',
        'code': 'read+syntax',
    }

    # Lower-cased file suffix -> language name.
    _LANGUAGE_BY_SUFFIX = {
        '.py': 'python',
        '.js': 'javascript',
        '.ts': 'typescript',
        '.java': 'java',
        '.go': 'go',
        '.rs': 'rust',
        '.c': 'c',
        '.cpp': 'cpp',
        '.cs': 'csharp',
        '.rb': 'ruby',
        '.php': 'php',
    }

    def __init__(self):
        self.mime_detector = magic.Magic(mime=True)
        # Kind -> MIME fragments; a kind matches when any fragment is a
        # substring of the detected MIME type (see _determine_kind).
        self.kind_mapping = {
            'text': ['text/plain', 'text/html', 'text/css', 'text/javascript', 'text/markdown'],
            'code': ['application/x-python', 'application/javascript', 'text/x-java', 'text/x-c'],
            'pdf': ['application/pdf'],
            'image': ['image/jpeg', 'image/png', 'image/gif', 'image/webp', 'image/svg+xml'],
            'audio': ['audio/mpeg', 'audio/wav', 'audio/ogg', 'audio/flac'],
            'video': ['video/mp4', 'video/x-matroska', 'video/avi', 'video/webm'],
            'archive': ['application/zip', 'application/x-tar', 'application/gzip', 'application/x-7z-compressed'],
            'document': ['application/msword', 'application/vnd.openxmlformats-officedocument'],
            'spreadsheet': ['application/vnd.ms-excel', 'text/csv'],
        }
        # Suffix fallbacks used only when no MIME fragment matched.
        self.text_exts = {'.txt', '.md', '.rst', '.log', '.json', '.xml', '.yaml', '.yml', '.toml', '.ini', '.cfg'}
        self.code_exts = {'.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h', '.cs', '.rb', '.php'}
        self.processable_kinds = {'text', 'code', 'pdf', 'image', 'audio', 'video', 'document'}

    def profile_file(self, file_path: Path) -> Dict:
        """Return the profile dict for ``file_path``.

        Returns:
            On success a dict with ``path``, ``size``, ``mtime`` (ISO
            string), ``mime``, ``kind``, ``processable``, ``extractor`` and
            ``hints``. On any failure a dict with ``path``, ``error`` and
            ``processable=False``.
        """
        try:
            stat = file_path.stat()
            size = stat.st_size
            mtime = datetime.fromtimestamp(stat.st_mtime)
            mime_type = self._detect_mime(file_path)
            kind = self._determine_kind(file_path, mime_type)
            return {
                'path': str(file_path),
                'size': size,
                'mtime': mtime.isoformat(),
                'mime': mime_type,
                'kind': kind,
                'processable': kind in self.processable_kinds,
                'extractor': self._suggest_extractor(kind, mime_type),
                'hints': self._extract_hints(file_path, kind, mime_type, size),
            }
        except Exception as e:
            # Boundary handler: profiling never raises to the caller.
            return {'path': str(file_path), 'error': str(e), 'processable': False}

    def _detect_mime(self, file_path: Path) -> str:
        """Detect the MIME type, falling back to an extension-based guess.

        Returns ``'application/octet-stream'`` when the fallback has no guess.
        """
        try:
            return self.mime_detector.from_file(str(file_path))
        except Exception:
            guess = mimetypes.guess_type(str(file_path))[0]
            return guess or 'application/octet-stream'

    def _determine_kind(self, file_path: Path, mime_type: str) -> str:
        """Map a MIME type (then the file suffix) to a kind, else ``'unknown'``."""
        # NOTE(review): kinds are tried in kind_mapping order and the suffix
        # sets are consulted only when no MIME fragment matches, so a source
        # file reported as 'text/plain' is classified 'text' - confirm that
        # this is intended.
        for kind, mimes in self.kind_mapping.items():
            if any(mime in mime_type for mime in mimes):
                return kind
        suffix = file_path.suffix.lower()
        if suffix in self.text_exts:
            return 'text'
        if suffix in self.code_exts:
            return 'code'
        return 'unknown'

    def _suggest_extractor(self, kind: str, mime_type: str) -> Optional[str]:
        """Return the extractor key for ``kind``, or None if there is none.

        ``mime_type`` is accepted but not used.
        """
        return self._EXTRACTOR_BY_KIND.get(kind)

    def _extract_hints(self, file_path: Path, kind: str, mime_type: str, size: int) -> Dict:
        """Collect kind-specific hints; individual hint values may be None."""
        hints = {}
        if kind in ('text', 'code'):
            hints['language'] = self._guess_language(file_path)
            # Lines are only counted for files under 1 MiB.
            if size < 1024 * 1024:
                hints['lines'] = self._count_lines(file_path)
        if kind == 'pdf':
            hints['page_count'] = self._get_pdf_pages(file_path)
        if kind in ('audio', 'video'):
            hints['duration'] = self._get_media_duration(file_path)
        if kind == 'image':
            hints['has_exif'] = self._has_exif(file_path)
            hints['dimensions'] = self._get_image_dimensions(file_path)
        return hints

    def _guess_language(self, file_path: Path) -> Optional[str]:
        """Return the language for the file's suffix, or None if unmapped."""
        return self._LANGUAGE_BY_SUFFIX.get(file_path.suffix.lower())

    def _count_lines(self, file_path: Path) -> Optional[int]:
        """Count lines by iterating the file in binary mode; None on failure."""
        try:
            with open(file_path, 'rb') as f:
                return sum(1 for _ in f)
        except Exception:
            return None

    def _get_pdf_pages(self, file_path: Path) -> Optional[int]:
        """Return the PDF page count via PyPDF2; None on any failure."""
        try:
            import PyPDF2

            with open(file_path, 'rb') as f:
                pdf = PyPDF2.PdfReader(f)
                return len(pdf.pages)
        except Exception:
            return None

    def _get_media_duration(self, file_path: Path) -> Optional[float]:
        """Return ``format.duration`` from ``ffmpeg.probe``; None on any failure."""
        try:
            import ffmpeg

            probe = ffmpeg.probe(str(file_path))
            return float(probe['format']['duration'])
        except Exception:
            return None

    def _has_exif(self, file_path: Path) -> bool:
        """Report whether the image exposes EXIF data; False on any failure."""
        try:
            from PIL import Image

            # Context manager closes the underlying file handle.
            with Image.open(file_path) as img:
                return hasattr(img, '_getexif') and img._getexif() is not None
        except Exception:
            return False

    def _get_image_dimensions(self, file_path: Path) -> Optional[Tuple[int, int]]:
        """Return the image's ``size`` as reported by Pillow; None on failure."""
        try:
            from PIL import Image

            with Image.open(file_path) as img:
                return img.size
        except Exception:
            return None
|
||||
Reference in New Issue
Block a user