import json
import logging
import mimetypes
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional, Tuple

try:
    import magic
except ImportError:
    # python-magic raises ImportError both when the package is absent and
    # when it cannot locate the libmagic shared library.  MIME detection then
    # relies on the extension-based ``mimetypes`` fallback in ``_detect_mime``.
    magic = None

logger = logging.getLogger(__name__)


class ContentProfiler:
    """Build a lightweight metadata profile for a file on disk.

    A profile records size, modification time, MIME type, a coarse "kind"
    (``text``, ``code``, ``pdf``, ``image``, ...), whether the kind is in
    ``processable_kinds``, a suggested extractor name, and kind-specific
    hints such as line count or image dimensions.

    Hint helpers import their third-party backends (PyPDF2, ffmpeg, PIL)
    lazily and are best-effort: any failure yields ``None``/``False`` rather
    than an exception.
    """

    # Kind -> name of the extractor suggested for it.
    _EXTRACTORS = {
        'pdf': 'pdf_text',
        'image': 'ocr+caption',
        'audio': 'transcribe',
        'video': 'transcribe+scenes',
        'document': 'office_text',
        'text': 'read',
        'code': 'read+syntax',
    }

    # File suffix -> programming language name.
    _LANGUAGES = {
        '.py': 'python',
        '.js': 'javascript',
        '.ts': 'typescript',
        '.java': 'java',
        '.go': 'go',
        '.rs': 'rust',
        '.c': 'c',
        '.cpp': 'cpp',
        '.cs': 'csharp',
        '.rb': 'ruby',
        '.php': 'php',
    }

    # Text/code files at or above this size do not get a 'lines' hint.
    _LINE_COUNT_MAX_BYTES = 1024 * 1024

    def __init__(self):
        """Create the MIME detector and the kind/extension lookup tables."""
        # None when python-magic is unavailable; _detect_mime copes with that.
        self.mime_detector = magic.Magic(mime=True) if magic is not None else None
        # Matching is by substring and in insertion order (see
        # _determine_kind), so earlier kinds win when several entries match.
        # NOTE(review): 'application/vnd.openxmlformats-officedocument' is a
        # prefix of the xlsx/pptx MIME types as well, so those are classified
        # as 'document' -- confirm this is intended.
        self.kind_mapping = {
            'text': ['text/plain', 'text/html', 'text/css', 'text/javascript', 'text/markdown'],
            'code': ['application/x-python', 'application/javascript', 'text/x-java', 'text/x-c'],
            'pdf': ['application/pdf'],
            'image': ['image/jpeg', 'image/png', 'image/gif', 'image/webp', 'image/svg+xml'],
            'audio': ['audio/mpeg', 'audio/wav', 'audio/ogg', 'audio/flac'],
            'video': ['video/mp4', 'video/x-matroska', 'video/avi', 'video/webm'],
            'archive': ['application/zip', 'application/x-tar', 'application/gzip',
                        'application/x-7z-compressed'],
            'document': ['application/msword', 'application/vnd.openxmlformats-officedocument'],
            'spreadsheet': ['application/vnd.ms-excel', 'text/csv'],
        }
        self.text_exts = {'.txt', '.md', '.rst', '.log', '.json', '.xml', '.yaml', '.yml',
                          '.toml', '.ini', '.cfg'}
        self.code_exts = {'.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h',
                          '.cs', '.rb', '.php'}
        self.processable_kinds = {'text', 'code', 'pdf', 'image', 'audio', 'video', 'document'}

    def profile_file(self, file_path: Path) -> Dict:
        """Return a metadata profile for ``file_path``.

        Args:
            file_path: Path of the file to inspect.

        Returns:
            On success, a dict with the keys ``path``, ``size``, ``mtime``
            (ISO-8601 string built from a naive local ``datetime``), ``mime``,
            ``kind``, ``processable``, ``extractor`` and ``hints``.
            On any failure, a dict with ``path``, ``error`` (the exception
            message) and ``processable`` set to ``False``.
        """
        try:
            stat = file_path.stat()
            size = stat.st_size
            mime_type = self._detect_mime(file_path)
            kind = self._determine_kind(file_path, mime_type)
            return {
                'path': str(file_path),
                'size': size,
                'mtime': datetime.fromtimestamp(stat.st_mtime).isoformat(),
                'mime': mime_type,
                'kind': kind,
                'processable': kind in self.processable_kinds,
                'extractor': self._suggest_extractor(kind, mime_type),
                'hints': self._extract_hints(file_path, kind, mime_type, size),
            }
        except Exception as e:
            # Deliberate catch-all boundary: profiling never raises; the
            # failure is reported to the caller inside the result instead.
            return {'path': str(file_path), 'error': str(e), 'processable': False}

    def _detect_mime(self, file_path: Path) -> str:
        """Return the MIME type of ``file_path``.

        Uses the content-based detector when one is available; if it is
        missing or fails, falls back to guessing from the file name and
        finally to ``'application/octet-stream'``.
        """
        detector = getattr(self, 'mime_detector', None)
        if detector is not None:
            try:
                return detector.from_file(str(file_path))
            except Exception:
                logger.debug("MIME detection failed for %s; guessing from name",
                             file_path, exc_info=True)
        guess = mimetypes.guess_type(str(file_path))[0]
        return guess or 'application/octet-stream'

    def _determine_kind(self, file_path: Path, mime_type: str) -> str:
        """Map a MIME type (and, failing that, the file suffix) to a kind.

        The first ``kind_mapping`` entry with a MIME string contained in
        ``mime_type`` wins. A recognised source-code suffix overrides a
        ``'text'`` match, because generic text MIME types are what detectors
        commonly report for source files. Without any MIME match the suffix
        alone decides; the result is ``'unknown'`` if nothing matches.
        """
        mime_type = mime_type or ''
        suffix = file_path.suffix.lower()
        for kind, mimes in self.kind_mapping.items():
            if any(mime in mime_type for mime in mimes):
                if kind == 'text' and suffix in self.code_exts:
                    return 'code'
                return kind
        if suffix in self.text_exts:
            return 'text'
        if suffix in self.code_exts:
            return 'code'
        return 'unknown'

    def _suggest_extractor(self, kind: str, mime_type: str) -> Optional[str]:
        """Return the extractor name for ``kind``, or ``None`` if there is none.

        ``mime_type`` is accepted for interface compatibility but is not
        consulted.
        """
        return self._EXTRACTORS.get(kind)

    def _extract_hints(self, file_path: Path, kind: str, mime_type: str, size: int) -> Dict:
        """Collect kind-specific hints for ``file_path``.

        Returns:
            A dict that may contain ``language`` and ``lines`` (text/code;
            ``lines`` only for files under 1 MiB), ``page_count`` (pdf),
            ``duration`` (audio/video), ``has_exif`` and ``dimensions``
            (image). It is empty for every other kind.
        """
        hints = {}
        if kind in ('text', 'code'):
            hints['language'] = self._guess_language(file_path)
            if size < self._LINE_COUNT_MAX_BYTES:
                hints['lines'] = self._count_lines(file_path)
        if kind == 'pdf':
            hints['page_count'] = self._get_pdf_pages(file_path)
        if kind in ('audio', 'video'):
            hints['duration'] = self._get_media_duration(file_path)
        if kind == 'image':
            hints['has_exif'] = self._has_exif(file_path)
            hints['dimensions'] = self._get_image_dimensions(file_path)
        return hints

    def _guess_language(self, file_path: Path) -> Optional[str]:
        """Return the language implied by the file suffix, or ``None``."""
        return self._LANGUAGES.get(file_path.suffix.lower())

    def _count_lines(self, file_path: Path) -> Optional[int]:
        """Return the number of lines in the file, or ``None`` if unreadable.

        The file is read in binary mode, so no text decoding takes place.
        """
        try:
            with open(file_path, 'rb') as f:
                return sum(1 for _ in f)
        except (OSError, ValueError):
            logger.debug("Could not count lines in %s", file_path, exc_info=True)
            return None

    def _get_pdf_pages(self, file_path: Path) -> Optional[int]:
        """Return the PDF page count via PyPDF2, or ``None`` on any failure."""
        try:
            import PyPDF2
            with open(file_path, 'rb') as f:
                return len(PyPDF2.PdfReader(f).pages)
        except Exception:
            logger.debug("Could not read page count of %s", file_path, exc_info=True)
            return None

    def _get_media_duration(self, file_path: Path) -> Optional[float]:
        """Return ``format.duration`` from ``ffmpeg.probe`` as a float, or ``None``."""
        try:
            import ffmpeg
            probe = ffmpeg.probe(str(file_path))
            return float(probe['format']['duration'])
        except Exception:
            logger.debug("Could not probe duration of %s", file_path, exc_info=True)
            return None

    def _has_exif(self, file_path: Path) -> bool:
        """Return ``True`` if the image carries EXIF tags, ``False`` otherwise.

        Any failure (PIL missing, unreadable or non-image file) gives
        ``False``.
        """
        try:
            from PIL import Image
            # The context manager closes the underlying file handle; the
            # public getexif() is used instead of the private, per-plugin
            # _getexif() helper.
            with Image.open(file_path) as img:
                return len(img.getexif()) > 0
        except Exception:
            logger.debug("Could not read EXIF data of %s", file_path, exc_info=True)
            return False

    def _get_image_dimensions(self, file_path: Path) -> Optional[Tuple[int, int]]:
        """Return the image's ``size`` as reported by PIL, or ``None`` on failure."""
        try:
            from PIL import Image
            with Image.open(file_path) as img:
                return img.size
        except Exception:
            logger.debug("Could not read dimensions of %s", file_path, exc_info=True)
            return None