from pathlib import Path
from typing import Any, Dict, Iterable
import logging

logger = logging.getLogger(__name__)


class ImageParser:
    """Extract text and basic metadata from raster images via Tesseract OCR.

    All public methods report failures through an ``'error'`` key in the
    returned dict instead of raising, so callers can treat the result
    uniformly. Pillow and pytesseract are imported lazily inside the methods
    that need them, so this module can be imported without them installed.
    """

    # Average-confidence thresholds (strictly greater-than) for the quality label.
    GOOD_CONFIDENCE = 80
    MEDIUM_CONFIDENCE = 60

    def __init__(self):
        # Lower-case file suffixes (including the dot) accepted by ``parse``.
        self.supported_formats = {
            '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp',
        }

    def parse(self, file_path: Path) -> Dict:
        """Run OCR on ``file_path`` if its suffix is a supported image format.

        Args:
            file_path: Path of the image; the suffix check is case-insensitive.

        Returns:
            The dict produced by ``_parse_with_ocr``, or a dict with an
            ``'error'`` key when the suffix is unsupported or OCR failed.
        """
        if file_path.suffix.lower() not in self.supported_formats:
            return {'error': f'Unsupported format: {file_path.suffix}'}

        try:
            return self._parse_with_ocr(file_path)
        except Exception as e:
            # Last-resort boundary: the helper already handles its own errors.
            logger.error('Image parse failed for %s: %s', file_path, e)
            return {'error': str(e), 'text': ''}

    @staticmethod
    def _average_confidence(raw_scores: Iterable[Any]) -> float:
        """Return the mean of the non-negative confidence values, or 0 if none.

        Values may be ints, floats or numeric strings. Negative values (the
        ``-1`` entries that were previously filtered only when given as the
        string ``'-1'``) and entries that cannot be read as numbers are
        ignored.
        """
        scores = []
        for raw in raw_scores:
            try:
                score = float(raw)
            except (TypeError, ValueError):
                continue
            if score >= 0:
                scores.append(score)
        return sum(scores) / len(scores) if scores else 0

    @classmethod
    def _classify_quality(cls, avg_confidence: float) -> str:
        """Map an average confidence to ``'good'``, ``'medium'`` or ``'low'``."""
        if avg_confidence > cls.GOOD_CONFIDENCE:
            return 'good'
        if avg_confidence > cls.MEDIUM_CONFIDENCE:
            return 'medium'
        return 'low'

    def _parse_with_ocr(self, file_path: Path) -> Dict:
        """OCR the image with pytesseract and summarise the result.

        Returns:
            On success, a dict with ``'text'``, ``'quality'``,
            ``'confidence'``, ``'method'`` and ``'metadata'`` (width, height,
            format). If an import fails, a dict with ``'error'``, an empty
            ``'text'`` and a ``'needs'`` hint. On any other failure, a dict
            with ``'error'`` and an empty ``'text'``.
        """
        try:
            from PIL import Image
            import pytesseract

            # The context manager closes the underlying file handle even if
            # OCR raises part-way through.
            with Image.open(str(file_path)) as img:
                text = pytesseract.image_to_string(img)
                data = pytesseract.image_to_data(
                    img, output_type=pytesseract.Output.DICT
                )
                metadata = {
                    'width': img.width,
                    'height': img.height,
                    'format': img.format,
                }

            avg_confidence = self._average_confidence(data['conf'])
            return {
                'text': text.strip(),
                'quality': self._classify_quality(avg_confidence),
                'confidence': avg_confidence,
                'method': 'tesseract',
                'metadata': metadata,
            }
        except ImportError as ie:
            logger.warning('Missing library for OCR: %s', ie)
            return {
                'error': f'Missing library: {ie}',
                'text': '',
                'needs': 'pytesseract and tesseract-ocr',
            }
        except Exception as e:
            # Log here: this handler means the one in ``parse`` rarely fires.
            logger.error('OCR failed for %s: %s', file_path, e)
            return {'error': str(e), 'text': ''}

    def extract_metadata(self, file_path: Path) -> Dict:
        """Return width, height, format and mode of the image without OCR.

        Returns:
            A dict with ``'width'``, ``'height'``, ``'format'`` and
            ``'mode'``, or a dict with only ``'error'`` if the image could
            not be opened (including when Pillow is not installed).
        """
        try:
            from PIL import Image

            with Image.open(str(file_path)) as img:
                return {
                    'width': img.width,
                    'height': img.height,
                    'format': img.format,
                    'mode': img.mode,
                }
        except Exception as e:
            return {'error': str(e)}