# defrag/app/parsers/image_parser.py

from pathlib import Path
from typing import Dict
import logging

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

class ImageParser:
    """Extract text and basic metadata from image files using Tesseract OCR.

    Pillow and pytesseract are imported lazily inside the methods that use
    them, so the class can be constructed even when they are not installed.
    Every public method reports failure by returning a dict containing an
    ``'error'`` key rather than by raising.
    """

    def __init__(self):
        # Lower-case file suffixes (leading dot included) accepted by parse().
        self.supported_formats = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'}

    def parse(self, file_path: Path) -> Dict:
        """Run OCR on *file_path* and return the extracted text with quality info.

        Args:
            file_path: Path of the image to read. Its suffix is compared
                case-insensitively against ``self.supported_formats``.

        Returns:
            On success, the dict produced by ``_parse_with_ocr``. For an
            unsupported suffix, ``{'error': 'Unsupported format: <suffix>'}``.
            If OCR raises unexpectedly, ``{'error': <message>, 'text': ''}``.
        """
        if file_path.suffix.lower() not in self.supported_formats:
            return {'error': f'Unsupported format: {file_path.suffix}'}

        try:
            return self._parse_with_ocr(file_path)
        except Exception as e:
            # Last-resort boundary: callers expect a dict, never an exception.
            logger.error('Image parse failed for %s: %s', file_path, e)
            return {'error': str(e), 'text': ''}

    @staticmethod
    def _average_confidence(raw_scores) -> float:
        """Return the mean of the valid confidence values in *raw_scores*.

        Values may arrive as ints, floats or numeric strings depending on the
        pytesseract version. Negative values (the ``-1`` "no text" sentinel)
        and anything that cannot be read as a number are ignored. Returns 0
        when no valid score remains.
        """
        scores = []
        for raw in raw_scores:
            try:
                value = float(raw)
            except (TypeError, ValueError):
                continue
            # ``value >= 0`` drops the -1 sentinel (and NaN, which fails the test).
            if value >= 0:
                scores.append(value)
        return sum(scores) / len(scores) if scores else 0

    @staticmethod
    def _classify_quality(avg_confidence: float) -> str:
        """Bucket an average confidence into ``'good'``, ``'medium'`` or ``'low'``."""
        if avg_confidence > 80:
            return 'good'
        if avg_confidence > 60:
            return 'medium'
        return 'low'

    def _parse_with_ocr(self, file_path: Path) -> Dict:
        """OCR *file_path* with pytesseract and summarise the result.

        Returns:
            On success, a dict with keys ``'text'`` (stripped OCR output),
            ``'quality'``, ``'confidence'`` (average word confidence),
            ``'method'`` (always ``'tesseract'``) and ``'metadata'`` (width,
            height and format as reported by Pillow).
            If an import fails, a dict with ``'error'``, empty ``'text'`` and
            a ``'needs'`` hint. On any other failure, ``{'error': ..., 'text': ''}``.
        """
        try:
            from PIL import Image
            import pytesseract

            # The context manager closes the underlying file handle even if
            # Tesseract fails part-way through.
            with Image.open(str(file_path)) as img:
                text = pytesseract.image_to_string(img)
                data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
                metadata = {
                    'width': img.width,
                    'height': img.height,
                    'format': img.format,
                }

            avg_confidence = self._average_confidence(data['conf'])

            return {
                'text': text.strip(),
                'quality': self._classify_quality(avg_confidence),
                'confidence': avg_confidence,
                'method': 'tesseract',
                'metadata': metadata,
            }
        except ImportError as ie:
            logger.warning('Missing library for OCR: %s', ie)
            return {'error': f'Missing library: {ie}', 'text': '', 'needs': 'pytesseract and tesseract-ocr'}
        except Exception as e:
            # Log here: this handler returns instead of re-raising, so the
            # caller's own error logging would never see the failure.
            logger.error('OCR failed for %s: %s', file_path, e)
            return {'error': str(e), 'text': ''}

    def extract_metadata(self, file_path: Path) -> Dict:
        """Return width, height, format and mode of the image at *file_path*.

        Returns:
            A dict with keys ``'width'``, ``'height'``, ``'format'`` and
            ``'mode'`` as reported by Pillow, or ``{'error': <message>}`` if
            Pillow is unavailable or the file cannot be opened.
        """
        try:
            from PIL import Image

            # Close the file handle as soon as the header fields are read.
            with Image.open(str(file_path)) as img:
                return {
                    'width': img.width,
                    'height': img.height,
                    'format': img.format,
                    'mode': img.mode,
                }
        except Exception as e:
            return {'error': str(e)}