64 lines
2.2 KiB
Python
64 lines
2.2 KiB
Python
from pathlib import Path
|
|
from typing import Dict
|
|
import logging
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
class ImageParser:
    """Extract text and basic metadata from image files using Tesseract OCR.

    Pillow and pytesseract are imported lazily inside the methods that need
    them, so this class can be imported (and unsupported files rejected)
    even when the OCR stack is not installed.
    """

    def __init__(self):
        """Initialise the set of lowercase file suffixes accepted by `parse`."""
        self.supported_formats = {
            '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp',
        }

    def parse(self, file_path: Path) -> Dict:
        """Run OCR on *file_path* and return the result as a dict.

        Args:
            file_path: Path to the image; its suffix is matched
                case-insensitively against `self.supported_formats`.

        Returns:
            On success, the dict built by `_parse_with_ocr` (keys `text`,
            `quality`, `confidence`, `method`, `metadata`). On any failure,
            a dict with an `error` message and an empty `text`; this method
            does not raise for OCR or file errors.
        """
        if file_path.suffix.lower() not in self.supported_formats:
            # Include 'text' so every error result has the same minimum shape
            # as the other failure paths.
            return {'error': f'Unsupported format: {file_path.suffix}', 'text': ''}

        try:
            return self._parse_with_ocr(file_path)
        except Exception as e:
            # Safety net: _parse_with_ocr already converts its own failures
            # into error dicts.
            logger.error('Image parse failed for %s: %s', file_path, e)
            return {'error': str(e), 'text': ''}

    @staticmethod
    def _mean_confidence(raw_scores) -> float:
        """Return the mean of the non-negative confidences in *raw_scores*.

        Entries may be ints, floats or numeric strings (all are converted
        with `float`). Negative entries (the `-1` / `'-1'` values the
        original code was filtering out) and entries that cannot be parsed
        as numbers are skipped. Returns 0 when nothing usable remains.
        """
        scores = []
        for raw in raw_scores:
            try:
                value = float(raw)
            except (TypeError, ValueError):
                continue
            if value >= 0:
                scores.append(value)
        return sum(scores) / len(scores) if scores else 0

    @staticmethod
    def _quality_label(avg_confidence: float) -> str:
        """Bucket a mean confidence into 'good' (>80), 'medium' (>60) or 'low'."""
        if avg_confidence > 80:
            return 'good'
        if avg_confidence > 60:
            return 'medium'
        return 'low'

    def _parse_with_ocr(self, file_path: Path) -> Dict:
        """OCR *file_path* with pytesseract and summarise the result.

        Returns:
            A dict with the stripped OCR `text`, a `quality` label, the mean
            word `confidence`, `method` set to 'tesseract', and a `metadata`
            dict (`width`, `height`, `format`). If Pillow or pytesseract
            cannot be imported, or OCR fails, an error dict with an empty
            `text` is returned instead.
        """
        try:
            from PIL import Image
            import pytesseract

            # The context manager closes the underlying file handle even if
            # OCR raises part-way through.
            with Image.open(str(file_path)) as img:
                text = pytesseract.image_to_string(img)
                data = pytesseract.image_to_data(
                    img, output_type=pytesseract.Output.DICT
                )
                metadata = {
                    'width': img.width,
                    'height': img.height,
                    'format': img.format,
                }

            avg_confidence = self._mean_confidence(data['conf'])

            return {
                'text': text.strip(),
                'quality': self._quality_label(avg_confidence),
                'confidence': avg_confidence,
                'method': 'tesseract',
                'metadata': metadata,
            }
        except ImportError as ie:
            logger.warning('Missing library for OCR: %s', ie)
            return {'error': f'Missing library: {ie}', 'text': '', 'needs': 'pytesseract and tesseract-ocr'}
        except Exception as e:
            # Log here: this handler absorbs the failure, so the caller's
            # handler never sees it.
            logger.error('Image parse failed for %s: %s', file_path, e)
            return {'error': str(e), 'text': ''}

    def extract_metadata(self, file_path: Path) -> Dict:
        """Return the image's `width`, `height`, `format` and `mode`.

        No OCR is performed. Any failure (Pillow not importable, missing or
        unreadable file) yields `{'error': <message>}` instead of raising.
        """
        try:
            from PIL import Image

            # Close the file handle as soon as the header fields are read.
            with Image.open(str(file_path)) as img:
                return {
                    'width': img.width,
                    'height': img.height,
                    'format': img.format,
                    'mode': img.mode,
                }
        except Exception as e:
            return {'error': str(e)}
|