import logging
import subprocess
from pathlib import Path
from typing import Dict

logger = logging.getLogger(__name__)


class DocumentParser:
    """Extract plain text from word-processor documents.

    ``.docx`` and ``.odt`` files are read in-process with the optional
    ``python-docx`` / ``odfpy`` libraries; every other supported suffix is
    handed to the external ``antiword`` command.

    Every method returns a dict. On success it carries ``'text'``,
    ``'quality'`` and ``'method'``; on failure it carries ``'error'`` and an
    empty ``'text'``, plus an optional ``'needs'`` hint naming a missing
    dependency.
    """

    # Seconds to wait for the external converter before giving up.
    EXTERNAL_TIMEOUT_SECONDS = 30

    def __init__(self):
        self.supported_formats = {'.doc', '.docx', '.odt', '.rtf'}

    def parse(self, file_path: Path) -> Dict:
        """Parse *file_path* and return its text, or an error description.

        Args:
            file_path: Location of the document. A plain string is accepted
                and converted to ``Path``.

        Returns:
            A result dict as described on the class. Parsing failures are
            reported through the ``'error'`` key instead of being raised.
        """
        # Accept str paths too; Path(Path(...)) is a no-op for existing callers.
        file_path = Path(file_path)
        suffix = file_path.suffix.lower()

        if suffix not in self.supported_formats:
            # Include an empty 'text' so every error result has the same shape.
            return {'error': f'Unsupported format: {file_path.suffix}', 'text': ''}

        try:
            if suffix in {'.docx', '.odt'}:
                return self._parse_with_python(file_path)
            return self._parse_with_external(file_path)
        except Exception as e:
            # Boundary handler: callers get an error dict, never an exception.
            logger.error('Document parse failed for %s: %s', file_path, e)
            return {'error': str(e), 'text': ''}

    def _parse_with_python(self, file_path: Path) -> Dict:
        """Read a ``.docx`` or ``.odt`` file with a pure-Python library.

        The libraries are imported lazily so that a missing optional
        dependency yields an error dict (with a ``'needs'`` hint) rather than
        breaking module import.
        """
        suffix = file_path.suffix.lower()
        try:
            if suffix == '.docx':
                import docx

                doc = docx.Document(str(file_path))
                # Keep only paragraphs that contain non-whitespace text.
                text = '\n'.join(
                    para.text for para in doc.paragraphs if para.text.strip()
                )
                return {'text': text, 'quality': 'good', 'method': 'python-docx'}

            if suffix == '.odt':
                from odf import text as odf_text, teletype
                from odf.opendocument import load

                doc = load(str(file_path))
                # Extract each paragraph once, then drop the blank ones.
                extracted = (
                    teletype.extractText(p)
                    for p in doc.getElementsByType(odf_text.P)
                )
                text = '\n'.join(chunk for chunk in extracted if chunk.strip())
                return {'text': text, 'quality': 'good', 'method': 'odfpy'}
        except ImportError as ie:
            logger.warning('Missing library for %s: %s', file_path.suffix, ie)
            return {
                'error': f'Missing library: {ie}',
                'text': '',
                'needs': 'python-docx or odfpy',
            }
        except Exception as e:
            return {'error': str(e), 'text': ''}

        # Neither branch matched: report it instead of implicitly returning None.
        return {'error': f'Unsupported format: {file_path.suffix}', 'text': ''}

    def _parse_with_external(self, file_path: Path) -> Dict:
        """Convert *file_path* to text by running the ``antiword`` command.

        NOTE(review): ``parse`` sends both ``.doc`` and ``.rtf`` here, but
        antiword is documented as a reader for MS Word files, so ``.rtf``
        input probably ends in the 'antiword failed' branch. Confirm, and
        consider a dedicated RTF converter.
        """
        try:
            # Argument list (no shell), so the path is never shell-interpreted.
            result = subprocess.run(
                ['antiword', str(file_path)],
                capture_output=True,
                text=True,
                timeout=self.EXTERNAL_TIMEOUT_SECONDS,
            )
        except FileNotFoundError:
            # Raised when the antiword executable itself cannot be found.
            return {
                'error': 'antiword not installed',
                'text': '',
                'needs': 'sudo apt install antiword',
            }
        except Exception as e:
            # Includes subprocess.TimeoutExpired and output decoding errors.
            return {'error': str(e), 'text': ''}

        if result.returncode == 0:
            return {'text': result.stdout, 'quality': 'good', 'method': 'antiword'}
        return {'error': 'antiword failed', 'text': '', 'needs': 'antiword tool'}