61 lines
2.6 KiB
Python
61 lines
2.6 KiB
Python
import logging
import subprocess
from pathlib import Path
from typing import Dict, Union
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
class DocumentParser:
    """Extract plain text from word-processor documents.

    ``.docx`` files are read with python-docx and ``.odt`` files with odfpy;
    the remaining supported suffixes are handed to the external ``antiword``
    command.  Every result is a dict: on success it carries ``'text'``,
    ``'quality'`` and ``'method'``; on failure it carries ``'error'`` and an
    empty ``'text'``, plus a ``'needs'`` hint when a dependency is missing.
    """

    def __init__(self):
        """Record the lower-case file suffixes this parser accepts."""
        self.supported_formats = {'.doc', '.docx', '.odt', '.rtf'}

    def parse(self, file_path: Union[str, Path]) -> Dict[str, str]:
        """Extract text from *file_path*, choosing a backend by suffix.

        Args:
            file_path: Location of the document.  A plain string is accepted
                and converted to a ``Path``.

        Returns:
            The result dict of the selected backend, or
            ``{'error': ..., 'text': ''}`` when the suffix is unsupported or
            the backend raised unexpectedly.
        """
        file_path = Path(file_path)
        suffix = file_path.suffix.lower()

        if suffix not in self.supported_formats:
            # Include an empty 'text' so this failure has the same shape as
            # every other failure result and result['text'] is always safe.
            return {'error': f'Unsupported format: {file_path.suffix}', 'text': ''}

        try:
            if suffix in {'.docx', '.odt'}:
                return self._parse_with_python(file_path)
            return self._parse_with_external(file_path)
        except Exception as e:
            # Last-resort boundary: the backends already convert their own
            # failures into error dicts, so keep the traceback for anything
            # that still escapes them.
            logger.exception('Document parse failed for %s: %s', file_path, e)
            return {'error': str(e), 'text': ''}

    def _parse_with_python(self, file_path: Path) -> Dict[str, str]:
        """Read a ``.docx`` or ``.odt`` file with a pure-Python library.

        The libraries are imported lazily so that a missing optional
        dependency becomes an error result instead of an import failure of
        this module.

        Returns:
            ``{'text', 'quality', 'method'}`` on success; otherwise a dict
            with ``'error'`` and an empty ``'text'`` (and ``'needs'`` when
            the library could not be imported).
        """
        try:
            suffix = file_path.suffix.lower()

            if suffix == '.docx':
                import docx

                doc = docx.Document(str(file_path))
                # Keep only paragraphs that contain non-whitespace text.
                text = '\n'.join(
                    para.text for para in doc.paragraphs if para.text.strip()
                )
                return {'text': text, 'quality': 'good', 'method': 'python-docx'}

            if suffix == '.odt':
                from odf import text as odf_text, teletype
                from odf.opendocument import load

                doc = load(str(file_path))
                # Extract each paragraph once, then drop the blank ones.
                extracted = (
                    teletype.extractText(p)
                    for p in doc.getElementsByType(odf_text.P)
                )
                text = '\n'.join(chunk for chunk in extracted if chunk.strip())
                return {'text': text, 'quality': 'good', 'method': 'odfpy'}

            # Any other suffix used to fall through and return None; report
            # it explicitly so the declared return type always holds.
            return {'error': f'Unsupported format: {file_path.suffix}', 'text': ''}
        except ImportError as ie:
            logger.warning('Missing library for %s: %s', file_path.suffix, ie)
            return {'error': f'Missing library: {ie}', 'text': '', 'needs': 'python-docx or odfpy'}
        except Exception as e:
            return {'error': str(e), 'text': ''}

    def _parse_with_external(self, file_path: Path) -> Dict[str, str]:
        """Convert *file_path* to text by running the ``antiword`` command.

        The command is given 30 seconds; a timeout, like any other unexpected
        exception, is reported as an error result.

        Returns:
            ``{'text', 'quality', 'method'}`` when antiword exits with status
            0; otherwise a dict with ``'error'`` and an empty ``'text'``
            (and ``'needs'`` when antiword failed or is not installed).
        """
        # NOTE(review): parse() sends both '.doc' and '.rtf' here; antiword
        # is a Word .doc converter -- confirm it is the intended tool for
        # '.rtf' input.
        try:
            result = subprocess.run(
                ['antiword', str(file_path)],
                capture_output=True,
                text=True,
                timeout=30,
            )
            if result.returncode == 0:
                return {'text': result.stdout, 'quality': 'good', 'method': 'antiword'}

            # Keep antiword's own diagnostics in the log; the returned dict
            # stays the same as before.
            logger.warning(
                'antiword exited with status %s for %s: %s',
                result.returncode,
                file_path,
                (result.stderr or '').strip(),
            )
            return {'error': 'antiword failed', 'text': '', 'needs': 'antiword tool'}
        except FileNotFoundError:
            # subprocess raises this when the executable itself is missing.
            return {'error': 'antiword not installed', 'text': '', 'needs': 'sudo apt install antiword'}
        except Exception as e:
            return {'error': str(e), 'text': ''}
|