Files
defrag/app/parsers/pdf_parser.py
2025-12-13 11:56:06 +01:00

32 lines
1.1 KiB
Python

from pathlib import Path
from typing import Dict, List
class PDFParser:
    """Extract text and basic text-layer metadata from a PDF via PyPDF2.

    The limits below are class attributes so a subclass can tune them
    without overriding ``parse``.
    """

    # Maximum number of leading pages whose text is extracted.
    MAX_PAGES = 50
    # The document counts as having a text layer only when the extracted
    # pages contain strictly more than this many characters in total.
    TEXT_LAYER_MIN_CHARS = 100
    # Number of leading per-page records echoed back under 'structure'.
    PREVIEW_PAGES = 5

    def parse(self, file_path: Path) -> Dict:
        """Parse the PDF at *file_path* and summarise its text content.

        Args:
            file_path: Location of the PDF file; it is opened in binary mode.

        Returns:
            On success, a dict with the keys ``text`` (page texts joined by
            blank lines), ``page_count`` (total pages in the document),
            ``pages_extracted`` (pages actually read, at most ``MAX_PAGES``),
            ``has_text_layer``, ``needs_ocr``, ``structure`` and ``quality``.
            On any failure (including PyPDF2 being unavailable), a dict with
            ``error`` (the exception message) and ``needs_ocr`` set to True.
            This method does not raise for parsing problems.
        """
        page_limit = self.MAX_PAGES
        min_chars = self.TEXT_LAYER_MIN_CHARS
        preview_pages = self.PREVIEW_PAGES

        # Deliberate best-effort boundary: every failure is reported to the
        # caller as an error dict so the document can be routed to OCR.
        try:
            # Imported lazily so a missing PyPDF2 surfaces as an error
            # result instead of breaking import of this module.
            import PyPDF2

            pages = []
            with open(file_path, 'rb') as f:
                pdf = PyPDF2.PdfReader(f)
                pdf_pages = pdf.pages
                page_count = len(pdf_pages)
                # Index page by page rather than slicing: this needs only
                # __len__/__getitem__ from the page container.
                for index in range(min(page_count, page_limit)):
                    # Treat a missing extraction result as empty text so one
                    # image-only page cannot fail the whole document.
                    text = pdf_pages[index].extract_text() or ''
                    pages.append({
                        'page': index + 1,
                        'text': text,
                        'char_count': len(text),
                    })

            full_text = '\n\n'.join(p['text'] for p in pages)
            total_chars = sum(p['char_count'] for p in pages)
            has_text_layer = total_chars > min_chars
            return {
                'text': full_text,
                'page_count': page_count,
                'pages_extracted': len(pages),
                'has_text_layer': has_text_layer,
                'needs_ocr': not has_text_layer,
                'structure': {
                    'type': 'document',
                    'pages': pages[:preview_pages],
                },
                'quality': 'high' if has_text_layer else 'needs_ocr',
            }
        except Exception as e:
            return {'error': str(e), 'needs_ocr': True}