32 lines
1.1 KiB
Python
32 lines
1.1 KiB
Python
from pathlib import Path
|
|
from typing import Dict, List
|
|
|
|
class PDFParser:
    """Extract text and basic metadata from a PDF file using PyPDF2.

    The parser is best-effort: ``parse`` never raises. Any failure
    (PyPDF2 not importable, unreadable file, malformed PDF) is reported
    through an ``{'error': ..., 'needs_ocr': True}`` dictionary instead.
    """

    # Default cap on how many leading pages are text-extracted per document.
    DEFAULT_MAX_PAGES = 50
    # Total extracted characters must exceed this for a text layer to be assumed.
    MIN_TEXT_CHARS = 100
    # Number of leading per-page records echoed back under ``structure``.
    PREVIEW_PAGES = 5

    def parse(self, file_path: Path, max_pages: int = DEFAULT_MAX_PAGES) -> Dict:
        """Read ``file_path`` and return its extracted text plus metadata.

        Args:
            file_path: Location of the PDF file to read.
            max_pages: Maximum number of leading pages to extract text
                from. Negative values are treated as 0.

        Returns:
            On success, a dict with the keys:

            * ``text`` - text of the extracted pages joined by blank lines.
            * ``page_count`` - total number of pages in the document.
            * ``pages_extracted`` - number of pages actually processed.
            * ``has_text_layer`` - True when more than ``MIN_TEXT_CHARS``
              characters were extracted in total.
            * ``needs_ocr`` - the negation of ``has_text_layer``.
            * ``structure`` - ``{'type': 'document', 'pages': [...]}`` with
              the first ``PREVIEW_PAGES`` per-page records, each of the form
              ``{'page': n, 'text': str, 'char_count': int}``.
            * ``quality`` - ``'high'`` or ``'needs_ocr'``.

            On any failure, ``{'error': <message>, 'needs_ocr': True}``.
        """
        try:
            # Imported lazily so a missing PyPDF2 installation is reported
            # through the error dict below rather than at module import time.
            import PyPDF2

            # Clamp so a negative limit cannot turn into a "drop the last
            # N pages" slice.
            page_limit = max(0, max_pages)

            pages = []
            with open(file_path, 'rb') as f:
                pdf = PyPDF2.PdfReader(f)
                page_count = len(pdf.pages)

                for i, page in enumerate(pdf.pages[:page_limit]):
                    # A page may yield no text at all; treat a missing
                    # result as empty so one blank page does not abort the
                    # whole document.
                    text = page.extract_text() or ''
                    pages.append({
                        'page': i + 1,
                        'text': text,
                        'char_count': len(text),
                    })

            full_text = '\n\n'.join(p['text'] for p in pages)
            total_chars = sum(p['char_count'] for p in pages)
            has_text_layer = total_chars > self.MIN_TEXT_CHARS

            return {
                'text': full_text,
                'page_count': page_count,
                'pages_extracted': len(pages),
                'has_text_layer': has_text_layer,
                'needs_ocr': not has_text_layer,
                'structure': {
                    'type': 'document',
                    'pages': pages[:self.PREVIEW_PAGES],
                },
                'quality': 'high' if has_text_layer else 'needs_ocr',
            }
        except Exception as e:
            # Deliberate catch-all: callers receive a result dict for every
            # input and decide themselves how to handle the failure.
            return {'error': str(e), 'needs_ocr': True}
|