from pathlib import Path
from typing import Dict, List


class PDFParser:
    """Extract text and simple text-layer statistics from a PDF using PyPDF2."""

    # Default cap on how many leading pages have their text extracted.
    DEFAULT_MAX_PAGES = 50
    # The document is treated as having a text layer only when the extracted
    # pages contain more than this many characters in total.
    MIN_TEXT_CHARS = 100
    # Number of leading per-page records echoed back under ``structure``.
    PREVIEW_PAGES = 5

    def parse(self, file_path: Path, *, max_pages: int = DEFAULT_MAX_PAGES) -> Dict:
        """Read a PDF and return its extracted text plus summary metadata.

        Args:
            file_path: Location of the PDF file to open.
            max_pages: Maximum number of leading pages to extract text from.
                Must be non-negative. Defaults to ``DEFAULT_MAX_PAGES``.

        Returns:
            On success, a dict with the keys ``text`` (page texts joined by a
            blank line), ``page_count`` (pages in the document),
            ``pages_extracted`` (pages actually read), ``has_text_layer``,
            ``needs_ocr``, ``structure`` and ``quality``.

            On any failure while importing PyPDF2, opening the file or
            extracting text, a dict with only ``error`` (the exception
            message) and ``needs_ocr`` set to ``True``.

        Raises:
            ValueError: If ``max_pages`` is negative.
        """
        if max_pages < 0:
            # A negative bound would silently slice from the end of the document.
            raise ValueError("max_pages must be non-negative")

        pages: List[Dict] = []
        try:
            # Imported inside the try so that a missing PyPDF2 install is
            # reported through the same error dict as any other failure.
            import PyPDF2

            with open(file_path, 'rb') as f:
                pdf = PyPDF2.PdfReader(f)
                page_count = len(pdf.pages)
                for number, page in enumerate(pdf.pages[:max_pages], start=1):
                    # Normalise a missing result to '' so one textless page
                    # cannot abort the whole parse via len(None).
                    text = page.extract_text() or ''
                    pages.append({
                        'page': number,
                        'text': text,
                        'char_count': len(text),
                    })
        except Exception as e:
            # Best-effort contract: callers get an error dict, not an exception.
            return {'error': str(e), 'needs_ocr': True}

        full_text = '\n\n'.join(p['text'] for p in pages)
        total_chars = sum(p['char_count'] for p in pages)
        has_text_layer = total_chars > self.MIN_TEXT_CHARS

        return {
            'text': full_text,
            'page_count': page_count,
            'pages_extracted': len(pages),
            'has_text_layer': has_text_layer,
            'needs_ocr': not has_text_layer,
            'structure': {'type': 'document', 'pages': pages[:self.PREVIEW_PAGES]},
            'quality': 'high' if has_text_layer else 'needs_ocr',
        }