"""Incremental text extraction with PostgreSQL-backed change tracking."""

import hashlib
import json
import logging
import time
from pathlib import Path
from typing import Dict, List, Optional

import psycopg2

logger = logging.getLogger(__name__)


class IncrementalExtractor:
    """Extracts text from files and records results in PostgreSQL,
    skipping files whose checksum has not changed since the last run."""

    def __init__(self, db_config: Dict):
        self.db_config = db_config

    def get_connection(self):
        return psycopg2.connect(**self.db_config)

    def should_extract(self, file_path: str, file_checksum: str) -> bool:
        """Return True if the file is new, has changed, or last failed."""
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute('''
                SELECT file_checksum, status
                FROM extraction_log
                WHERE file_path = %s
                ORDER BY created_at DESC
                LIMIT 1
            ''', (file_path,))
            result = cursor.fetchone()
            if not result:
                return True

            last_checksum, status = result
            if last_checksum != file_checksum:
                logger.info(f'File changed: {file_path}')
                return True
            if status == 'success':
                return False
            if status == 'error':
                return True
            return True
        finally:
            cursor.close()
            conn.close()

    def log_extraction(self, node_id: Optional[str], file_path: str,
                       file_checksum: str, method: str, status: str,
                       error_msg: Optional[str] = None,
                       extracted_size: Optional[int] = None,
                       processing_time_ms: Optional[int] = None):
        """Append one row to extraction_log describing an extraction attempt."""
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute('''
                INSERT INTO extraction_log
                    (node_id, file_path, file_checksum, extraction_method,
                     status, error_message, extracted_size, processing_time_ms)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
            ''', (node_id, file_path, file_checksum, method, status,
                  error_msg, extracted_size, processing_time_ms))
            conn.commit()
        finally:
            cursor.close()
            conn.close()

    def create_or_update_node(self, node_type: str, path: str, disk_label: str,
                              checksum: Optional[str], size: Optional[int],
                              content_hash: Optional[str],
                              metadata: Optional[Dict]) -> str:
        """Upsert a content_nodes row and return its id as a string."""
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute('''
                INSERT INTO content_nodes
                    (node_type, path, disk_label, checksum, size,
                     content_hash, extracted_at, metadata)
                VALUES (%s, %s, %s, %s, %s, %s, CURRENT_TIMESTAMP, %s)
                ON CONFLICT (node_type, path, disk_label) DO UPDATE SET
                    checksum = EXCLUDED.checksum,
                    size = EXCLUDED.size,
                    content_hash = EXCLUDED.content_hash,
                    extracted_at = CURRENT_TIMESTAMP,
                    metadata = EXCLUDED.metadata,
                    updated_at = CURRENT_TIMESTAMP
                RETURNING id
            ''', (node_type, path, disk_label, checksum, size, content_hash,
                  json.dumps(metadata) if metadata else None))
            node_id = cursor.fetchone()[0]
            conn.commit()
            return str(node_id)
        finally:
            cursor.close()
            conn.close()

    def batch_extract(self, file_list: List[Dict], parser_func, parser_name: str,
                      batch_size: int = 100, skip_existing: bool = True) -> Dict:
        """Run parser_func over file_list, persisting extracted text and logging
        each result. Commits the files-table updates every batch_size processed
        files and returns a stats dict."""
        stats = {
            'processed': 0,
            'extracted': 0,
            'skipped': 0,
            'errors': 0,
            'total_time_ms': 0
        }
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            for file_info in file_list:
                path = file_info['path']
                checksum = file_info.get('checksum')
                disk_label = file_info.get('disk_label')

                if skip_existing and not self.should_extract(path, checksum):
                    stats['skipped'] += 1
                    continue

                start_time = time.time()
                try:
                    result = parser_func(Path(path))
                    processing_time_ms = int((time.time() - start_time) * 1000)

                    if 'error' not in result and result.get('text'):
                        text = result['text']
                        content_hash = hashlib.sha256(text.encode()).hexdigest()
                        node_id = self.create_or_update_node(
                            node_type='file',
                            path=path,
                            disk_label=disk_label,
                            checksum=checksum,
                            size=file_info.get('size'),
                            content_hash=content_hash,
                            metadata={
                                'extraction': result.get('method', parser_name),
                                'quality': result.get('quality', 'unknown')
                            }
                        )
                        cursor.execute('''
                            UPDATE files
                            SET extracted_text = %s, text_quality = %s
                            WHERE path = %s
                        ''', (text[:50000], result.get('quality'), path))
                        self.log_extraction(
                            node_id=node_id,
                            file_path=path,
                            file_checksum=checksum,
                            method=parser_name,
                            status='success',
                            extracted_size=len(text),
                            processing_time_ms=processing_time_ms
                        )
                        stats['extracted'] += 1
                        stats['total_time_ms'] += processing_time_ms
                    else:
                        error_msg = result.get('error', 'No text extracted')
                        self.log_extraction(
                            node_id=None,
                            file_path=path,
                            file_checksum=checksum,
                            method=parser_name,
                            status='error',
                            error_msg=error_msg
                        )
                        stats['errors'] += 1
                except Exception as e:
                    logger.error(f'Extract failed for {path}: {e}')
                    self.log_extraction(
                        node_id=None,
                        file_path=path,
                        file_checksum=checksum,
                        method=parser_name,
                        status='error',
                        error_msg=str(e)
                    )
                    stats['errors'] += 1

                stats['processed'] += 1
                if stats['processed'] % batch_size == 0:
                    conn.commit()
                    logger.info(
                        f'Batch progress: {stats["processed"]}/{len(file_list)} '
                        f'({stats["extracted"]} extracted, {stats["skipped"]} skipped, '
                        f'{stats["errors"]} errors)'
                    )
            conn.commit()
        finally:
            cursor.close()
            conn.close()
        return stats
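

# Example usage: an illustrative sketch only, not part of the module's tested
# surface. The connection settings, the file entry, and simple_parser() below
# are placeholders; substitute your own db_config and parser callable, which
# must return a dict with 'text' (and optionally 'method'/'quality') on
# success, or an 'error' key on failure, as batch_extract expects.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    def simple_parser(path: Path) -> Dict:
        # Hypothetical parser used only for this sketch.
        try:
            return {'text': path.read_text(errors='ignore'),
                    'method': 'plain_read', 'quality': 'unknown'}
        except OSError as exc:
            return {'error': str(exc)}

    extractor = IncrementalExtractor({
        'dbname': 'archive',      # placeholder connection settings
        'user': 'postgres',
        'password': 'postgres',
        'host': 'localhost',
    })

    example_path = Path('/tmp/example.txt')  # placeholder input file
    files = [{
        'path': str(example_path),
        # Normally the checksum comes from your crawler/inventory step.
        'checksum': hashlib.sha256(example_path.read_bytes()).hexdigest()
        if example_path.exists() else None,
        'disk_label': 'disk01',
        'size': example_path.stat().st_size if example_path.exists() else None,
    }]

    stats = extractor.batch_extract(files, simple_parser, parser_name='plain_read')
    print(stats)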