import hashlib
import json
import logging
import time
from pathlib import Path
from typing import Dict, List, Optional

import psycopg2

logger = logging.getLogger(__name__)
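# NOTE: the queries in this module assume roughly the following schema. This is
# inferred from the statements below; the real column types and constraints may differ.
#   extraction_log(node_id, file_path, file_checksum, extraction_method, status,
#                  error_message, extracted_size, processing_time_ms, created_at)
#   content_nodes(id, node_type, path, disk_label, checksum, size, content_hash,
#                 extracted_at, metadata, updated_at,
#                 UNIQUE (node_type, path, disk_label))
#   files(path, extracted_text, text_quality)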


class IncrementalExtractor:
    """Extracts text from files incrementally, skipping files whose content is unchanged."""

    def __init__(self, db_config: Dict):
        self.db_config = db_config

    def get_connection(self):
        # Open a fresh connection per operation; callers are responsible for closing it.
        return psycopg2.connect(**self.db_config)
    def should_extract(self, file_path: str, file_checksum: str) -> bool:
        """Return True if the file is new, has changed, or previously failed extraction."""
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute('''
                SELECT file_checksum, status
                FROM extraction_log
                WHERE file_path = %s
                ORDER BY created_at DESC
                LIMIT 1
            ''', (file_path,))

            result = cursor.fetchone()
            if not result:
                # No prior attempt recorded: extract.
                return True

            last_checksum, status = result
            if last_checksum != file_checksum:
                logger.info(f'File changed: {file_path}')
                return True

            if status == 'success':
                # Same checksum and a successful prior run: skip.
                return False

            # Prior attempt errored or has an unknown status: retry.
            return True

        finally:
            cursor.close()
            conn.close()
    def log_extraction(self, node_id: Optional[str], file_path: str, file_checksum: str,
                       method: str, status: str, error_msg: Optional[str] = None,
                       extracted_size: Optional[int] = None, processing_time_ms: Optional[int] = None):
        """Record the outcome of one extraction attempt in extraction_log."""
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute('''
                INSERT INTO extraction_log (node_id, file_path, file_checksum, extraction_method,
                                            status, error_message, extracted_size, processing_time_ms)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
            ''', (node_id, file_path, file_checksum, method, status, error_msg,
                  extracted_size, processing_time_ms))
            conn.commit()
        finally:
            cursor.close()
            conn.close()
    def create_or_update_node(self, node_type: str, path: str, disk_label: str,
                              checksum: Optional[str], size: Optional[int],
                              content_hash: Optional[str], metadata: Optional[Dict]) -> str:
        """Upsert a content node keyed by (node_type, path, disk_label) and return its id."""
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            # ON CONFLICT relies on a unique constraint over (node_type, path, disk_label).
            cursor.execute('''
                INSERT INTO content_nodes (node_type, path, disk_label, checksum, size,
                                           content_hash, extracted_at, metadata)
                VALUES (%s, %s, %s, %s, %s, %s, CURRENT_TIMESTAMP, %s)
                ON CONFLICT (node_type, path, disk_label) DO UPDATE SET
                    checksum = EXCLUDED.checksum,
                    size = EXCLUDED.size,
                    content_hash = EXCLUDED.content_hash,
                    extracted_at = CURRENT_TIMESTAMP,
                    metadata = EXCLUDED.metadata,
                    updated_at = CURRENT_TIMESTAMP
                RETURNING id
            ''', (node_type, path, disk_label, checksum, size, content_hash,
                  json.dumps(metadata) if metadata else None))

            node_id = cursor.fetchone()[0]
            conn.commit()
            return str(node_id)

        finally:
            cursor.close()
            conn.close()
    def batch_extract(self, file_list: List[Dict], parser_func, parser_name: str,
                      batch_size: int = 100, skip_existing: bool = True) -> Dict:
        """Extract text for each file in file_list using parser_func.

        parser_func receives a Path and must return a dict containing 'text' on
        success (optionally 'method' and 'quality') or an 'error' key on failure.
        """
        stats = {
            'processed': 0,
            'extracted': 0,
            'skipped': 0,
            'errors': 0,
            'total_time_ms': 0
        }

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            for file_info in file_list:
                path = file_info['path']
                checksum = file_info.get('checksum')
                disk_label = file_info.get('disk_label')

                # Skip files that are unchanged since their last successful extraction.
                if skip_existing and not self.should_extract(path, checksum):
                    stats['skipped'] += 1
                    continue

                start_time = time.time()
                try:
                    result = parser_func(Path(path))
                    processing_time_ms = int((time.time() - start_time) * 1000)

                    if 'error' not in result and result.get('text'):
                        text = result['text']
                        content_hash = hashlib.sha256(text.encode()).hexdigest()

                        node_id = self.create_or_update_node(
                            node_type='file',
                            path=path,
                            disk_label=disk_label,
                            checksum=checksum,
                            size=file_info.get('size'),
                            content_hash=content_hash,
                            metadata={
                                'extraction': result.get('method', parser_name),
                                'quality': result.get('quality', 'unknown')
                            }
                        )

                        # Store a truncated copy of the extracted text for search/preview.
                        cursor.execute('''
                            UPDATE files
                            SET extracted_text = %s,
                                text_quality = %s
                            WHERE path = %s
                        ''', (text[:50000], result.get('quality'), path))

                        self.log_extraction(
                            node_id=node_id,
                            file_path=path,
                            file_checksum=checksum,
                            method=parser_name,
                            status='success',
                            extracted_size=len(text),
                            processing_time_ms=processing_time_ms
                        )

                        stats['extracted'] += 1
                        stats['total_time_ms'] += processing_time_ms
                    else:
                        error_msg = result.get('error', 'No text extracted')
                        self.log_extraction(
                            node_id=None,
                            file_path=path,
                            file_checksum=checksum,
                            method=parser_name,
                            status='error',
                            error_msg=error_msg
                        )
                        stats['errors'] += 1

                except Exception as e:
                    logger.error(f'Extract failed for {path}: {e}')
                    self.log_extraction(
                        node_id=None,
                        file_path=path,
                        file_checksum=checksum,
                        method=parser_name,
                        status='error',
                        error_msg=str(e)
                    )
                    stats['errors'] += 1

                stats['processed'] += 1

                # Commit pending files updates and report progress every batch_size files.
                if stats['processed'] % batch_size == 0:
                    conn.commit()
                    logger.info(f'Batch progress: {stats["processed"]}/{len(file_list)} '
                                f'({stats["extracted"]} extracted, {stats["skipped"]} skipped, '
                                f'{stats["errors"]} errors)')

            conn.commit()

        finally:
            cursor.close()
            conn.close()

        return stats
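

# --- Usage sketch (illustrative; not part of the original module) ---
# Everything below is an assumption: the db_config values, the plain_text_parser
# helper, and the example file entry are hypothetical placeholders showing one way
# the class above might be driven.
if __name__ == '__main__':
    import sys

    logging.basicConfig(level=logging.INFO)

    def plain_text_parser(path: Path) -> Dict:
        # Minimal example parser: read the file as UTF-8 and report a fixed quality.
        try:
            return {'text': path.read_text(encoding='utf-8', errors='replace'),
                    'method': 'plain_text', 'quality': 'high'}
        except OSError as e:
            return {'error': str(e)}

    def file_checksum(path: Path) -> str:
        # SHA-256 of the raw bytes, matching the checksum-based change detection above.
        return hashlib.sha256(path.read_bytes()).hexdigest()

    db_config = {'host': 'localhost', 'dbname': 'catalog', 'user': 'catalog', 'password': 'secret'}
    extractor = IncrementalExtractor(db_config)

    target = Path(sys.argv[1]) if len(sys.argv) > 1 else Path('README.txt')
    files = [{'path': str(target), 'checksum': file_checksum(target),
              'disk_label': 'disk1', 'size': target.stat().st_size}]
    print(extractor.batch_extract(files, plain_text_parser, parser_name='plain_text'))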