defrag/app/extraction/incremental.py

import hashlib
import psycopg2
from pathlib import Path
from typing import Dict, Optional, List
import logging
import time
import json
logger = logging.getLogger(__name__)
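
# Assumed backing schema, reconstructed from the queries below rather than from
# the project's real migrations; types and defaults are illustrative (the node
# id is shown as BIGSERIAL, but the code only ever handles it as a string, so a
# UUID key would work just as well). The `files` table updated in batch_extract
# is assumed to already exist with at least `path`, `extracted_text` and
# `text_quality` columns.
ASSUMED_SCHEMA_SQL = '''
CREATE TABLE IF NOT EXISTS content_nodes (
    id           BIGSERIAL PRIMARY KEY,
    node_type    TEXT NOT NULL,
    path         TEXT NOT NULL,
    disk_label   TEXT NOT NULL,
    checksum     TEXT,
    size         BIGINT,
    content_hash TEXT,
    extracted_at TIMESTAMPTZ,
    metadata     JSONB,
    created_at   TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
    updated_at   TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
    UNIQUE (node_type, path, disk_label)  -- required by the ON CONFLICT upsert
);

CREATE TABLE IF NOT EXISTS extraction_log (
    id                 BIGSERIAL PRIMARY KEY,
    node_id            BIGINT REFERENCES content_nodes (id),
    file_path          TEXT NOT NULL,
    file_checksum      TEXT,
    extraction_method  TEXT,
    status             TEXT,
    error_message      TEXT,
    extracted_size     BIGINT,
    processing_time_ms INTEGER,
    created_at         TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
);
'''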


class IncrementalExtractor:
    """Incremental extraction driver: skips files whose content is unchanged by
    consulting extraction_log, and upserts extracted results into content_nodes."""

    def __init__(self, db_config: Dict):
        self.db_config = db_config

    def get_connection(self):
        # Each operation opens a short-lived connection from the stored config.
        return psycopg2.connect(**self.db_config)

    def should_extract(self, file_path: str, file_checksum: str) -> bool:
        """Return True when a file needs (re-)extraction: never seen before,
        checksum changed since the last run, or the last attempt did not succeed."""
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute('''
                SELECT file_checksum, status
                FROM extraction_log
                WHERE file_path = %s
                ORDER BY created_at DESC
                LIMIT 1
            ''', (file_path,))
            result = cursor.fetchone()
            if not result:
                return True
            last_checksum, status = result
            if last_checksum != file_checksum:
                logger.info(f'File changed: {file_path}')
                return True
            # Same checksum: skip only if the last attempt succeeded.
            return status != 'success'
        finally:
            cursor.close()
            conn.close()

    def log_extraction(self, node_id: Optional[str], file_path: str, file_checksum: str,
                       method: str, status: str, error_msg: Optional[str] = None,
                       extracted_size: Optional[int] = None,
                       processing_time_ms: Optional[int] = None):
        """Append one row to extraction_log recording the outcome of an attempt."""
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute('''
                INSERT INTO extraction_log (node_id, file_path, file_checksum, extraction_method,
                                            status, error_message, extracted_size, processing_time_ms)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
            ''', (node_id, file_path, file_checksum, method, status, error_msg,
                  extracted_size, processing_time_ms))
            conn.commit()
        finally:
            cursor.close()
            conn.close()

    def create_or_update_node(self, node_type: str, path: str, disk_label: str,
                              checksum: Optional[str], size: Optional[int],
                              content_hash: Optional[str], metadata: Optional[Dict]) -> str:
        """Upsert a content_nodes row keyed on (node_type, path, disk_label) and
        return its id as a string."""
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute('''
                INSERT INTO content_nodes (node_type, path, disk_label, checksum, size,
                                           content_hash, extracted_at, metadata)
                VALUES (%s, %s, %s, %s, %s, %s, CURRENT_TIMESTAMP, %s)
                ON CONFLICT (node_type, path, disk_label) DO UPDATE SET
                    checksum = EXCLUDED.checksum,
                    size = EXCLUDED.size,
                    content_hash = EXCLUDED.content_hash,
                    extracted_at = CURRENT_TIMESTAMP,
                    metadata = EXCLUDED.metadata,
                    updated_at = CURRENT_TIMESTAMP
                RETURNING id
            ''', (node_type, path, disk_label, checksum, size, content_hash,
                  json.dumps(metadata) if metadata else None))
            node_id = cursor.fetchone()[0]
            conn.commit()
            return str(node_id)
        finally:
            cursor.close()
            conn.close()

    def batch_extract(self, file_list: List[Dict], parser_func, parser_name: str,
                      batch_size: int = 100, skip_existing: bool = True) -> Dict:
        """Run parser_func over file_list, storing extracted text and logging every
        attempt; returns counters for processed/extracted/skipped/errors."""
        stats = {
            'processed': 0,
            'extracted': 0,
            'skipped': 0,
            'errors': 0,
            'total_time_ms': 0
        }
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            for file_info in file_list:
                path = file_info['path']
                checksum = file_info.get('checksum')
                disk_label = file_info.get('disk_label')
                if skip_existing and not self.should_extract(path, checksum):
                    stats['skipped'] += 1
                    continue
                start_time = time.time()
                try:
                    result = parser_func(Path(path))
                    processing_time_ms = int((time.time() - start_time) * 1000)
                    if 'error' not in result and result.get('text'):
                        text = result['text']
                        content_hash = hashlib.sha256(text.encode()).hexdigest()
                        node_id = self.create_or_update_node(
                            node_type='file',
                            path=path,
                            disk_label=disk_label,
                            checksum=checksum,
                            size=file_info.get('size'),
                            content_hash=content_hash,
                            metadata={
                                'extraction': result.get('method', parser_name),
                                'quality': result.get('quality', 'unknown')
                            }
                        )
                        # Store at most the first 50,000 characters of extracted text.
                        cursor.execute('''
                            UPDATE files
                            SET extracted_text = %s,
                                text_quality = %s
                            WHERE path = %s
                        ''', (text[:50000], result.get('quality'), path))
                        self.log_extraction(
                            node_id=node_id,
                            file_path=path,
                            file_checksum=checksum,
                            method=parser_name,
                            status='success',
                            extracted_size=len(text),
                            processing_time_ms=processing_time_ms
                        )
                        stats['extracted'] += 1
                        stats['total_time_ms'] += processing_time_ms
                    else:
                        error_msg = result.get('error', 'No text extracted')
                        self.log_extraction(
                            node_id=None,
                            file_path=path,
                            file_checksum=checksum,
                            method=parser_name,
                            status='error',
                            error_msg=error_msg
                        )
                        stats['errors'] += 1
                except Exception as e:
                    logger.error(f'Extract failed for {path}: {e}')
                    self.log_extraction(
                        node_id=None,
                        file_path=path,
                        file_checksum=checksum,
                        method=parser_name,
                        status='error',
                        error_msg=str(e)
                    )
                    stats['errors'] += 1
                stats['processed'] += 1
                # Commit pending `files` updates and report progress every batch_size files.
                if stats['processed'] % batch_size == 0:
                    conn.commit()
                    logger.info(f'Batch progress: {stats["processed"]}/{len(file_list)} '
                                f'({stats["extracted"]} extracted, {stats["skipped"]} skipped, '
                                f'{stats["errors"]} errors)')
            conn.commit()
        finally:
            cursor.close()
            conn.close()
        return stats
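

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal way to drive the extractor, assuming a reachable PostgreSQL
# database with the tables described above. The connection parameters, the
# plain-text parser and the file list are all hypothetical stand-ins.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    def read_plaintext(path: Path) -> Dict:
        # Hypothetical parser: returns the shape batch_extract expects,
        # i.e. {'text': ..., 'quality': ..., 'method': ...} or {'error': ...}.
        try:
            return {'text': path.read_text(errors='replace'),
                    'quality': 'high', 'method': 'plaintext'}
        except OSError as e:
            return {'error': str(e)}

    extractor = IncrementalExtractor({
        'host': 'localhost',            # assumed connection parameters
        'dbname': 'defrag',
        'user': 'defrag',
        'password': 'defrag',
    })
    sample_files = [{
        'path': '/tmp/example.txt',     # illustrative input record
        'checksum': 'deadbeef',
        'disk_label': 'disk-01',
        'size': 42,
    }]
    print(extractor.batch_extract(sample_files, read_plaintext, 'plaintext'))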