import hashlib
import json
import logging
import time
from pathlib import Path
from typing import Dict, List, Optional

import psycopg2

logger = logging.getLogger(__name__)
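# NOTE: the queries in this module assume roughly the following schema. This is
# inferred from the statements below; the real column types and constraints may differ.
#   extraction_log(node_id, file_path, file_checksum, extraction_method, status,
#                  error_message, extracted_size, processing_time_ms, created_at)
#   content_nodes(id, node_type, path, disk_label, checksum, size, content_hash,
#                 extracted_at, metadata, updated_at,
#                 UNIQUE (node_type, path, disk_label))
#   files(path, extracted_text, text_quality)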


class IncrementalExtractor:
    """Extracts text from files incrementally, skipping files whose content is unchanged."""

    def __init__(self, db_config: Dict):
        self.db_config = db_config

    def get_connection(self):
        # Open a fresh connection per operation; callers are responsible for closing it.
        return psycopg2.connect(**self.db_config)
    def should_extract(self, file_path: str, file_checksum: str) -> bool:
        """Return True if the file is new, has changed, or previously failed extraction."""
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute('''
                SELECT file_checksum, status
                FROM extraction_log
                WHERE file_path = %s
                ORDER BY created_at DESC
                LIMIT 1
            ''', (file_path,))

            result = cursor.fetchone()
            if not result:
                # No prior attempt recorded: extract.
                return True

            last_checksum, status = result
            if last_checksum != file_checksum:
                logger.info(f'File changed: {file_path}')
                return True

            if status == 'success':
                # Same checksum and a successful prior run: skip.
                return False

            # Prior attempt errored or has an unknown status: retry.
            return True

        finally:
            cursor.close()
            conn.close()
    def log_extraction(self, node_id: Optional[str], file_path: str, file_checksum: str,
                       method: str, status: str, error_msg: Optional[str] = None,
                       extracted_size: Optional[int] = None, processing_time_ms: Optional[int] = None):
        """Record the outcome of one extraction attempt in extraction_log."""
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute('''
                INSERT INTO extraction_log (node_id, file_path, file_checksum, extraction_method,
                                            status, error_message, extracted_size, processing_time_ms)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
            ''', (node_id, file_path, file_checksum, method, status, error_msg,
                  extracted_size, processing_time_ms))
            conn.commit()
        finally:
            cursor.close()
            conn.close()
    def create_or_update_node(self, node_type: str, path: str, disk_label: str,
                              checksum: Optional[str], size: Optional[int],
                              content_hash: Optional[str], metadata: Optional[Dict]) -> str:
        """Upsert a content node keyed by (node_type, path, disk_label) and return its id."""
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            # ON CONFLICT relies on a unique constraint over (node_type, path, disk_label).
            cursor.execute('''
                INSERT INTO content_nodes (node_type, path, disk_label, checksum, size,
                                           content_hash, extracted_at, metadata)
                VALUES (%s, %s, %s, %s, %s, %s, CURRENT_TIMESTAMP, %s)
                ON CONFLICT (node_type, path, disk_label) DO UPDATE SET
                    checksum = EXCLUDED.checksum,
                    size = EXCLUDED.size,
                    content_hash = EXCLUDED.content_hash,
                    extracted_at = CURRENT_TIMESTAMP,
                    metadata = EXCLUDED.metadata,
                    updated_at = CURRENT_TIMESTAMP
                RETURNING id
            ''', (node_type, path, disk_label, checksum, size, content_hash,
                  json.dumps(metadata) if metadata else None))

            node_id = cursor.fetchone()[0]
            conn.commit()
            return str(node_id)

        finally:
            cursor.close()
            conn.close()
    def batch_extract(self, file_list: List[Dict], parser_func, parser_name: str,
                      batch_size: int = 100, skip_existing: bool = True) -> Dict:
        """Extract text for each file in file_list using parser_func.

        parser_func receives a Path and must return a dict containing 'text' on
        success (optionally 'method' and 'quality') or an 'error' key on failure.
        """
        stats = {
            'processed': 0,
            'extracted': 0,
            'skipped': 0,
            'errors': 0,
            'total_time_ms': 0
        }

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            for file_info in file_list:
                path = file_info['path']
                checksum = file_info.get('checksum')
                disk_label = file_info.get('disk_label')

                # Skip files that are unchanged since their last successful extraction.
                if skip_existing and not self.should_extract(path, checksum):
                    stats['skipped'] += 1
                    continue

                start_time = time.time()
                try:
                    result = parser_func(Path(path))
                    processing_time_ms = int((time.time() - start_time) * 1000)

                    if 'error' not in result and result.get('text'):
                        text = result['text']
                        content_hash = hashlib.sha256(text.encode()).hexdigest()

                        node_id = self.create_or_update_node(
                            node_type='file',
                            path=path,
                            disk_label=disk_label,
                            checksum=checksum,
                            size=file_info.get('size'),
                            content_hash=content_hash,
                            metadata={
                                'extraction': result.get('method', parser_name),
                                'quality': result.get('quality', 'unknown')
                            }
                        )

                        # Store a truncated copy of the extracted text for search/preview.
                        cursor.execute('''
                            UPDATE files
                            SET extracted_text = %s,
                                text_quality = %s
                            WHERE path = %s
                        ''', (text[:50000], result.get('quality'), path))

                        self.log_extraction(
                            node_id=node_id,
                            file_path=path,
                            file_checksum=checksum,
                            method=parser_name,
                            status='success',
                            extracted_size=len(text),
                            processing_time_ms=processing_time_ms
                        )

                        stats['extracted'] += 1
                        stats['total_time_ms'] += processing_time_ms
                    else:
                        error_msg = result.get('error', 'No text extracted')
                        self.log_extraction(
                            node_id=None,
                            file_path=path,
                            file_checksum=checksum,
                            method=parser_name,
                            status='error',
                            error_msg=error_msg
                        )
                        stats['errors'] += 1

                except Exception as e:
                    logger.error(f'Extract failed for {path}: {e}')
                    self.log_extraction(
                        node_id=None,
                        file_path=path,
                        file_checksum=checksum,
                        method=parser_name,
                        status='error',
                        error_msg=str(e)
                    )
                    stats['errors'] += 1

                stats['processed'] += 1

                # Commit pending files updates and report progress every batch_size files.
                if stats['processed'] % batch_size == 0:
                    conn.commit()
                    logger.info(f'Batch progress: {stats["processed"]}/{len(file_list)} '
                                f'({stats["extracted"]} extracted, {stats["skipped"]} skipped, '
                                f'{stats["errors"]} errors)')

            conn.commit()

        finally:
            cursor.close()
            conn.close()

        return stats
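

# --- Usage sketch (illustrative; not part of the original module) ---
# Everything below is an assumption: the db_config values, the plain_text_parser
# helper, and the example file entry are hypothetical placeholders showing one way
# the class above might be driven.
if __name__ == '__main__':
    import sys

    logging.basicConfig(level=logging.INFO)

    def plain_text_parser(path: Path) -> Dict:
        # Minimal example parser: read the file as UTF-8 and report a fixed quality.
        try:
            return {'text': path.read_text(encoding='utf-8', errors='replace'),
                    'method': 'plain_text', 'quality': 'high'}
        except OSError as e:
            return {'error': str(e)}

    def file_checksum(path: Path) -> str:
        # SHA-256 of the raw bytes, matching the checksum-based change detection above.
        return hashlib.sha256(path.read_bytes()).hexdigest()

    db_config = {'host': 'localhost', 'dbname': 'catalog', 'user': 'catalog', 'password': 'secret'}
    extractor = IncrementalExtractor(db_config)

    target = Path(sys.argv[1]) if len(sys.argv) > 1 else Path('README.txt')
    files = [{'path': str(target), 'checksum': file_checksum(target),
              'disk_label': 'disk1', 'size': target.stat().st_size}]
    print(extractor.batch_extract(files, plain_text_parser, parser_name='plain_text'))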