clean up code
app/extraction/incremental.py (new file, 196 lines)
@@ -0,0 +1,196 @@
import hashlib
import json
import logging
import time
from pathlib import Path
from typing import Dict, List, Optional

import psycopg2

logger = logging.getLogger(__name__)

class IncrementalExtractor:
    """Incremental text extraction: a file is re-parsed only when its
    checksum changes or the previous attempt failed, and every attempt
    is recorded in extraction_log."""

    def __init__(self, db_config: Dict):
        self.db_config = db_config

    def get_connection(self):
        return psycopg2.connect(**self.db_config)

    def should_extract(self, file_path: str, file_checksum: Optional[str]) -> bool:
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute('''
                SELECT file_checksum, status
                FROM extraction_log
                WHERE file_path = %s
                ORDER BY created_at DESC
                LIMIT 1
            ''', (file_path,))

            result = cursor.fetchone()
            if not result:
                return True  # never attempted before

            last_checksum, status = result
            if last_checksum != file_checksum:
                logger.info(f'File changed: {file_path}')
                return True

            # Unchanged file: skip only if the last attempt succeeded;
            # retry errors and any unknown status.
            return status != 'success'
        finally:
            cursor.close()
            conn.close()

    def log_extraction(self, node_id: Optional[str], file_path: str, file_checksum: Optional[str],
                       method: str, status: str, error_msg: Optional[str] = None,
                       extracted_size: Optional[int] = None, processing_time_ms: Optional[int] = None):
        """Record one extraction attempt; committed immediately on its own connection."""
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute('''
                INSERT INTO extraction_log (node_id, file_path, file_checksum, extraction_method,
                                            status, error_message, extracted_size, processing_time_ms)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
            ''', (node_id, file_path, file_checksum, method, status, error_msg,
                  extracted_size, processing_time_ms))
            conn.commit()
        finally:
            cursor.close()
            conn.close()

    def create_or_update_node(self, node_type: str, path: str, disk_label: str,
                              checksum: Optional[str], size: Optional[int],
                              content_hash: Optional[str], metadata: Optional[Dict]) -> str:
        """Upsert a content node keyed on (node_type, path, disk_label) and return its id.

        Note: ON CONFLICT only fires if that column triple carries a unique
        constraint, and a NULL disk_label never conflicts."""
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute('''
                INSERT INTO content_nodes (node_type, path, disk_label, checksum, size,
                                           content_hash, extracted_at, metadata)
                VALUES (%s, %s, %s, %s, %s, %s, CURRENT_TIMESTAMP, %s)
                ON CONFLICT (node_type, path, disk_label) DO UPDATE SET
                    checksum = EXCLUDED.checksum,
                    size = EXCLUDED.size,
                    content_hash = EXCLUDED.content_hash,
                    extracted_at = CURRENT_TIMESTAMP,
                    metadata = EXCLUDED.metadata,
                    updated_at = CURRENT_TIMESTAMP
                RETURNING id
            ''', (node_type, path, disk_label, checksum, size, content_hash,
                  json.dumps(metadata) if metadata else None))

            node_id = cursor.fetchone()[0]
            conn.commit()
            return str(node_id)
        finally:
            cursor.close()
            conn.close()

    def batch_extract(self, file_list: List[Dict], parser_func, parser_name: str,
                      batch_size: int = 100, skip_existing: bool = True) -> Dict:
        """Run parser_func over file_list, committing the files table every batch_size files."""
        stats = {
            'processed': 0,
            'extracted': 0,
            'skipped': 0,
            'errors': 0,
            'total_time_ms': 0
        }

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            for file_info in file_list:
                path = file_info['path']
                checksum = file_info.get('checksum')
                disk_label = file_info.get('disk_label')

                if skip_existing and not self.should_extract(path, checksum):
                    stats['skipped'] += 1
                    continue

                start_time = time.time()
                try:
                    result = parser_func(Path(path))
                    processing_time_ms = int((time.time() - start_time) * 1000)

                    if 'error' not in result and result.get('text'):
                        text = result['text']
                        content_hash = hashlib.sha256(text.encode()).hexdigest()

                        node_id = self.create_or_update_node(
                            node_type='file',
                            path=path,
                            disk_label=disk_label,
                            checksum=checksum,
                            size=file_info.get('size'),
                            content_hash=content_hash,
                            metadata={
                                'extraction': result.get('method', parser_name),
                                'quality': result.get('quality', 'unknown')
                            }
                        )

                        # Store at most the first 50,000 characters inline; the
                        # full text is fingerprinted by content_hash above.
                        cursor.execute('''
                            UPDATE files
                            SET extracted_text = %s,
                                text_quality = %s
                            WHERE path = %s
                        ''', (text[:50000], result.get('quality'), path))

                        self.log_extraction(
                            node_id=node_id,
                            file_path=path,
                            file_checksum=checksum,
                            method=parser_name,
                            status='success',
                            extracted_size=len(text),
                            processing_time_ms=processing_time_ms
                        )

                        stats['extracted'] += 1
                        stats['total_time_ms'] += processing_time_ms  # successful parses only
                    else:
                        error_msg = result.get('error', 'No text extracted')
                        self.log_extraction(
                            node_id=None,
                            file_path=path,
                            file_checksum=checksum,
                            method=parser_name,
                            status='error',
                            error_msg=error_msg
                        )
                        stats['errors'] += 1

                except Exception as e:
                    logger.error(f'Extract failed for {path}: {e}')
                    self.log_extraction(
                        node_id=None,
                        file_path=path,
                        file_checksum=checksum,
                        method=parser_name,
                        status='error',
                        error_msg=str(e)
                    )
                    stats['errors'] += 1

                stats['processed'] += 1

                if stats['processed'] % batch_size == 0:
                    conn.commit()
                    logger.info(f'Batch progress: {stats["processed"]}/{len(file_list)} '
                                f'({stats["extracted"]} extracted, {stats["skipped"]} skipped, '
                                f'{stats["errors"]} errors)')

            conn.commit()

        finally:
            cursor.close()
            conn.close()

        return stats
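
The queries above reference two tables, extraction_log and content_nodes, that this commit does not create. Purely as a sketch, a schema consistent with those queries might look like the following; the table and column names come from the queries, but the types, constraints, and defaults are assumptions, and the UNIQUE constraint on (node_type, path, disk_label) is what the ON CONFLICT upsert relies on:

import psycopg2

# schema_sketch.py -- hypothetical, inferred from the queries; not part of this commit
SCHEMA_SQL = '''
CREATE TABLE IF NOT EXISTS content_nodes (
    id            BIGSERIAL PRIMARY KEY,  -- could equally be a UUID in the real schema
    node_type     TEXT NOT NULL,
    path          TEXT NOT NULL,
    disk_label    TEXT NOT NULL,
    checksum      TEXT,
    size          BIGINT,
    content_hash  TEXT,
    extracted_at  TIMESTAMP,
    metadata      JSONB,
    created_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    UNIQUE (node_type, path, disk_label)
);

CREATE TABLE IF NOT EXISTS extraction_log (
    id                 BIGSERIAL PRIMARY KEY,
    node_id            BIGINT REFERENCES content_nodes(id),
    file_path          TEXT NOT NULL,
    file_checksum      TEXT,
    extraction_method  TEXT,
    status             TEXT,
    error_message      TEXT,
    extracted_size     BIGINT,
    processing_time_ms INTEGER,
    created_at         TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
'''

def ensure_schema(conn):
    # Create both tables if missing; psycopg2 accepts multiple statements in one execute.
    with conn.cursor() as cur:
        cur.execute(SCHEMA_SQL)
    conn.commit()

# ensure_schema(psycopg2.connect(dbname='archive', user='archive'))  # placeholder credentials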
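
And a hypothetical end-to-end caller, assuming a parser that returns a dict with either a 'text' key (plus optional 'method' and 'quality') or an 'error' key, which is the contract batch_extract checks for; the module path follows the file location above, while the connection settings, disk label, and glob are placeholders:

import hashlib
from pathlib import Path

from app.extraction.incremental import IncrementalExtractor

def plain_text_parser(path: Path) -> dict:
    # Toy parser: read UTF-8 text; real parsers (PDF, OCR, ...) plug in the same way.
    try:
        return {'text': path.read_text(encoding='utf-8'), 'method': 'plain', 'quality': 'high'}
    except Exception as e:
        return {'error': str(e)}

def file_record(path: Path, disk_label: str) -> dict:
    # batch_extract reads 'path' plus the optional 'checksum', 'disk_label', and 'size' keys.
    data = path.read_bytes()
    return {
        'path': str(path),
        'checksum': hashlib.sha256(data).hexdigest(),
        'disk_label': disk_label,
        'size': len(data),
    }

extractor = IncrementalExtractor({'dbname': 'archive', 'user': 'archive'})  # placeholder config
files = [file_record(p, 'disk01') for p in Path('/mnt/disk01').rglob('*.txt')]
stats = extractor.batch_extract(files, plain_text_parser, parser_name='plain')
print(stats)

A second run over the same list should report everything under 'skipped', since should_extract then sees an unchanged checksum with a 'success' status.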