Files
defrag/app/migration/engine.py
2025-12-13 11:56:06 +01:00

455 lines
14 KiB
Python

"""Migration engine"""
from pathlib import Path
from typing import Optional, Callable
from datetime import datetime
import psycopg2
from psycopg2.extras import execute_batch
from .copy import CopyMigrationStrategy, SafeCopyStrategy
from .hardlink import HardlinkMigrationStrategy, SymlinkMigrationStrategy
from ..shared.models import OperationRecord, ProcessingStats, MigrationPlan
from ..shared.config import DatabaseConfig, ProcessingConfig
from ..shared.logger import ProgressLogger
class MigrationEngine:
    """Plan, execute, record and verify file migrations.

    Files to migrate are read from the ``files`` table; every executed
    operation is written to the ``operations`` table (created on demand by
    ``_ensure_tables``). The engine owns a single lazily opened psycopg2
    connection and can be used as a context manager to close it.
    """

    # Maps an operation type to the attribute holding its strategy object.
    _STRATEGY_ATTRS = {
        'copy': 'copy_strategy',
        'hardlink': 'hardlink_strategy',
        'symlink': 'symlink_strategy',
    }

    def __init__(
        self,
        db_config: DatabaseConfig,
        processing_config: ProcessingConfig,
        logger: ProgressLogger,
        target_base: Path
    ):
        """Initialize migration engine

        Args:
            db_config: Database configuration
            processing_config: Processing configuration
            logger: Progress logger
            target_base: Target base directory for migrations
        """
        self.db_config = db_config
        self.processing_config = processing_config
        self.logger = logger
        self.target_base = Path(target_base)
        self._connection = None

        # Initialize strategies
        self.copy_strategy = SafeCopyStrategy(logger=logger)
        self.hardlink_strategy = HardlinkMigrationStrategy(logger=logger)
        self.symlink_strategy = SymlinkMigrationStrategy(logger=logger)

    def _get_connection(self):
        """Get or create database connection

        Returns:
            The cached connection, reopened if it was never opened or has
            been closed.
        """
        if self._connection is None or self._connection.closed:
            self._connection = psycopg2.connect(
                host=self.db_config.host,
                port=self.db_config.port,
                database=self.db_config.database,
                user=self.db_config.user,
                password=self.db_config.password
            )
        return self._connection

    def _fetch_all(self, sql, params=None):
        """Run a query and return all rows.

        The cursor is always closed. On failure the transaction is rolled
        back so the connection is not left in an aborted state, then the
        error is re-raised.
        """
        conn = self._get_connection()
        try:
            with conn.cursor() as cursor:
                cursor.execute(sql, params)
                return cursor.fetchall()
        except Exception:
            conn.rollback()
            raise

    def _execute_write(self, sql, params=None):
        """Run one write statement and commit it.

        The cursor is always closed. On failure the transaction is rolled
        back so later statements on this connection can succeed, then the
        error is re-raised.
        """
        conn = self._get_connection()
        try:
            with conn.cursor() as cursor:
                cursor.execute(sql, params)
            conn.commit()
        except Exception:
            conn.rollback()
            raise

    def _ensure_tables(self):
        """Ensure migration tables exist"""
        # Create operations table
        self._execute_write("""
            CREATE TABLE IF NOT EXISTS operations (
                id SERIAL PRIMARY KEY,
                source_path TEXT NOT NULL,
                target_path TEXT NOT NULL,
                operation_type TEXT NOT NULL,
                size BIGINT DEFAULT 0,
                status TEXT DEFAULT 'pending',
                error TEXT,
                executed_at TIMESTAMP,
                verified BOOLEAN DEFAULT FALSE,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        # Create index on status
        self._execute_write("""
            CREATE INDEX IF NOT EXISTS idx_operations_status
            ON operations(status)
        """)

    def plan_migration(
        self,
        disk: Optional[str] = None,
        category: Optional[str] = None
    ) -> MigrationPlan:
        """Plan migration for files

        Args:
            disk: Optional disk filter
            category: Optional category filter

        Returns:
            MigrationPlan with planned operations
        """
        self.logger.section("Planning Migration")

        # Build query. Only fixed SQL fragments are interpolated into the
        # statement; user-supplied values go through placeholders.
        conditions = ["category IS NOT NULL"]
        params = []
        if disk:
            conditions.append("disk_label = %s")
            params.append(disk)
        if category:
            conditions.append("category = %s")
            params.append(category)

        query = f"""
            SELECT path, size, category, duplicate_of
            FROM files
            WHERE {' AND '.join(conditions)}
            ORDER BY category, path
        """
        files = self._fetch_all(query, params)
        self.logger.info(f"Found {len(files)} files to migrate")

        operations = []
        total_size = 0
        for path_str, size, file_category, duplicate_of in files:
            source = Path(path_str)
            # Destination is <target_base>/<category>/<file name>.
            # NOTE(review): two sources with the same name and category map
            # to the same target path - confirm the strategies handle this.
            target_path = self.target_base / file_category / source.name
            # Duplicates are hardlinked, unique files are copied.
            operation_type = 'hardlink' if duplicate_of else 'copy'
            operations.append(OperationRecord(
                source_path=source,
                target_path=target_path,
                operation_type=operation_type,
                size=size
            ))
            total_size += size

        plan = MigrationPlan(
            target_disk=str(self.target_base),
            destination_disks=[str(self.target_base)],
            operations=operations,
            total_size=total_size,
            file_count=len(operations)
        )
        self.logger.info(
            f"Migration plan created: {plan.file_count} files, "
            f"{plan.total_size:,} bytes"
        )
        return plan

    def execute_migration(
        self,
        operations: list[OperationRecord],
        dry_run: bool = False,
        progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
    ) -> ProcessingStats:
        """Execute migration operations

        Args:
            operations: List of operations to execute
            dry_run: Whether to perform a dry run
            progress_callback: Optional callback for progress updates; called
                every 100 operations and once more after the last operation

        Returns:
            ProcessingStats with execution statistics
        """
        self.logger.section("Executing Migration" + (" (DRY RUN)" if dry_run else ""))
        self._ensure_tables()

        stats = ProcessingStats()
        total_ops = len(operations)
        for operation in operations:
            stats.files_processed += 1
            if dry_run:
                # In dry run, just log what would happen
                self.logger.debug(
                    f"[DRY RUN] Would {operation.operation_type}: "
                    f"{operation.source_path} -> {operation.target_path}"
                )
                stats.files_succeeded += 1
            elif self._execute_operation(operation):
                stats.files_succeeded += 1
                stats.bytes_processed += operation.size
            else:
                stats.files_failed += 1

            # Progress callback: every 100 operations, plus a final call so
            # the caller always sees the finished state.
            is_last = stats.files_processed == total_ops
            if progress_callback and (stats.files_processed % 100 == 0 or is_last):
                progress_callback(stats.files_processed, total_ops, stats)

            # Log progress
            if stats.files_processed % 1000 == 0:
                self.logger.progress(
                    stats.files_processed,
                    total_ops,
                    prefix="Operations executed",
                    bytes_processed=stats.bytes_processed,
                    elapsed_seconds=stats.elapsed_seconds
                )

        self.logger.info(
            f"Migration {'dry run' if dry_run else 'execution'} complete: "
            f"{stats.files_succeeded}/{total_ops} operations, "
            f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
        )
        return stats

    def _execute_operation(self, operation: OperationRecord) -> bool:
        """Execute a single migration operation

        The file operation and its database record are handled separately:
        the return value and ``operation.status`` always describe what
        happened to the file, and a failure to write the record is logged
        instead of turning a successful migration into a failed one.

        Args:
            operation: Operation to execute; its status, error, executed_at
                and verified fields are updated in place

        Returns:
            True if successful
        """
        operation.status = 'in_progress'
        operation.executed_at = datetime.now()

        try:
            # Select strategy based on operation type
            strategy_attr = self._STRATEGY_ATTRS.get(operation.operation_type)
            if strategy_attr is None:
                raise ValueError(f"Unknown operation type: {operation.operation_type}")
            strategy = getattr(self, strategy_attr)

            # Execute migration
            verify = self.processing_config.verify_operations
            success = strategy.migrate(
                operation.source_path,
                operation.target_path,
                verify=verify
            )
        except Exception as e:
            operation.status = 'failed'
            operation.error = str(e)
            self.logger.error(f"Operation failed: {operation.source_path}: {e}")
            succeeded = False
        else:
            if success:
                operation.status = 'completed'
                # Only claim verification when the strategy was asked to verify.
                operation.verified = bool(verify)
                succeeded = True
            else:
                operation.status = 'failed'
                operation.error = "Migration failed"
                succeeded = False

        # Record exactly once, outside the block guarding the file operation.
        try:
            self._record_operation(operation)
        except Exception as e:
            self.logger.error(
                f"Failed to record operation: {operation.source_path}: {e}"
            )
        return succeeded

    def _record_operation(self, operation: OperationRecord):
        """Record operation in database

        Args:
            operation: Operation to record

        Raises:
            Exception: Whatever the database driver raises; the transaction
                is rolled back first.
        """
        # Column names must match the table created in _ensure_tables
        # (the size column is "size").
        self._execute_write("""
            INSERT INTO operations (
                source_path, target_path, operation_type, size,
                status, error, executed_at, verified
            )
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        """, (
            str(operation.source_path),
            str(operation.target_path),
            operation.operation_type,
            operation.size,
            operation.status,
            operation.error,
            operation.executed_at,
            operation.verified
        ))

    def rollback(self, operation: OperationRecord) -> bool:
        """Rollback a migration operation

        Removes the target path and marks the matching rows in the
        ``operations`` table as rolled back.

        Args:
            operation: Operation to rollback

        Returns:
            True if rollback successful
        """
        self.logger.warning(f"Rolling back: {operation.target_path}")
        try:
            # Remove destination. missing_ok avoids the exists()/unlink()
            # race and also removes dangling symlinks, which exists()
            # reports as missing.
            operation.target_path.unlink(missing_ok=True)
            # Update database
            self._execute_write("""
                UPDATE operations
                SET status = 'rolled_back'
                WHERE source_path = %s AND target_path = %s
            """, (str(operation.source_path), str(operation.target_path)))
            return True
        except Exception as e:
            self.logger.error(f"Rollback failed: {operation.target_path}: {e}")
            return False

    def get_migration_stats(self) -> dict:
        """Get migration statistics

        Returns:
            Dictionary with ``total_operations``, one ``<status>_operations``
            count per status present in the table, and
            ``total_size_migrated`` (sum of sizes of completed operations).
        """
        stats = {}
        # Total operations
        stats['total_operations'] = self._fetch_all(
            "SELECT COUNT(*) FROM operations"
        )[0][0]
        # Operations by status
        status_rows = self._fetch_all("""
            SELECT status, COUNT(*)
            FROM operations
            GROUP BY status
        """)
        for status, count in status_rows:
            stats[f'{status}_operations'] = count
        # Total size migrated. SUM over BIGINT is NUMERIC in PostgreSQL and
        # arrives as Decimal; expose a plain int.
        stats['total_size_migrated'] = int(self._fetch_all("""
            SELECT COALESCE(SUM(size), 0)
            FROM operations
            WHERE status = 'completed'
        """)[0][0])
        return stats

    def verify_migrations(self) -> dict:
        """Verify completed migrations

        Checks every completed, not-yet-verified operation: the target must
        exist; hardlinks must refer to the same file as the source; other
        operations must have the recorded size.

        Returns:
            Dictionary with verification results
        """
        self.logger.subsection("Verifying Migrations")
        operations = self._fetch_all("""
            SELECT source_path, target_path, operation_type, size
            FROM operations
            WHERE status = 'completed' AND verified = FALSE
        """)
        results = {
            'total': len(operations),
            'verified': 0,
            'failed': 0
        }
        for source_str, dest_str, op_type, size in operations:
            source = Path(source_str)
            dest = Path(dest_str)
            # Verify destination exists
            if not dest.exists():
                results['failed'] += 1
                self.logger.warning(f"Verification failed: {dest} does not exist")
                continue
            # Verify based on operation type
            try:
                if op_type == 'hardlink':
                    # samefile compares device and inode, so equal inode
                    # numbers on different filesystems do not match.
                    ok = source.samefile(dest)
                else:
                    # Compare against the recorded size when one is stored.
                    ok = size is None or dest.stat().st_size == size
            except OSError:
                # Source missing or a path became unreadable: not verifiable.
                ok = False
            if ok:
                results['verified'] += 1
            else:
                results['failed'] += 1
                self.logger.warning(
                    f"Verification failed: {dest} does not match {source}"
                )
        self.logger.info(
            f"Verification complete: {results['verified']}/{results['total']} verified"
        )
        return results

    def close(self):
        """Close database connection"""
        if self._connection and not self._connection.closed:
            self._connection.close()

    def __enter__(self):
        """Context manager entry"""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit"""
        self.close()