clean up code

mike
2025-12-13 12:00:34 +01:00
parent 2b2c575385
commit 7ce8c8c73d
10 changed files with 158 additions and 1471 deletions

View File

@@ -1,21 +1,4 @@
-"""Deduplication package exports"""
-from .chunker import (
-    RabinChunker,
-    SimpleChunker,
-    hash_chunk,
-    hash_file,
-    compute_file_signature
-)
+from .chunker import RabinChunker, SimpleChunker, hash_chunk, hash_file, compute_file_signature
 from .store import HashStore, MemoryHashStore
 from .engine import DeduplicationEngine
-__all__ = [
-    'RabinChunker',
-    'SimpleChunker',
-    'hash_chunk',
-    'hash_file',
-    'compute_file_signature',
-    'HashStore',
-    'MemoryHashStore',
-    'DeduplicationEngine',
-]
+__all__ = ['RabinChunker', 'SimpleChunker', 'hash_chunk', 'hash_file', 'compute_file_signature', 'HashStore', 'MemoryHashStore', 'DeduplicationEngine']

View File

@@ -0,0 +1 @@

View File

@@ -1,75 +1,29 @@
"""Rabin fingerprint chunker for content-defined chunking"""
import hashlib import hashlib
from pathlib import Path from pathlib import Path
from typing import Iterator, Optional from typing import Iterator, Optional
class RabinChunker: class RabinChunker:
"""Content-defined chunking using Rabin fingerprinting
Uses a rolling hash to identify chunk boundaries based on content, def __init__(self, avg_chunk_size: int=8192, min_chunk_size: Optional[int]=None, max_chunk_size: Optional[int]=None, window_size: int=48):
allowing for efficient deduplication even when data is modified.
"""
def __init__(
self,
avg_chunk_size: int = 8192,
min_chunk_size: Optional[int] = None,
max_chunk_size: Optional[int] = None,
window_size: int = 48
):
"""Initialize Rabin chunker
Args:
avg_chunk_size: Target average chunk size in bytes
min_chunk_size: Minimum chunk size (default: avg_chunk_size // 4)
max_chunk_size: Maximum chunk size (default: avg_chunk_size * 8)
window_size: Rolling hash window size
"""
self.avg_chunk_size = avg_chunk_size self.avg_chunk_size = avg_chunk_size
self.min_chunk_size = min_chunk_size or (avg_chunk_size // 4) self.min_chunk_size = min_chunk_size or avg_chunk_size // 4
self.max_chunk_size = max_chunk_size or (avg_chunk_size * 8) self.max_chunk_size = max_chunk_size or avg_chunk_size * 8
self.window_size = window_size self.window_size = window_size
# Calculate mask for boundary detection
# For avg_chunk_size, we want boundaries at 1/avg_chunk_size probability
bits = 0 bits = 0
size = avg_chunk_size size = avg_chunk_size
while size > 1: while size > 1:
bits += 1 bits += 1
size >>= 1 size >>= 1
self.mask = (1 << bits) - 1 self.mask = (1 << bits) - 1
self.poly = 17349423945073011
# Polynomial for rolling hash (prime number) def chunk_file(self, file_path: Path, chunk_size: Optional[int]=None) -> Iterator[bytes]:
self.poly = 0x3DA3358B4DC173
def chunk_file(self, file_path: Path, chunk_size: Optional[int] = None) -> Iterator[bytes]:
"""Chunk a file using Rabin fingerprinting
Args:
file_path: Path to file to chunk
chunk_size: If provided, use fixed-size chunking instead
Yields:
Chunk data as bytes
"""
if chunk_size: if chunk_size:
# Use fixed-size chunking
yield from self._chunk_fixed(file_path, chunk_size) yield from self._chunk_fixed(file_path, chunk_size)
else: else:
# Use content-defined chunking
yield from self._chunk_rabin(file_path) yield from self._chunk_rabin(file_path)
def _chunk_fixed(self, file_path: Path, chunk_size: int) -> Iterator[bytes]: def _chunk_fixed(self, file_path: Path, chunk_size: int) -> Iterator[bytes]:
"""Fixed-size chunking
Args:
file_path: Path to file
chunk_size: Chunk size in bytes
Yields:
Fixed-size chunks
"""
with open(file_path, 'rb') as f: with open(file_path, 'rb') as f:
while True: while True:
chunk = f.read(chunk_size) chunk = f.read(chunk_size)
@@ -78,46 +32,22 @@ class RabinChunker:
yield chunk yield chunk
def _chunk_rabin(self, file_path: Path) -> Iterator[bytes]: def _chunk_rabin(self, file_path: Path) -> Iterator[bytes]:
"""Content-defined chunking using Rabin fingerprinting
Args:
file_path: Path to file
Yields:
Variable-size chunks based on content
"""
with open(file_path, 'rb') as f: with open(file_path, 'rb') as f:
chunk_data = bytearray() chunk_data = bytearray()
window = bytearray() window = bytearray()
hash_value = 0 hash_value = 0
while True: while True:
byte = f.read(1) byte = f.read(1)
if not byte: if not byte:
# End of file - yield remaining data
if chunk_data: if chunk_data:
yield bytes(chunk_data) yield bytes(chunk_data)
break break
chunk_data.extend(byte) chunk_data.extend(byte)
window.extend(byte) window.extend(byte)
# Maintain window size
if len(window) > self.window_size: if len(window) > self.window_size:
window.pop(0) window.pop(0)
# Update rolling hash
hash_value = self._rolling_hash(window) hash_value = self._rolling_hash(window)
should_break = len(chunk_data) >= self.min_chunk_size and (hash_value & self.mask == 0 or len(chunk_data) >= self.max_chunk_size)
# Check if we should create a boundary
should_break = (
len(chunk_data) >= self.min_chunk_size and
(
(hash_value & self.mask) == 0 or
len(chunk_data) >= self.max_chunk_size
)
)
if should_break: if should_break:
yield bytes(chunk_data) yield bytes(chunk_data)
chunk_data = bytearray() chunk_data = bytearray()
@@ -125,40 +55,17 @@ class RabinChunker:
hash_value = 0 hash_value = 0
def _rolling_hash(self, window: bytearray) -> int: def _rolling_hash(self, window: bytearray) -> int:
"""Calculate rolling hash for window
Args:
window: Byte window
Returns:
Hash value
"""
hash_value = 0 hash_value = 0
for byte in window: for byte in window:
hash_value = ((hash_value << 1) + byte) & 0xFFFFFFFFFFFFFFFF hash_value = (hash_value << 1) + byte & 18446744073709551615
return hash_value return hash_value
class SimpleChunker: class SimpleChunker:
"""Simple fixed-size chunker for comparison"""
def __init__(self, chunk_size: int = 8192): def __init__(self, chunk_size: int=8192):
"""Initialize simple chunker
Args:
chunk_size: Fixed chunk size in bytes
"""
self.chunk_size = chunk_size self.chunk_size = chunk_size
def chunk_file(self, file_path: Path) -> Iterator[bytes]: def chunk_file(self, file_path: Path) -> Iterator[bytes]:
"""Chunk file into fixed-size pieces
Args:
file_path: Path to file
Yields:
Fixed-size chunks
"""
with open(file_path, 'rb') as f: with open(file_path, 'rb') as f:
while True: while True:
chunk = f.read(self.chunk_size) chunk = f.read(self.chunk_size)
@@ -166,76 +73,31 @@ class SimpleChunker:
break break
yield chunk yield chunk
def hash_chunk(chunk: bytes, algorithm: str='sha256') -> str:
def hash_chunk(chunk: bytes, algorithm: str = 'sha256') -> str:
"""Hash a chunk of data
Args:
chunk: Chunk data
algorithm: Hash algorithm (default: sha256)
Returns:
Hex digest of hash
"""
hasher = hashlib.new(algorithm) hasher = hashlib.new(algorithm)
hasher.update(chunk) hasher.update(chunk)
return hasher.hexdigest() return hasher.hexdigest()
def hash_file(file_path: Path, algorithm: str='sha256', chunk_size: int=65536) -> str:
def hash_file(file_path: Path, algorithm: str = 'sha256', chunk_size: int = 65536) -> str:
"""Hash entire file
Args:
file_path: Path to file
algorithm: Hash algorithm (default: sha256)
chunk_size: Size of chunks to read
Returns:
Hex digest of file hash
"""
hasher = hashlib.new(algorithm) hasher = hashlib.new(algorithm)
with open(file_path, 'rb') as f: with open(file_path, 'rb') as f:
while True: while True:
chunk = f.read(chunk_size) chunk = f.read(chunk_size)
if not chunk: if not chunk:
break break
hasher.update(chunk) hasher.update(chunk)
return hasher.hexdigest() return hasher.hexdigest()
def compute_file_signature(file_path: Path, use_rabin: bool=True, avg_chunk_size: int=8192) -> tuple[str, list[str]]:
def compute_file_signature(
file_path: Path,
use_rabin: bool = True,
avg_chunk_size: int = 8192
) -> tuple[str, list[str]]:
"""Compute file signature with chunk hashes
Args:
file_path: Path to file
use_rabin: Whether to use Rabin chunking (vs fixed-size)
avg_chunk_size: Average chunk size for Rabin or fixed size
Returns:
Tuple of (file_hash, list of chunk hashes)
"""
if use_rabin: if use_rabin:
chunker = RabinChunker(avg_chunk_size=avg_chunk_size) chunker = RabinChunker(avg_chunk_size=avg_chunk_size)
else: else:
chunker = SimpleChunker(chunk_size=avg_chunk_size) chunker = SimpleChunker(chunk_size=avg_chunk_size)
chunk_hashes = [] chunk_hashes = []
file_hasher = hashlib.sha256() file_hasher = hashlib.sha256()
for chunk in chunker.chunk_file(file_path): for chunk in chunker.chunk_file(file_path):
# Hash individual chunk
chunk_hash = hash_chunk(chunk) chunk_hash = hash_chunk(chunk)
chunk_hashes.append(chunk_hash) chunk_hashes.append(chunk_hash)
# Update file hash
file_hasher.update(chunk) file_hasher.update(chunk)
file_hash = file_hasher.hexdigest() file_hash = file_hasher.hexdigest()
return (file_hash, chunk_hashes)
return file_hash, chunk_hashes
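
The docstrings stripped above described content-defined chunking: boundaries come from a rolling hash of the data, so they track content rather than byte offsets and survive inserts or deletes elsewhere in the file. A minimal usage sketch of the collapsed API follows, assuming the module is importable as dedup.chunker (the commit view does not show file paths, so that name is an assumption):

import os
import tempfile
from pathlib import Path

from dedup.chunker import RabinChunker, compute_file_signature  # hypothetical import path

# Self-contained input: a throwaway file of random bytes.
sample = Path(tempfile.mkdtemp()) / 'sample.bin'
sample.write_bytes(os.urandom(64 * 1024))

# Content-defined chunking: a boundary is emitted when the rolling hash,
# masked to roughly log2(avg_chunk_size) bits, hits zero, subject to the
# min/max chunk size limits.
chunker = RabinChunker(avg_chunk_size=8192)
sizes = [len(chunk) for chunk in chunker.chunk_file(sample)]
print(f'{len(sizes)} chunks, avg size ~{sum(sizes) // len(sizes)} bytes')

# File signature = whole-file SHA-256 plus the ordered per-chunk hashes
# that the deduplication engine stores for chunk-level matching.
file_hash, chunk_hashes = compute_file_signature(sample, use_rabin=True, avg_chunk_size=8192)
print(file_hash[:16], len(chunk_hashes))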

View File

@@ -1,32 +1,16 @@
"""Deduplication engine"""
from pathlib import Path from pathlib import Path
from typing import Optional, Callable from typing import Optional, Callable
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
import psycopg2 import psycopg2
from .chunker import compute_file_signature, hash_file from .chunker import compute_file_signature, hash_file
from .store import HashStore from .store import HashStore
from ..shared.models import FileRecord, ProcessingStats from ..shared.models import FileRecord, ProcessingStats
from ..shared.config import DatabaseConfig, ProcessingConfig from ..shared.config import DatabaseConfig, ProcessingConfig
from ..shared.logger import ProgressLogger from ..shared.logger import ProgressLogger
class DeduplicationEngine: class DeduplicationEngine:
"""Engine for deduplicating files"""
def __init__( def __init__(self, db_config: DatabaseConfig, processing_config: ProcessingConfig, logger: ProgressLogger):
self,
db_config: DatabaseConfig,
processing_config: ProcessingConfig,
logger: ProgressLogger
):
"""Initialize deduplication engine
Args:
db_config: Database configuration
processing_config: Processing configuration
logger: Progress logger
"""
self.db_config = db_config self.db_config = db_config
self.processing_config = processing_config self.processing_config = processing_config
self.logger = logger self.logger = logger
@@ -34,320 +18,130 @@ class DeduplicationEngine:
self._connection = None self._connection = None
def _get_connection(self): def _get_connection(self):
"""Get or create database connection"""
if self._connection is None or self._connection.closed: if self._connection is None or self._connection.closed:
self._connection = psycopg2.connect( self._connection = psycopg2.connect(host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password)
host=self.db_config.host,
port=self.db_config.port,
database=self.db_config.database,
user=self.db_config.user,
password=self.db_config.password
)
return self._connection return self._connection
def deduplicate_all( def deduplicate_all(self, disk: Optional[str]=None, use_chunks: bool=True, progress_callback: Optional[Callable[[int, int, ProcessingStats], None]]=None) -> ProcessingStats:
self, self.logger.section('Starting Deduplication')
disk: Optional[str] = None,
use_chunks: bool = True,
progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
) -> ProcessingStats:
"""Deduplicate all files in database
Args:
disk: Optional disk filter
use_chunks: Whether to use chunk-level deduplication
progress_callback: Optional callback for progress updates
Returns:
ProcessingStats with deduplication statistics
"""
self.logger.section("Starting Deduplication")
conn = self._get_connection() conn = self._get_connection()
cursor = conn.cursor() cursor = conn.cursor()
# Get files without checksums
if disk: if disk:
cursor.execute(""" cursor.execute('\n SELECT path, size\n FROM files\n WHERE disk_label = %s AND checksum IS NULL\n ORDER BY size DESC\n ', (disk,))
SELECT path, size
FROM files
WHERE disk_label = %s AND checksum IS NULL
ORDER BY size DESC
""", (disk,))
else: else:
cursor.execute(""" cursor.execute('\n SELECT path, size\n FROM files\n WHERE checksum IS NULL\n ORDER BY size DESC\n ')
SELECT path, size
FROM files
WHERE checksum IS NULL
ORDER BY size DESC
""")
files_to_process = cursor.fetchall() files_to_process = cursor.fetchall()
total_files = len(files_to_process) total_files = len(files_to_process)
self.logger.info(f'Found {total_files} files to process')
self.logger.info(f"Found {total_files} files to process")
stats = ProcessingStats() stats = ProcessingStats()
# Process files with thread pool
with ThreadPoolExecutor(max_workers=self.processing_config.parallel_workers) as executor: with ThreadPoolExecutor(max_workers=self.processing_config.parallel_workers) as executor:
futures = {} futures = {}
for path_str, size in files_to_process: for path_str, size in files_to_process:
path = Path(path_str) path = Path(path_str)
future = executor.submit(self._process_file, path, use_chunks) future = executor.submit(self._process_file, path, use_chunks)
futures[future] = (path, size) futures[future] = (path, size)
# Process completed futures
for future in as_completed(futures): for future in as_completed(futures):
path, size = futures[future] path, size = futures[future]
try: try:
checksum, duplicate_of = future.result() checksum, duplicate_of = future.result()
if checksum: if checksum:
# Update database cursor.execute('\n UPDATE files\n SET checksum = %s, duplicate_of = %s\n WHERE path = %s\n ', (checksum, duplicate_of, str(path)))
cursor.execute("""
UPDATE files
SET checksum = %s, duplicate_of = %s
WHERE path = %s
""", (checksum, duplicate_of, str(path)))
stats.files_succeeded += 1 stats.files_succeeded += 1
stats.bytes_processed += size stats.bytes_processed += size
stats.files_processed += 1 stats.files_processed += 1
# Commit periodically
if stats.files_processed % self.processing_config.commit_interval == 0: if stats.files_processed % self.processing_config.commit_interval == 0:
conn.commit() conn.commit()
# Progress callback
if progress_callback: if progress_callback:
progress_callback(stats.files_processed, total_files, stats) progress_callback(stats.files_processed, total_files, stats)
self.logger.progress(stats.files_processed, total_files, prefix='Files processed', bytes_processed=stats.bytes_processed, elapsed_seconds=stats.elapsed_seconds)
# Log progress
self.logger.progress(
stats.files_processed,
total_files,
prefix="Files processed",
bytes_processed=stats.bytes_processed,
elapsed_seconds=stats.elapsed_seconds
)
except Exception as e: except Exception as e:
self.logger.warning(f"Failed to process {path}: {e}") self.logger.warning(f'Failed to process {path}: {e}')
stats.files_failed += 1 stats.files_failed += 1
stats.files_processed += 1 stats.files_processed += 1
# Final commit
conn.commit() conn.commit()
cursor.close() cursor.close()
self.logger.info(f'Deduplication complete: {stats.files_succeeded}/{total_files} files, {stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s')
self.logger.info(
f"Deduplication complete: {stats.files_succeeded}/{total_files} files, "
f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
)
return stats return stats
def _process_file( def _process_file(self, path: Path, use_chunks: bool) -> tuple[Optional[str], Optional[str]]:
self,
path: Path,
use_chunks: bool
) -> tuple[Optional[str], Optional[str]]:
"""Process a single file for deduplication
Args:
path: Path to file
use_chunks: Whether to use chunk-level deduplication
Returns:
Tuple of (checksum, duplicate_of_path)
"""
if not path.exists(): if not path.exists():
return None, None return (None, None)
try: try:
if use_chunks: if use_chunks:
# Compute file signature with chunks checksum, chunk_hashes = compute_file_signature(path, use_rabin=True, avg_chunk_size=self.processing_config.chunk_size)
checksum, chunk_hashes = compute_file_signature(
path,
use_rabin=True,
avg_chunk_size=self.processing_config.chunk_size
)
else: else:
# Just compute file hash checksum = hash_file(path, algorithm=self.processing_config.hash_algorithm)
checksum = hash_file(
path,
algorithm=self.processing_config.hash_algorithm
)
chunk_hashes = None chunk_hashes = None
# Check if hash exists
if self.hash_store.exists(checksum): if self.hash_store.exists(checksum):
# Duplicate found
canonical_path = self.hash_store.get_canonical(checksum) canonical_path = self.hash_store.get_canonical(checksum)
return checksum, canonical_path return (checksum, canonical_path)
else: else:
# New unique file
size = path.stat().st_size size = path.stat().st_size
self.hash_store.store_canonical( self.hash_store.store_canonical(checksum, path, size, chunk_hashes)
checksum, return (checksum, None)
path,
size,
chunk_hashes
)
return checksum, None
except Exception as e: except Exception as e:
self.logger.debug(f"Error processing {path}: {e}") self.logger.debug(f'Error processing {path}: {e}')
raise raise
def find_duplicates( def find_duplicates(self, disk: Optional[str]=None) -> dict[str, list[str]]:
self, self.logger.subsection('Finding Duplicates')
disk: Optional[str] = None
) -> dict[str, list[str]]:
"""Find all duplicate files
Args:
disk: Optional disk filter
Returns:
Dictionary mapping canonical path to list of duplicate paths
"""
self.logger.subsection("Finding Duplicates")
conn = self._get_connection() conn = self._get_connection()
cursor = conn.cursor() cursor = conn.cursor()
# Query for duplicates
if disk: if disk:
cursor.execute(""" cursor.execute('\n SELECT checksum, array_agg(path ORDER BY path) as paths\n FROM files\n WHERE disk_label = %s AND checksum IS NOT NULL\n GROUP BY checksum\n HAVING COUNT(*) > 1\n ', (disk,))
SELECT checksum, array_agg(path ORDER BY path) as paths
FROM files
WHERE disk_label = %s AND checksum IS NOT NULL
GROUP BY checksum
HAVING COUNT(*) > 1
""", (disk,))
else: else:
cursor.execute(""" cursor.execute('\n SELECT checksum, array_agg(path ORDER BY path) as paths\n FROM files\n WHERE checksum IS NOT NULL\n GROUP BY checksum\n HAVING COUNT(*) > 1\n ')
SELECT checksum, array_agg(path ORDER BY path) as paths
FROM files
WHERE checksum IS NOT NULL
GROUP BY checksum
HAVING COUNT(*) > 1
""")
duplicates = {} duplicates = {}
for checksum, paths in cursor.fetchall(): for checksum, paths in cursor.fetchall():
canonical = paths[0] canonical = paths[0]
duplicates[canonical] = paths[1:] duplicates[canonical] = paths[1:]
cursor.close() cursor.close()
self.logger.info(f'Found {len(duplicates)} sets of duplicates')
self.logger.info(f"Found {len(duplicates)} sets of duplicates")
return duplicates return duplicates
def get_deduplication_stats(self) -> dict: def get_deduplication_stats(self) -> dict:
"""Get deduplication statistics
Returns:
Dictionary with statistics
"""
conn = self._get_connection() conn = self._get_connection()
cursor = conn.cursor() cursor = conn.cursor()
stats = {} stats = {}
cursor.execute('SELECT COUNT(*) FROM files WHERE checksum IS NOT NULL')
# Total files
cursor.execute("SELECT COUNT(*) FROM files WHERE checksum IS NOT NULL")
stats['total_files'] = cursor.fetchone()[0] stats['total_files'] = cursor.fetchone()[0]
cursor.execute('SELECT COUNT(DISTINCT checksum) FROM files WHERE checksum IS NOT NULL')
# Unique files
cursor.execute("SELECT COUNT(DISTINCT checksum) FROM files WHERE checksum IS NOT NULL")
stats['unique_files'] = cursor.fetchone()[0] stats['unique_files'] = cursor.fetchone()[0]
# Duplicate files
stats['duplicate_files'] = stats['total_files'] - stats['unique_files'] stats['duplicate_files'] = stats['total_files'] - stats['unique_files']
cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files WHERE checksum IS NOT NULL')
# Total size
cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE checksum IS NOT NULL")
stats['total_size'] = cursor.fetchone()[0] stats['total_size'] = cursor.fetchone()[0]
cursor.execute('\n SELECT COALESCE(SUM(size), 0)\n FROM (\n SELECT DISTINCT ON (checksum) size\n FROM files\n WHERE checksum IS NOT NULL\n ) AS unique_files\n ')
# Unique size
cursor.execute("""
SELECT COALESCE(SUM(size), 0)
FROM (
SELECT DISTINCT ON (checksum) size
FROM files
WHERE checksum IS NOT NULL
) AS unique_files
""")
stats['unique_size'] = cursor.fetchone()[0] stats['unique_size'] = cursor.fetchone()[0]
# Wasted space
stats['wasted_space'] = stats['total_size'] - stats['unique_size'] stats['wasted_space'] = stats['total_size'] - stats['unique_size']
# Deduplication ratio
if stats['total_size'] > 0: if stats['total_size'] > 0:
stats['dedup_ratio'] = stats['unique_size'] / stats['total_size'] stats['dedup_ratio'] = stats['unique_size'] / stats['total_size']
else: else:
stats['dedup_ratio'] = 1.0 stats['dedup_ratio'] = 1.0
# Space saved percentage
if stats['total_size'] > 0: if stats['total_size'] > 0:
stats['space_saved_percent'] = (stats['wasted_space'] / stats['total_size']) * 100 stats['space_saved_percent'] = stats['wasted_space'] / stats['total_size'] * 100
else: else:
stats['space_saved_percent'] = 0.0 stats['space_saved_percent'] = 0.0
cursor.close() cursor.close()
return stats return stats
def mark_canonical_files(self) -> int: def mark_canonical_files(self) -> int:
"""Mark canonical (first occurrence) files in database self.logger.subsection('Marking Canonical Files')
Returns:
Number of canonical files marked
"""
self.logger.subsection("Marking Canonical Files")
conn = self._get_connection() conn = self._get_connection()
cursor = conn.cursor() cursor = conn.cursor()
cursor.execute('\n WITH canonical AS (\n SELECT DISTINCT ON (checksum) path, checksum\n FROM files\n WHERE checksum IS NOT NULL\n ORDER BY checksum, path\n )\n UPDATE files\n SET duplicate_of = NULL\n WHERE path IN (SELECT path FROM canonical)\n ')
# Find first occurrence of each checksum and mark as canonical
cursor.execute("""
WITH canonical AS (
SELECT DISTINCT ON (checksum) path, checksum
FROM files
WHERE checksum IS NOT NULL
ORDER BY checksum, path
)
UPDATE files
SET duplicate_of = NULL
WHERE path IN (SELECT path FROM canonical)
""")
count = cursor.rowcount count = cursor.rowcount
conn.commit() conn.commit()
cursor.close() cursor.close()
self.logger.info(f'Marked {count} canonical files')
self.logger.info(f"Marked {count} canonical files")
return count return count
def close(self): def close(self):
"""Close connections"""
self.hash_store.close() self.hash_store.close()
if self._connection and not self._connection.closed: if self._connection and (not self._connection.closed):
self._connection.close() self._connection.close()
def __enter__(self): def __enter__(self):
"""Context manager entry"""
return self return self
def __exit__(self, exc_type, exc_val, exc_tb): def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit"""
self.close() self.close()
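
The engine's per-file logic boils down to: hash the file, then either record it as a duplicate of an existing canonical path or store it as the new canonical copy. A sketch of that decision against the in-memory store, with made-up file names and an assumed dedup package path:

import os
import tempfile
from pathlib import Path

from dedup.chunker import hash_file        # hypothetical import path
from dedup.store import MemoryHashStore    # hypothetical import path

root = Path(tempfile.mkdtemp())
(root / 'a.bin').write_bytes(b'same payload')
(root / 'b.bin').write_bytes(b'same payload')   # byte-for-byte duplicate of a.bin
(root / 'c.bin').write_bytes(os.urandom(1024))  # unique content

store = MemoryHashStore()
for path in sorted(root.iterdir()):
    checksum = hash_file(path)
    if store.exists(checksum):
        # Duplicate: record which canonical file it points at.
        print(f'{path.name}: duplicate of {store.get_canonical(checksum)}')
    else:
        # First occurrence: becomes the canonical copy for this checksum.
        store.store_canonical(checksum, path, path.stat().st_size)
        print(f'{path.name}: canonical')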

View File

@@ -1,412 +1,174 @@
"""Hash store for deduplication with optional Redis support"""
from typing import Optional, Dict, Set from typing import Optional, Dict, Set
from pathlib import Path from pathlib import Path
import psycopg2 import psycopg2
from psycopg2.extras import execute_batch from psycopg2.extras import execute_batch
from ..shared.config import DatabaseConfig from ..shared.config import DatabaseConfig
class HashStore: class HashStore:
"""PostgreSQL-based hash store for deduplication"""
def __init__(self, db_config: DatabaseConfig): def __init__(self, db_config: DatabaseConfig):
"""Initialize hash store
Args:
db_config: Database configuration
"""
self.db_config = db_config self.db_config = db_config
self._connection = None self._connection = None
def _get_connection(self): def _get_connection(self):
"""Get or create database connection"""
if self._connection is None or self._connection.closed: if self._connection is None or self._connection.closed:
self._connection = psycopg2.connect( self._connection = psycopg2.connect(host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password)
host=self.db_config.host,
port=self.db_config.port,
database=self.db_config.database,
user=self.db_config.user,
password=self.db_config.password
)
return self._connection return self._connection
def _ensure_tables(self): def _ensure_tables(self):
"""Ensure hash store tables exist"""
conn = self._get_connection() conn = self._get_connection()
cursor = conn.cursor() cursor = conn.cursor()
cursor.execute('\n CREATE TABLE IF NOT EXISTS file_hashes (\n checksum TEXT PRIMARY KEY,\n canonical_path TEXT NOT NULL,\n size BIGINT NOT NULL,\n first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n ref_count INTEGER DEFAULT 1\n )\n ')
# Create hashes table for file-level deduplication cursor.execute('\n CREATE TABLE IF NOT EXISTS chunk_hashes (\n chunk_hash TEXT PRIMARY KEY,\n size INTEGER NOT NULL,\n first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n ref_count INTEGER DEFAULT 1\n )\n ')
cursor.execute(""" cursor.execute('\n CREATE TABLE IF NOT EXISTS file_chunks (\n id SERIAL PRIMARY KEY,\n file_checksum TEXT NOT NULL,\n chunk_hash TEXT NOT NULL,\n chunk_index INTEGER NOT NULL,\n FOREIGN KEY (file_checksum) REFERENCES file_hashes(checksum),\n FOREIGN KEY (chunk_hash) REFERENCES chunk_hashes(chunk_hash),\n UNIQUE (file_checksum, chunk_index)\n )\n ')
CREATE TABLE IF NOT EXISTS file_hashes ( cursor.execute('\n CREATE INDEX IF NOT EXISTS idx_file_chunks_file\n ON file_chunks(file_checksum)\n ')
checksum TEXT PRIMARY KEY, cursor.execute('\n CREATE INDEX IF NOT EXISTS idx_file_chunks_chunk\n ON file_chunks(chunk_hash)\n ')
canonical_path TEXT NOT NULL,
size BIGINT NOT NULL,
first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
ref_count INTEGER DEFAULT 1
)
""")
# Create chunk hashes table for chunk-level deduplication
cursor.execute("""
CREATE TABLE IF NOT EXISTS chunk_hashes (
chunk_hash TEXT PRIMARY KEY,
size INTEGER NOT NULL,
first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
ref_count INTEGER DEFAULT 1
)
""")
# Create file-chunk mapping table
cursor.execute("""
CREATE TABLE IF NOT EXISTS file_chunks (
id SERIAL PRIMARY KEY,
file_checksum TEXT NOT NULL,
chunk_hash TEXT NOT NULL,
chunk_index INTEGER NOT NULL,
FOREIGN KEY (file_checksum) REFERENCES file_hashes(checksum),
FOREIGN KEY (chunk_hash) REFERENCES chunk_hashes(chunk_hash),
UNIQUE (file_checksum, chunk_index)
)
""")
# Create indexes
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_file_chunks_file
ON file_chunks(file_checksum)
""")
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_file_chunks_chunk
ON file_chunks(chunk_hash)
""")
conn.commit() conn.commit()
cursor.close() cursor.close()
def exists(self, checksum: str) -> bool: def exists(self, checksum: str) -> bool:
"""Check if hash exists in store
Args:
checksum: File hash to check
Returns:
True if hash exists
"""
self._ensure_tables() self._ensure_tables()
conn = self._get_connection() conn = self._get_connection()
cursor = conn.cursor() cursor = conn.cursor()
cursor.execute('SELECT 1 FROM file_hashes WHERE checksum = %s LIMIT 1', (checksum,))
cursor.execute(
"SELECT 1 FROM file_hashes WHERE checksum = %s LIMIT 1",
(checksum,)
)
exists = cursor.fetchone() is not None exists = cursor.fetchone() is not None
cursor.close() cursor.close()
return exists return exists
def get_canonical(self, checksum: str) -> Optional[str]: def get_canonical(self, checksum: str) -> Optional[str]:
"""Get canonical path for a hash
Args:
checksum: File hash
Returns:
Canonical file path or None if not found
"""
self._ensure_tables() self._ensure_tables()
conn = self._get_connection() conn = self._get_connection()
cursor = conn.cursor() cursor = conn.cursor()
cursor.execute('SELECT canonical_path FROM file_hashes WHERE checksum = %s', (checksum,))
cursor.execute(
"SELECT canonical_path FROM file_hashes WHERE checksum = %s",
(checksum,)
)
result = cursor.fetchone() result = cursor.fetchone()
cursor.close() cursor.close()
return result[0] if result else None return result[0] if result else None
def store_canonical( def store_canonical(self, checksum: str, path: Path, size: int, chunk_hashes: Optional[list[str]]=None) -> None:
self,
checksum: str,
path: Path,
size: int,
chunk_hashes: Optional[list[str]] = None
) -> None:
"""Store canonical reference for a hash
Args:
checksum: File hash
path: Canonical file path
size: File size in bytes
chunk_hashes: Optional list of chunk hashes
"""
self._ensure_tables() self._ensure_tables()
conn = self._get_connection() conn = self._get_connection()
cursor = conn.cursor() cursor = conn.cursor()
try: try:
# Store file hash cursor.execute('\n INSERT INTO file_hashes (checksum, canonical_path, size)\n VALUES (%s, %s, %s)\n ON CONFLICT (checksum) DO UPDATE SET\n ref_count = file_hashes.ref_count + 1\n ', (checksum, str(path), size))
cursor.execute("""
INSERT INTO file_hashes (checksum, canonical_path, size)
VALUES (%s, %s, %s)
ON CONFLICT (checksum) DO UPDATE SET
ref_count = file_hashes.ref_count + 1
""", (checksum, str(path), size))
# Store chunk hashes if provided
if chunk_hashes: if chunk_hashes:
# Insert chunk hashes
chunk_data = [(chunk_hash, 0) for chunk_hash in chunk_hashes] chunk_data = [(chunk_hash, 0) for chunk_hash in chunk_hashes]
execute_batch(cursor, """ execute_batch(cursor, '\n INSERT INTO chunk_hashes (chunk_hash, size)\n VALUES (%s, %s)\n ON CONFLICT (chunk_hash) DO UPDATE SET\n ref_count = chunk_hashes.ref_count + 1\n ', chunk_data, page_size=1000)
INSERT INTO chunk_hashes (chunk_hash, size) mapping_data = [(checksum, chunk_hash, idx) for idx, chunk_hash in enumerate(chunk_hashes)]
VALUES (%s, %s) execute_batch(cursor, '\n INSERT INTO file_chunks (file_checksum, chunk_hash, chunk_index)\n VALUES (%s, %s, %s)\n ON CONFLICT (file_checksum, chunk_index) DO NOTHING\n ', mapping_data, page_size=1000)
ON CONFLICT (chunk_hash) DO UPDATE SET
ref_count = chunk_hashes.ref_count + 1
""", chunk_data, page_size=1000)
# Create file-chunk mappings
mapping_data = [
(checksum, chunk_hash, idx)
for idx, chunk_hash in enumerate(chunk_hashes)
]
execute_batch(cursor, """
INSERT INTO file_chunks (file_checksum, chunk_hash, chunk_index)
VALUES (%s, %s, %s)
ON CONFLICT (file_checksum, chunk_index) DO NOTHING
""", mapping_data, page_size=1000)
conn.commit() conn.commit()
except Exception as e: except Exception as e:
conn.rollback() conn.rollback()
raise raise
finally: finally:
cursor.close() cursor.close()
def get_chunk_hashes(self, checksum: str) -> list[str]: def get_chunk_hashes(self, checksum: str) -> list[str]:
"""Get chunk hashes for a file
Args:
checksum: File hash
Returns:
List of chunk hashes in order
"""
self._ensure_tables() self._ensure_tables()
conn = self._get_connection() conn = self._get_connection()
cursor = conn.cursor() cursor = conn.cursor()
cursor.execute('\n SELECT chunk_hash\n FROM file_chunks\n WHERE file_checksum = %s\n ORDER BY chunk_index\n ', (checksum,))
cursor.execute("""
SELECT chunk_hash
FROM file_chunks
WHERE file_checksum = %s
ORDER BY chunk_index
""", (checksum,))
chunk_hashes = [row[0] for row in cursor.fetchall()] chunk_hashes = [row[0] for row in cursor.fetchall()]
cursor.close() cursor.close()
return chunk_hashes return chunk_hashes
def get_duplicates(self) -> Dict[str, list[str]]: def get_duplicates(self) -> Dict[str, list[str]]:
"""Get all duplicate file groups
Returns:
Dictionary mapping canonical path to list of duplicate paths
"""
self._ensure_tables() self._ensure_tables()
conn = self._get_connection() conn = self._get_connection()
cursor = conn.cursor() cursor = conn.cursor()
cursor.execute('\n SELECT f.path, f.checksum\n FROM files f\n WHERE f.checksum IS NOT NULL\n ')
# Get all files with their hashes
cursor.execute("""
SELECT f.path, f.checksum
FROM files f
WHERE f.checksum IS NOT NULL
""")
# Group by checksum
hash_to_paths: Dict[str, list[str]] = {} hash_to_paths: Dict[str, list[str]] = {}
for path, checksum in cursor.fetchall(): for path, checksum in cursor.fetchall():
if checksum not in hash_to_paths: if checksum not in hash_to_paths:
hash_to_paths[checksum] = [] hash_to_paths[checksum] = []
hash_to_paths[checksum].append(path) hash_to_paths[checksum].append(path)
cursor.close() cursor.close()
duplicates = {paths[0]: paths[1:] for checksum, paths in hash_to_paths.items() if len(paths) > 1}
# Filter to only duplicates (more than one file)
duplicates = {
paths[0]: paths[1:]
for checksum, paths in hash_to_paths.items()
if len(paths) > 1
}
return duplicates return duplicates
def get_stats(self) -> Dict[str, int]: def get_stats(self) -> Dict[str, int]:
"""Get hash store statistics
Returns:
Dictionary with statistics
"""
self._ensure_tables() self._ensure_tables()
conn = self._get_connection() conn = self._get_connection()
cursor = conn.cursor() cursor = conn.cursor()
stats = {} stats = {}
cursor.execute('SELECT COUNT(*) FROM file_hashes')
# Count unique file hashes
cursor.execute("SELECT COUNT(*) FROM file_hashes")
stats['unique_files'] = cursor.fetchone()[0] stats['unique_files'] = cursor.fetchone()[0]
cursor.execute('SELECT COUNT(*) FROM chunk_hashes')
# Count unique chunk hashes
cursor.execute("SELECT COUNT(*) FROM chunk_hashes")
stats['unique_chunks'] = cursor.fetchone()[0] stats['unique_chunks'] = cursor.fetchone()[0]
cursor.execute('SELECT COALESCE(SUM(ref_count), 0) FROM file_hashes')
# Count total references
cursor.execute("SELECT COALESCE(SUM(ref_count), 0) FROM file_hashes")
stats['total_file_refs'] = cursor.fetchone()[0] stats['total_file_refs'] = cursor.fetchone()[0]
cursor.execute('SELECT COALESCE(SUM(ref_count), 0) FROM chunk_hashes')
# Count total chunk references
cursor.execute("SELECT COALESCE(SUM(ref_count), 0) FROM chunk_hashes")
stats['total_chunk_refs'] = cursor.fetchone()[0] stats['total_chunk_refs'] = cursor.fetchone()[0]
# Calculate deduplication ratio
if stats['total_file_refs'] > 0: if stats['total_file_refs'] > 0:
stats['dedup_ratio'] = stats['unique_files'] / stats['total_file_refs'] stats['dedup_ratio'] = stats['unique_files'] / stats['total_file_refs']
else: else:
stats['dedup_ratio'] = 1.0 stats['dedup_ratio'] = 1.0
cursor.close() cursor.close()
return stats return stats
def find_similar_files(self, checksum: str, threshold: float = 0.8) -> list[tuple[str, float]]: def find_similar_files(self, checksum: str, threshold: float=0.8) -> list[tuple[str, float]]:
"""Find files similar to given hash based on chunk overlap
Args:
checksum: File hash to compare
threshold: Similarity threshold (0.0 to 1.0)
Returns:
List of tuples (other_checksum, similarity_score)
"""
self._ensure_tables() self._ensure_tables()
conn = self._get_connection() conn = self._get_connection()
cursor = conn.cursor() cursor = conn.cursor()
# Get chunks for the target file
target_chunks = set(self.get_chunk_hashes(checksum)) target_chunks = set(self.get_chunk_hashes(checksum))
if not target_chunks: if not target_chunks:
cursor.close() cursor.close()
return [] return []
cursor.execute('\n SELECT DISTINCT fc.file_checksum\n FROM file_chunks fc\n WHERE fc.chunk_hash = ANY(%s)\n AND fc.file_checksum != %s\n ', (list(target_chunks), checksum))
# Find files sharing chunks
cursor.execute("""
SELECT DISTINCT fc.file_checksum
FROM file_chunks fc
WHERE fc.chunk_hash = ANY(%s)
AND fc.file_checksum != %s
""", (list(target_chunks), checksum))
similar_files = [] similar_files = []
for other_checksum, in cursor.fetchall():
for (other_checksum,) in cursor.fetchall():
other_chunks = set(self.get_chunk_hashes(other_checksum)) other_chunks = set(self.get_chunk_hashes(other_checksum))
# Calculate Jaccard similarity
intersection = len(target_chunks & other_chunks) intersection = len(target_chunks & other_chunks)
union = len(target_chunks | other_chunks) union = len(target_chunks | other_chunks)
if union > 0: if union > 0:
similarity = intersection / union similarity = intersection / union
if similarity >= threshold: if similarity >= threshold:
similar_files.append((other_checksum, similarity)) similar_files.append((other_checksum, similarity))
cursor.close() cursor.close()
# Sort by similarity descending
similar_files.sort(key=lambda x: x[1], reverse=True) similar_files.sort(key=lambda x: x[1], reverse=True)
return similar_files return similar_files
def close(self): def close(self):
"""Close database connection""" if self._connection and (not self._connection.closed):
if self._connection and not self._connection.closed:
self._connection.close() self._connection.close()
def __enter__(self): def __enter__(self):
"""Context manager entry"""
self._ensure_tables() self._ensure_tables()
return self return self
def __exit__(self, exc_type, exc_val, exc_tb): def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit"""
self.close() self.close()
class MemoryHashStore: class MemoryHashStore:
"""In-memory hash store for testing and small datasets"""
def __init__(self): def __init__(self):
"""Initialize in-memory hash store"""
self.hashes: Dict[str, tuple[str, int]] = {} self.hashes: Dict[str, tuple[str, int]] = {}
self.chunks: Dict[str, int] = {} self.chunks: Dict[str, int] = {}
self.file_chunks: Dict[str, list[str]] = {} self.file_chunks: Dict[str, list[str]] = {}
def exists(self, checksum: str) -> bool: def exists(self, checksum: str) -> bool:
"""Check if hash exists"""
return checksum in self.hashes return checksum in self.hashes
def get_canonical(self, checksum: str) -> Optional[str]: def get_canonical(self, checksum: str) -> Optional[str]:
"""Get canonical path"""
return self.hashes.get(checksum, (None, 0))[0] return self.hashes.get(checksum, (None, 0))[0]
def store_canonical( def store_canonical(self, checksum: str, path: Path, size: int, chunk_hashes: Optional[list[str]]=None) -> None:
self,
checksum: str,
path: Path,
size: int,
chunk_hashes: Optional[list[str]] = None
) -> None:
"""Store canonical reference"""
self.hashes[checksum] = (str(path), size) self.hashes[checksum] = (str(path), size)
if chunk_hashes: if chunk_hashes:
self.file_chunks[checksum] = chunk_hashes self.file_chunks[checksum] = chunk_hashes
for chunk_hash in chunk_hashes: for chunk_hash in chunk_hashes:
self.chunks[chunk_hash] = self.chunks.get(chunk_hash, 0) + 1 self.chunks[chunk_hash] = self.chunks.get(chunk_hash, 0) + 1
def get_chunk_hashes(self, checksum: str) -> list[str]: def get_chunk_hashes(self, checksum: str) -> list[str]:
"""Get chunk hashes"""
return self.file_chunks.get(checksum, []) return self.file_chunks.get(checksum, [])
def get_stats(self) -> Dict[str, int]: def get_stats(self) -> Dict[str, int]:
"""Get statistics""" return {'unique_files': len(self.hashes), 'unique_chunks': len(self.chunks), 'total_file_refs': len(self.hashes), 'total_chunk_refs': sum(self.chunks.values()), 'dedup_ratio': 1.0}
return {
'unique_files': len(self.hashes),
'unique_chunks': len(self.chunks),
'total_file_refs': len(self.hashes),
'total_chunk_refs': sum(self.chunks.values()),
'dedup_ratio': 1.0
}
def close(self): def close(self):
"""No-op for compatibility"""
pass pass
def __enter__(self): def __enter__(self):
"""Context manager entry"""
return self return self
def __exit__(self, exc_type, exc_val, exc_tb): def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit"""
pass pass
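
find_similar_files scores candidate files by Jaccard similarity over their chunk-hash sets. A small worked example with placeholder chunk hashes (not real digests):

target_chunks = {'c1', 'c2', 'c3', 'c4'}
candidates = {
    'file_b': {'c1', 'c2', 'c3', 'c9'},  # shares 3 of 5 distinct chunks
    'file_c': {'c7', 'c8'},              # shares nothing
}

threshold = 0.5
similar = []
for name, chunks in candidates.items():
    intersection = len(target_chunks & chunks)
    union = len(target_chunks | chunks)
    score = intersection / union if union else 0.0
    if score >= threshold:
        similar.append((name, score))

similar.sort(key=lambda item: item[1], reverse=True)
print(similar)  # [('file_b', 0.6)]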

View File

@@ -1,27 +1,5 @@
-"""Migration package exports"""
-from .copy import (
-    CopyMigrationStrategy,
-    FastCopyStrategy,
-    SafeCopyStrategy,
-    ReferenceCopyStrategy
-)
-from .hardlink import (
-    HardlinkMigrationStrategy,
-    SymlinkMigrationStrategy,
-    DedupHardlinkStrategy
-)
+from .copy import CopyMigrationStrategy, FastCopyStrategy, SafeCopyStrategy, ReferenceCopyStrategy
+from .hardlink import HardlinkMigrationStrategy, SymlinkMigrationStrategy, DedupHardlinkStrategy
 from .engine import MigrationEngine
 from ._protocols import IMigrationStrategy, IMigrationEngine
-__all__ = [
-    'CopyMigrationStrategy',
-    'FastCopyStrategy',
-    'SafeCopyStrategy',
-    'ReferenceCopyStrategy',
-    'HardlinkMigrationStrategy',
-    'SymlinkMigrationStrategy',
-    'DedupHardlinkStrategy',
-    'MigrationEngine',
-    'IMigrationStrategy',
-    'IMigrationEngine',
-]
+__all__ = ['CopyMigrationStrategy', 'FastCopyStrategy', 'SafeCopyStrategy', 'ReferenceCopyStrategy', 'HardlinkMigrationStrategy', 'SymlinkMigrationStrategy', 'DedupHardlinkStrategy', 'MigrationEngine', 'IMigrationStrategy', 'IMigrationEngine']

View File

@@ -1,107 +1,28 @@
-"""Protocol definitions for the migration package"""
 from typing import Protocol
 from pathlib import Path
 from ..shared.models import OperationRecord
 class IMigrationStrategy(Protocol):
-    """Protocol for migration strategies"""
-    def migrate(
-        self,
-        source: Path,
-        destination: Path,
-        verify: bool = True
-    ) -> bool:
-        """Migrate a file from source to destination
-        Args:
-            source: Source file path
-            destination: Destination file path
-            verify: Whether to verify the operation
-        Returns:
-            True if migration successful
-        """
+    def migrate(self, source: Path, destination: Path, verify: bool=True) -> bool:
         ...
     def can_migrate(self, source: Path, destination: Path) -> bool:
-        """Check if migration is possible
-        Args:
-            source: Source file path
-            destination: Destination file path
-        Returns:
-            True if migration is possible
-        """
         ...
     def estimate_time(self, source: Path) -> float:
-        """Estimate migration time in seconds
-        Args:
-            source: Source file path
-        Returns:
-            Estimated time in seconds
-        """
         ...
     def cleanup(self, source: Path) -> bool:
-        """Cleanup source file after successful migration
-        Args:
-            source: Source file path
-        Returns:
-            True if cleanup successful
-        """
         ...
 class IMigrationEngine(Protocol):
-    """Protocol for migration engine"""
-    def plan_migration(
-        self,
-        disk: str,
-        target_base: Path
-    ) -> list[OperationRecord]:
-        """Plan migration for a disk
-        Args:
-            disk: Disk identifier
-            target_base: Target base directory
-        Returns:
-            List of planned operations
-        """
+    def plan_migration(self, disk: str, target_base: Path) -> list[OperationRecord]:
         ...
-    def execute_migration(
-        self,
-        operations: list[OperationRecord],
-        dry_run: bool = False
-    ) -> dict:
-        """Execute migration operations
-        Args:
-            operations: List of operations to execute
-            dry_run: Whether to perform a dry run
-        Returns:
-            Dictionary with execution statistics
-        """
+    def execute_migration(self, operations: list[OperationRecord], dry_run: bool=False) -> dict:
         ...
     def rollback(self, operation: OperationRecord) -> bool:
-        """Rollback a migration operation
-        Args:
-            operation: Operation to rollback
-        Returns:
-            True if rollback successful
-        """
         ...
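
Because IMigrationStrategy is a typing.Protocol, any class whose method shapes match satisfies it structurally; no inheritance is required. A sketch with a hypothetical NullMigrationStrategy (the migration._protocols import path is an assumption):

from pathlib import Path

from migration._protocols import IMigrationStrategy  # hypothetical import path


class NullMigrationStrategy:
    # Made-up example strategy: does nothing, useful as a dry-run stand-in.
    def migrate(self, source: Path, destination: Path, verify: bool = True) -> bool:
        return True

    def can_migrate(self, source: Path, destination: Path) -> bool:
        return source.exists()

    def estimate_time(self, source: Path) -> float:
        return 0.0

    def cleanup(self, source: Path) -> bool:
        return True


def run(strategy: IMigrationStrategy, source: Path, destination: Path) -> bool:
    # A type checker accepts NullMigrationStrategy here purely because its
    # method signatures match the Protocol above.
    return strategy.can_migrate(source, destination) and strategy.migrate(source, destination)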

View File

@@ -1,268 +1,129 @@
"""Copy-based migration strategy"""
import shutil import shutil
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
import os import os
from ..shared.logger import ProgressLogger from ..shared.logger import ProgressLogger
class CopyMigrationStrategy: class CopyMigrationStrategy:
"""Copy files to destination with verification"""
def __init__( def __init__(self, logger: Optional[ProgressLogger]=None, preserve_metadata: bool=True, verify_checksums: bool=True):
self,
logger: Optional[ProgressLogger] = None,
preserve_metadata: bool = True,
verify_checksums: bool = True
):
"""Initialize copy migration strategy
Args:
logger: Optional progress logger
preserve_metadata: Whether to preserve file metadata
verify_checksums: Whether to verify checksums after copy
"""
self.logger = logger self.logger = logger
self.preserve_metadata = preserve_metadata self.preserve_metadata = preserve_metadata
self.verify_checksums = verify_checksums self.verify_checksums = verify_checksums
def migrate( def migrate(self, source: Path, destination: Path, verify: bool=True) -> bool:
self,
source: Path,
destination: Path,
verify: bool = True
) -> bool:
"""Migrate file by copying
Args:
source: Source file path
destination: Destination file path
verify: Whether to verify the operation
Returns:
True if migration successful
"""
if not source.exists(): if not source.exists():
if self.logger: if self.logger:
self.logger.error(f"Source file does not exist: {source}") self.logger.error(f'Source file does not exist: {source}')
return False return False
# Create destination directory
destination.parent.mkdir(parents=True, exist_ok=True) destination.parent.mkdir(parents=True, exist_ok=True)
try: try:
# Copy file
if self.preserve_metadata: if self.preserve_metadata:
shutil.copy2(source, destination) shutil.copy2(source, destination)
else: else:
shutil.copy(source, destination) shutil.copy(source, destination)
# Verify if requested
if verify and self.verify_checksums: if verify and self.verify_checksums:
if not self._verify_copy(source, destination): if not self._verify_copy(source, destination):
if self.logger: if self.logger:
self.logger.error(f"Verification failed: {source} -> {destination}") self.logger.error(f'Verification failed: {source} -> {destination}')
destination.unlink() destination.unlink()
return False return False
return True return True
except Exception as e: except Exception as e:
if self.logger: if self.logger:
self.logger.error(f"Copy failed: {source} -> {destination}: {e}") self.logger.error(f'Copy failed: {source} -> {destination}: {e}')
return False return False
def _verify_copy(self, source: Path, destination: Path) -> bool: def _verify_copy(self, source: Path, destination: Path) -> bool:
"""Verify copied file
Args:
source: Source file path
destination: Destination file path
Returns:
True if verification successful
"""
# Check size
source_size = source.stat().st_size source_size = source.stat().st_size
dest_size = destination.stat().st_size dest_size = destination.stat().st_size
if source_size != dest_size: if source_size != dest_size:
return False return False
# Compare checksums for files larger than 1MB
if source_size > 1024 * 1024: if source_size > 1024 * 1024:
from ..deduplication.chunker import hash_file from ..deduplication.chunker import hash_file
source_hash = hash_file(source) source_hash = hash_file(source)
dest_hash = hash_file(destination) dest_hash = hash_file(destination)
return source_hash == dest_hash return source_hash == dest_hash
# For small files, compare content directly
with open(source, 'rb') as f1, open(destination, 'rb') as f2: with open(source, 'rb') as f1, open(destination, 'rb') as f2:
return f1.read() == f2.read() return f1.read() == f2.read()
def can_migrate(self, source: Path, destination: Path) -> bool: def can_migrate(self, source: Path, destination: Path) -> bool:
"""Check if migration is possible
Args:
source: Source file path
destination: Destination file path
Returns:
True if migration is possible
"""
if not source.exists(): if not source.exists():
return False return False
# Check if destination directory is writable
dest_dir = destination.parent dest_dir = destination.parent
if dest_dir.exists(): if dest_dir.exists():
return os.access(dest_dir, os.W_OK) return os.access(dest_dir, os.W_OK)
# Check if parent directory exists and is writable
parent = dest_dir.parent parent = dest_dir.parent
while not parent.exists() and parent != parent.parent: while not parent.exists() and parent != parent.parent:
parent = parent.parent parent = parent.parent
return parent.exists() and os.access(parent, os.W_OK) return parent.exists() and os.access(parent, os.W_OK)
def estimate_time(self, source: Path) -> float: def estimate_time(self, source: Path) -> float:
"""Estimate migration time in seconds
Args:
source: Source file path
Returns:
Estimated time in seconds
"""
if not source.exists(): if not source.exists():
return 0.0 return 0.0
size = source.stat().st_size size = source.stat().st_size
typical_speed = 100 * 1024 * 1024
# Estimate based on typical copy speed (100 MB/s)
typical_speed = 100 * 1024 * 1024 # bytes per second
return size / typical_speed return size / typical_speed
def cleanup(self, source: Path) -> bool: def cleanup(self, source: Path) -> bool:
"""Cleanup source file after successful migration
Args:
source: Source file path
Returns:
True if cleanup successful
"""
try: try:
if source.exists(): if source.exists():
source.unlink() source.unlink()
return True return True
except Exception as e: except Exception as e:
if self.logger: if self.logger:
self.logger.warning(f"Failed to cleanup {source}: {e}") self.logger.warning(f'Failed to cleanup {source}: {e}')
return False return False
class FastCopyStrategy(CopyMigrationStrategy): class FastCopyStrategy(CopyMigrationStrategy):
"""Fast copy strategy without verification"""
def __init__(self, logger: Optional[ProgressLogger] = None):
"""Initialize fast copy strategy"""
super().__init__(
logger=logger,
preserve_metadata=True,
verify_checksums=False
)
def __init__(self, logger: Optional[ProgressLogger]=None):
super().__init__(logger=logger, preserve_metadata=True, verify_checksums=False)
class SafeCopyStrategy(CopyMigrationStrategy): class SafeCopyStrategy(CopyMigrationStrategy):
"""Safe copy strategy with full verification"""
def __init__(self, logger: Optional[ProgressLogger] = None):
"""Initialize safe copy strategy"""
super().__init__(
logger=logger,
preserve_metadata=True,
verify_checksums=True
)
def __init__(self, logger: Optional[ProgressLogger]=None):
super().__init__(logger=logger, preserve_metadata=True, verify_checksums=True)
class ReferenceCopyStrategy: class ReferenceCopyStrategy:
"""Create reference copy using reflinks (CoW) if supported"""
def __init__(self, logger: Optional[ProgressLogger] = None): def __init__(self, logger: Optional[ProgressLogger]=None):
"""Initialize reflink copy strategy"""
self.logger = logger self.logger = logger
def migrate( def migrate(self, source: Path, destination: Path, verify: bool=True) -> bool:
self,
source: Path,
destination: Path,
verify: bool = True
) -> bool:
"""Migrate using reflink (copy-on-write)
Args:
source: Source file path
destination: Destination file path
verify: Whether to verify the operation
Returns:
True if migration successful
"""
if not source.exists(): if not source.exists():
if self.logger: if self.logger:
self.logger.error(f"Source file does not exist: {source}") self.logger.error(f'Source file does not exist: {source}')
return False return False
# Create destination directory
destination.parent.mkdir(parents=True, exist_ok=True) destination.parent.mkdir(parents=True, exist_ok=True)
try: try:
# Try reflink copy (works on btrfs, xfs, etc.)
import subprocess import subprocess
result = subprocess.run(['cp', '--reflink=auto', str(source), str(destination)], capture_output=True, check=False)
result = subprocess.run(
['cp', '--reflink=auto', str(source), str(destination)],
capture_output=True,
check=False
)
if result.returncode != 0: if result.returncode != 0:
# Fallback to regular copy
shutil.copy2(source, destination) shutil.copy2(source, destination)
return True return True
except Exception as e: except Exception as e:
if self.logger: if self.logger:
self.logger.error(f"Reflink copy failed: {source} -> {destination}: {e}") self.logger.error(f'Reflink copy failed: {source} -> {destination}: {e}')
return False return False
def can_migrate(self, source: Path, destination: Path) -> bool: def can_migrate(self, source: Path, destination: Path) -> bool:
"""Check if migration is possible"""
if not source.exists(): if not source.exists():
return False return False
dest_dir = destination.parent dest_dir = destination.parent
if dest_dir.exists(): if dest_dir.exists():
return os.access(dest_dir, os.W_OK) return os.access(dest_dir, os.W_OK)
return True return True
def estimate_time(self, source: Path) -> float: def estimate_time(self, source: Path) -> float:
"""Estimate migration time (reflinks are fast)""" return 0.1
return 0.1 # Reflinks are nearly instant
def cleanup(self, source: Path) -> bool: def cleanup(self, source: Path) -> bool:
"""Cleanup source file"""
try: try:
if source.exists(): if source.exists():
source.unlink() source.unlink()
return True return True
except Exception as e: except Exception as e:
if self.logger: if self.logger:
self.logger.warning(f"Failed to cleanup {source}: {e}") self.logger.warning(f'Failed to cleanup {source}: {e}')
return False return False

View File

@@ -1,254 +1,100 @@
"""Migration engine"""
from pathlib import Path from pathlib import Path
from typing import Optional, Callable from typing import Optional, Callable
from datetime import datetime from datetime import datetime
import psycopg2 import psycopg2
from psycopg2.extras import execute_batch from psycopg2.extras import execute_batch
from .copy import CopyMigrationStrategy, SafeCopyStrategy from .copy import CopyMigrationStrategy, SafeCopyStrategy
from .hardlink import HardlinkMigrationStrategy, SymlinkMigrationStrategy from .hardlink import HardlinkMigrationStrategy, SymlinkMigrationStrategy
from ..shared.models import OperationRecord, ProcessingStats, MigrationPlan from ..shared.models import OperationRecord, ProcessingStats, MigrationPlan
from ..shared.config import DatabaseConfig, ProcessingConfig from ..shared.config import DatabaseConfig, ProcessingConfig
from ..shared.logger import ProgressLogger from ..shared.logger import ProgressLogger
class MigrationEngine: class MigrationEngine:
"""Engine for migrating files"""
def __init__( def __init__(self, db_config: DatabaseConfig, processing_config: ProcessingConfig, logger: ProgressLogger, target_base: Path):
self,
db_config: DatabaseConfig,
processing_config: ProcessingConfig,
logger: ProgressLogger,
target_base: Path
):
"""Initialize migration engine
Args:
db_config: Database configuration
processing_config: Processing configuration
logger: Progress logger
target_base: Target base directory for migrations
"""
self.db_config = db_config self.db_config = db_config
self.processing_config = processing_config self.processing_config = processing_config
self.logger = logger self.logger = logger
self.target_base = Path(target_base) self.target_base = Path(target_base)
self._connection = None self._connection = None
# Initialize strategies
self.copy_strategy = SafeCopyStrategy(logger=logger) self.copy_strategy = SafeCopyStrategy(logger=logger)
self.hardlink_strategy = HardlinkMigrationStrategy(logger=logger) self.hardlink_strategy = HardlinkMigrationStrategy(logger=logger)
self.symlink_strategy = SymlinkMigrationStrategy(logger=logger) self.symlink_strategy = SymlinkMigrationStrategy(logger=logger)
def _get_connection(self): def _get_connection(self):
"""Get or create database connection"""
if self._connection is None or self._connection.closed: if self._connection is None or self._connection.closed:
self._connection = psycopg2.connect( self._connection = psycopg2.connect(host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password)
host=self.db_config.host,
port=self.db_config.port,
database=self.db_config.database,
user=self.db_config.user,
password=self.db_config.password
)
return self._connection return self._connection
def _ensure_tables(self): def _ensure_tables(self):
"""Ensure migration tables exist"""
conn = self._get_connection() conn = self._get_connection()
cursor = conn.cursor() cursor = conn.cursor()
cursor.execute("\n CREATE TABLE IF NOT EXISTS operations (\n id SERIAL PRIMARY KEY,\n source_path TEXT NOT NULL,\n target_path TEXT NOT NULL,\n operation_type TEXT NOT NULL,\n size BIGINT DEFAULT 0,\n status TEXT DEFAULT 'pending',\n error TEXT,\n executed_at TIMESTAMP,\n verified BOOLEAN DEFAULT FALSE,\n created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n )\n ")
# Create operations table cursor.execute('\n CREATE INDEX IF NOT EXISTS idx_operations_status\n ON operations(status)\n ')
cursor.execute("""
CREATE TABLE IF NOT EXISTS operations (
id SERIAL PRIMARY KEY,
source_path TEXT NOT NULL,
target_path TEXT NOT NULL,
operation_type TEXT NOT NULL,
size BIGINT DEFAULT 0,
status TEXT DEFAULT 'pending',
error TEXT,
executed_at TIMESTAMP,
verified BOOLEAN DEFAULT FALSE,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
# Create index on status
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_operations_status
ON operations(status)
""")
conn.commit() conn.commit()
cursor.close() cursor.close()
def plan_migration( def plan_migration(self, disk: Optional[str]=None, category: Optional[str]=None) -> MigrationPlan:
self, self.logger.section('Planning Migration')
disk: Optional[str] = None,
category: Optional[str] = None
) -> MigrationPlan:
"""Plan migration for files
Args:
disk: Optional disk filter
category: Optional category filter
Returns:
MigrationPlan with planned operations
"""
self.logger.section("Planning Migration")
conn = self._get_connection() conn = self._get_connection()
cursor = conn.cursor() cursor = conn.cursor()
conditions = ['category IS NOT NULL']
# Build query
conditions = ["category IS NOT NULL"]
params = [] params = []
if disk: if disk:
conditions.append("disk_label = %s") conditions.append('disk_label = %s')
params.append(disk) params.append(disk)
if category: if category:
conditions.append("category = %s") conditions.append('category = %s')
params.append(category) params.append(category)
query = f"\n SELECT path, size, category, duplicate_of\n FROM files\n WHERE {' AND '.join(conditions)}\n ORDER BY category, path\n "
query = f"""
SELECT path, size, category, duplicate_of
FROM files
WHERE {' AND '.join(conditions)}
ORDER BY category, path
"""
cursor.execute(query, params) cursor.execute(query, params)
files = cursor.fetchall() files = cursor.fetchall()
self.logger.info(f'Found {len(files)} files to migrate')
self.logger.info(f"Found {len(files)} files to migrate")
operations = [] operations = []
total_size = 0 total_size = 0
for path_str, size, file_category, duplicate_of in files: for path_str, size, file_category, duplicate_of in files:
source = Path(path_str) source = Path(path_str)
# Determine destination
target_path = self.target_base / file_category / source.name target_path = self.target_base / file_category / source.name
# Determine operation type
if duplicate_of: if duplicate_of:
# Use hardlink for duplicates
operation_type = 'hardlink' operation_type = 'hardlink'
else: else:
# Use copy for unique files
operation_type = 'copy' operation_type = 'copy'
operation = OperationRecord(source_path=source, target_path=target_path, operation_type=operation_type, size=size)
operation = OperationRecord(
source_path=source,
target_path=target_path,
operation_type=operation_type,
size=size
)
operations.append(operation) operations.append(operation)
total_size += size total_size += size
cursor.close() cursor.close()
plan = MigrationPlan(target_disk=str(self.target_base), destination_disks=[str(self.target_base)], operations=operations, total_size=total_size, file_count=len(operations))
plan = MigrationPlan( self.logger.info(f'Migration plan created: {plan.file_count} files, {plan.total_size:,} bytes')
target_disk=str(self.target_base),
destination_disks=[str(self.target_base)],
operations=operations,
total_size=total_size,
file_count=len(operations)
)
self.logger.info(
f"Migration plan created: {plan.file_count} files, "
f"{plan.total_size:,} bytes"
)
return plan return plan
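For reference, a standalone sketch of the filter handling in plan_migration above (assuming the same files table columns); it reproduces only the query-building step, not the database access:

# Sketch of the dynamic WHERE clause used by plan_migration: optional disk and
# category filters become extra conditions with matching parameters.
def build_plan_query(disk=None, category=None):
    conditions = ["category IS NOT NULL"]
    params = []
    if disk:
        conditions.append("disk_label = %s")
        params.append(disk)
    if category:
        conditions.append("category = %s")
        params.append(category)
    query = (
        "SELECT path, size, category, duplicate_of FROM files "
        f"WHERE {' AND '.join(conditions)} ORDER BY category, path"
    )
    return query, params

print(build_plan_query(disk="disk1", category="photos"))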
def execute_migration( def execute_migration(self, operations: list[OperationRecord], dry_run: bool=False, progress_callback: Optional[Callable[[int, int, ProcessingStats], None]]=None) -> ProcessingStats:
self, self.logger.section('Executing Migration' + (' (DRY RUN)' if dry_run else ''))
operations: list[OperationRecord],
dry_run: bool = False,
progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
) -> ProcessingStats:
"""Execute migration operations
Args:
operations: List of operations to execute
dry_run: Whether to perform a dry run
progress_callback: Optional callback for progress updates
Returns:
ProcessingStats with execution statistics
"""
self.logger.section("Executing Migration" + (" (DRY RUN)" if dry_run else ""))
self._ensure_tables() self._ensure_tables()
stats = ProcessingStats() stats = ProcessingStats()
total_ops = len(operations) total_ops = len(operations)
for operation in operations: for operation in operations:
stats.files_processed += 1 stats.files_processed += 1
if dry_run: if dry_run:
# In dry run, just log what would happen self.logger.debug(f'[DRY RUN] Would {operation.operation_type}: {operation.source_path} -> {operation.target_path}')
self.logger.debug(
f"[DRY RUN] Would {operation.operation_type}: "
f"{operation.source_path} -> {operation.target_path}"
)
stats.files_succeeded += 1 stats.files_succeeded += 1
else: else:
# Execute actual migration
success = self._execute_operation(operation) success = self._execute_operation(operation)
if success: if success:
stats.files_succeeded += 1 stats.files_succeeded += 1
stats.bytes_processed += operation.size stats.bytes_processed += operation.size
else: else:
stats.files_failed += 1 stats.files_failed += 1
# Progress callback
if progress_callback and stats.files_processed % 100 == 0: if progress_callback and stats.files_processed % 100 == 0:
progress_callback(stats.files_processed, total_ops, stats) progress_callback(stats.files_processed, total_ops, stats)
# Log progress
if stats.files_processed % 1000 == 0: if stats.files_processed % 1000 == 0:
self.logger.progress( self.logger.progress(stats.files_processed, total_ops, prefix='Operations executed', bytes_processed=stats.bytes_processed, elapsed_seconds=stats.elapsed_seconds)
stats.files_processed, self.logger.info(f"Migration {('dry run' if dry_run else 'execution')} complete: {stats.files_succeeded}/{total_ops} operations, {stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s")
total_ops,
prefix="Operations executed",
bytes_processed=stats.bytes_processed,
elapsed_seconds=stats.elapsed_seconds
)
self.logger.info(
f"Migration {'dry run' if dry_run else 'execution'} complete: "
f"{stats.files_succeeded}/{total_ops} operations, "
f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
)
return stats return stats
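A hedged sketch of the dry-run path and the progress callback shape (processed count, total, stats), which execute_migration invokes every 100 files; the engine instance is hypothetical and shown only in comments:

# Assumed wiring: dry-run a plan and print coarse progress.
def report(processed: int, total: int, stats) -> None:
    pct = 100.0 * processed / total if total else 100.0
    print(f"{processed}/{total} ({pct:.1f}%) - {stats.files_failed} failed so far")

# plan = engine.plan_migration(category="photos")          # hypothetical engine instance
# stats = engine.execute_migration(plan.operations, dry_run=True, progress_callback=report)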
def _execute_operation(self, operation: OperationRecord) -> bool: def _execute_operation(self, operation: OperationRecord) -> bool:
"""Execute a single migration operation
Args:
operation: Operation to execute
Returns:
True if successful
"""
operation.status = 'in_progress' operation.status = 'in_progress'
operation.executed_at = datetime.now() operation.executed_at = datetime.now()
try: try:
# Select strategy based on operation type
if operation.operation_type == 'copy': if operation.operation_type == 'copy':
strategy = self.copy_strategy strategy = self.copy_strategy
elif operation.operation_type == 'hardlink': elif operation.operation_type == 'hardlink':
@@ -256,15 +102,8 @@ class MigrationEngine:
elif operation.operation_type == 'symlink': elif operation.operation_type == 'symlink':
strategy = self.symlink_strategy strategy = self.symlink_strategy
else: else:
raise ValueError(f"Unknown operation type: {operation.operation_type}") raise ValueError(f'Unknown operation type: {operation.operation_type}')
success = strategy.migrate(operation.source_path, operation.target_path, verify=self.processing_config.verify_operations)
# Execute migration
success = strategy.migrate(
operation.source_path,
operation.target_path,
verify=self.processing_config.verify_operations
)
if success: if success:
operation.status = 'completed' operation.status = 'completed'
operation.verified = True operation.verified = True
@@ -272,183 +111,85 @@ class MigrationEngine:
return True return True
else: else:
operation.status = 'failed' operation.status = 'failed'
operation.error = "Migration failed" operation.error = 'Migration failed'
self._record_operation(operation) self._record_operation(operation)
return False return False
except Exception as e: except Exception as e:
operation.status = 'failed' operation.status = 'failed'
operation.error = str(e) operation.error = str(e)
self._record_operation(operation) self._record_operation(operation)
self.logger.error(f"Operation failed: {operation.source_path}: {e}") self.logger.error(f'Operation failed: {operation.source_path}: {e}')
return False return False
def _record_operation(self, operation: OperationRecord): def _record_operation(self, operation: OperationRecord):
"""Record operation in database
Args:
operation: Operation to record
"""
conn = self._get_connection() conn = self._get_connection()
cursor = conn.cursor() cursor = conn.cursor()
cursor.execute('\n INSERT INTO operations (\n source_path, target_path, operation_type, size,\n status, error, executed_at, verified\n )\n VALUES (%s, %s, %s, %s, %s, %s, %s, %s)\n ', (str(operation.source_path), str(operation.target_path), operation.operation_type, operation.size, operation.status, operation.error, operation.executed_at, operation.verified))
cursor.execute("""
INSERT INTO operations (
source_path, target_path, operation_type, size,
status, error, executed_at, verified
)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
""", (
str(operation.source_path),
str(operation.target_path),
operation.operation_type,
operation.size,
operation.status,
operation.error,
operation.executed_at,
operation.verified
))
conn.commit() conn.commit()
cursor.close() cursor.close()
def rollback(self, operation: OperationRecord) -> bool: def rollback(self, operation: OperationRecord) -> bool:
"""Rollback a migration operation self.logger.warning(f'Rolling back: {operation.target_path}')
Args:
operation: Operation to rollback
Returns:
True if rollback successful
"""
self.logger.warning(f"Rolling back: {operation.target_path}")
try: try:
# Remove destination
if operation.target_path.exists(): if operation.target_path.exists():
operation.target_path.unlink() operation.target_path.unlink()
# Update database
conn = self._get_connection() conn = self._get_connection()
cursor = conn.cursor() cursor = conn.cursor()
cursor.execute("\n UPDATE operations\n SET status = 'rolled_back'\n WHERE source_path = %s AND target_path = %s\n ", (str(operation.source_path), str(operation.target_path)))
cursor.execute("""
UPDATE operations
SET status = 'rolled_back'
WHERE source_path = %s AND target_path = %s
""", (str(operation.source_path), str(operation.target_path)))
conn.commit() conn.commit()
cursor.close() cursor.close()
return True return True
except Exception as e: except Exception as e:
self.logger.error(f"Rollback failed: {operation.target_path}: {e}") self.logger.error(f'Rollback failed: {operation.target_path}: {e}')
return False return False
def get_migration_stats(self) -> dict: def get_migration_stats(self) -> dict:
"""Get migration statistics
Returns:
Dictionary with statistics
"""
conn = self._get_connection() conn = self._get_connection()
cursor = conn.cursor() cursor = conn.cursor()
stats = {} stats = {}
cursor.execute('SELECT COUNT(*) FROM operations')
# Total operations
cursor.execute("SELECT COUNT(*) FROM operations")
stats['total_operations'] = cursor.fetchone()[0] stats['total_operations'] = cursor.fetchone()[0]
cursor.execute('\n SELECT status, COUNT(*)\n FROM operations\n GROUP BY status\n ')
# Operations by status
cursor.execute("""
SELECT status, COUNT(*)
FROM operations
GROUP BY status
""")
for status, count in cursor.fetchall(): for status, count in cursor.fetchall():
stats[f'{status}_operations'] = count stats[f'{status}_operations'] = count
cursor.execute("\n SELECT COALESCE(SUM(size), 0)\n FROM operations\n WHERE status = 'completed'\n ")
# Total size migrated
cursor.execute("""
SELECT COALESCE(SUM(size), 0)
FROM operations
WHERE status = 'completed'
""")
stats['total_size_migrated'] = cursor.fetchone()[0] stats['total_size_migrated'] = cursor.fetchone()[0]
cursor.close() cursor.close()
return stats return stats
def verify_migrations(self) -> dict: def verify_migrations(self) -> dict:
"""Verify completed migrations self.logger.subsection('Verifying Migrations')
Returns:
Dictionary with verification results
"""
self.logger.subsection("Verifying Migrations")
conn = self._get_connection() conn = self._get_connection()
cursor = conn.cursor() cursor = conn.cursor()
cursor.execute("\n SELECT source_path, target_path, operation_type\n FROM operations\n WHERE status = 'completed' AND verified = FALSE\n ")
cursor.execute("""
SELECT source_path, target_path, operation_type
FROM operations
WHERE status = 'completed' AND verified = FALSE
""")
operations = cursor.fetchall() operations = cursor.fetchall()
cursor.close() cursor.close()
results = {'total': len(operations), 'verified': 0, 'failed': 0}
results = {
'total': len(operations),
'verified': 0,
'failed': 0
}
for source_str, dest_str, op_type in operations: for source_str, dest_str, op_type in operations:
source = Path(source_str) source = Path(source_str)
dest = Path(dest_str) dest = Path(dest_str)
# Verify destination exists
if not dest.exists(): if not dest.exists():
results['failed'] += 1 results['failed'] += 1
self.logger.warning(f"Verification failed: {dest} does not exist") self.logger.warning(f'Verification failed: {dest} does not exist')
continue continue
# Verify based on operation type
if op_type == 'hardlink': if op_type == 'hardlink':
# Check if hardlinked
if source.exists() and source.stat().st_ino == dest.stat().st_ino: if source.exists() and source.stat().st_ino == dest.stat().st_ino:
results['verified'] += 1 results['verified'] += 1
else: else:
results['failed'] += 1 results['failed'] += 1
elif dest.exists():
results['verified'] += 1
else: else:
# Check if destination exists and has correct size results['failed'] += 1
if dest.exists(): self.logger.info(f"Verification complete: {results['verified']}/{results['total']} verified")
results['verified'] += 1
else:
results['failed'] += 1
self.logger.info(
f"Verification complete: {results['verified']}/{results['total']} verified"
)
return results return results
def close(self): def close(self):
"""Close database connection""" if self._connection and (not self._connection.closed):
if self._connection and not self._connection.closed:
self._connection.close() self._connection.close()
def __enter__(self): def __enter__(self):
"""Context manager entry"""
return self return self
def __exit__(self, exc_type, exc_val, exc_tb): def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit"""
self.close() self.close()
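A possible end-to-end use of MigrationEngine as a context manager; the DatabaseConfig, ProcessingConfig and ProgressLogger constructor arguments below are assumptions, not taken from this diff:

# Hypothetical usage sketch; only the MigrationEngine signature is from the code above.
from pathlib import Path

def run_migration() -> None:
    db_config = DatabaseConfig(host="localhost", port=5432, database="files",
                               user="files", password="secret")      # assumed fields
    processing_config = ProcessingConfig(verify_operations=True)      # assumed field
    logger = ProgressLogger()                                         # assumed constructor

    with MigrationEngine(db_config, processing_config, logger,
                         target_base=Path("/mnt/pool")) as engine:
        plan = engine.plan_migration(disk="disk1")
        stats = engine.execute_migration(plan.operations, dry_run=True)
        logger.info(f"Dry run done: {stats.files_succeeded}/{plan.file_count} ok")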

View File

@@ -1,90 +1,43 @@
"""Hardlink-based migration strategy"""
import os import os
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
from ..shared.logger import ProgressLogger from ..shared.logger import ProgressLogger
class HardlinkMigrationStrategy: class HardlinkMigrationStrategy:
"""Create hardlinks to files instead of copying"""
def __init__(self, logger: Optional[ProgressLogger] = None): def __init__(self, logger: Optional[ProgressLogger]=None):
"""Initialize hardlink migration strategy
Args:
logger: Optional progress logger
"""
self.logger = logger self.logger = logger
def migrate( def migrate(self, source: Path, destination: Path, verify: bool=True) -> bool:
self,
source: Path,
destination: Path,
verify: bool = True
) -> bool:
"""Migrate file by creating hardlink
Args:
source: Source file path
destination: Destination file path
verify: Whether to verify the operation
Returns:
True if migration successful
"""
if not source.exists(): if not source.exists():
if self.logger: if self.logger:
self.logger.error(f"Source file does not exist: {source}") self.logger.error(f'Source file does not exist: {source}')
return False return False
# Check if source and destination are on same filesystem
if not self._same_filesystem(source, destination.parent): if not self._same_filesystem(source, destination.parent):
if self.logger: if self.logger:
self.logger.warning( self.logger.warning(f'Cannot hardlink across filesystems: {source} -> {destination}')
f"Cannot hardlink across filesystems: {source} -> {destination}"
)
return False return False
# Create destination directory
destination.parent.mkdir(parents=True, exist_ok=True) destination.parent.mkdir(parents=True, exist_ok=True)
try: try:
# Create hardlink
os.link(source, destination) os.link(source, destination)
# Verify if requested
if verify: if verify:
if not self._verify_hardlink(source, destination): if not self._verify_hardlink(source, destination):
if self.logger: if self.logger:
self.logger.error(f"Verification failed: {source} -> {destination}") self.logger.error(f'Verification failed: {source} -> {destination}')
destination.unlink() destination.unlink()
return False return False
return True return True
except FileExistsError: except FileExistsError:
if self.logger: if self.logger:
self.logger.warning(f"Destination already exists: {destination}") self.logger.warning(f'Destination already exists: {destination}')
return False return False
except Exception as e: except Exception as e:
if self.logger: if self.logger:
self.logger.error(f"Hardlink failed: {source} -> {destination}: {e}") self.logger.error(f'Hardlink failed: {source} -> {destination}: {e}')
return False return False
def _same_filesystem(self, path1: Path, path2: Path) -> bool: def _same_filesystem(self, path1: Path, path2: Path) -> bool:
"""Check if two paths are on the same filesystem
Args:
path1: First path
path2: Second path
Returns:
True if on same filesystem
"""
try: try:
# Get device IDs
stat1 = path1.stat() stat1 = path1.stat()
stat2 = path2.stat() stat2 = path2.stat()
return stat1.st_dev == stat2.st_dev return stat1.st_dev == stat2.st_dev
@@ -92,286 +45,117 @@ class HardlinkMigrationStrategy:
return False return False
def _verify_hardlink(self, source: Path, destination: Path) -> bool: def _verify_hardlink(self, source: Path, destination: Path) -> bool:
"""Verify hardlink
Args:
source: Source file path
destination: Destination file path
Returns:
True if verification successful
"""
try: try:
# Check if they have the same inode
source_stat = source.stat() source_stat = source.stat()
dest_stat = destination.stat() dest_stat = destination.stat()
return source_stat.st_ino == dest_stat.st_ino return source_stat.st_ino == dest_stat.st_ino
except Exception: except Exception:
return False return False
def can_migrate(self, source: Path, destination: Path) -> bool: def can_migrate(self, source: Path, destination: Path) -> bool:
"""Check if migration is possible
Args:
source: Source file path
destination: Destination file path
Returns:
True if migration is possible
"""
if not source.exists(): if not source.exists():
return False return False
# Check if on same filesystem
dest_dir = destination.parent dest_dir = destination.parent
if dest_dir.exists(): if dest_dir.exists():
return self._same_filesystem(source, dest_dir) return self._same_filesystem(source, dest_dir)
# Check parent directories
parent = dest_dir.parent parent = dest_dir.parent
while not parent.exists() and parent != parent.parent: while not parent.exists() and parent != parent.parent:
parent = parent.parent parent = parent.parent
return parent.exists() and self._same_filesystem(source, parent) return parent.exists() and self._same_filesystem(source, parent)
def estimate_time(self, source: Path) -> float: def estimate_time(self, source: Path) -> float:
"""Estimate migration time in seconds return 0.01
Args:
source: Source file path
Returns:
Estimated time in seconds (hardlinks are instant)
"""
return 0.01 # Hardlinks are nearly instant
def cleanup(self, source: Path) -> bool: def cleanup(self, source: Path) -> bool:
"""Cleanup source file after successful migration
Note: For hardlinks, we typically don't remove the source
immediately as both links point to the same inode.
Args:
source: Source file path
Returns:
True (no cleanup needed for hardlinks)
"""
# For hardlinks, we don't remove the source
# Both source and destination point to the same data
return True return True
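A self-contained sketch of the two stat() checks this strategy relies on: matching st_dev for the same-filesystem test, matching st_ino for hardlink verification:

# Demonstrates the same-filesystem and same-inode checks on a throwaway file.
import os
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    src = Path(tmp) / "src.bin"
    dst = Path(tmp) / "dst.bin"
    src.write_bytes(b"payload")

    assert src.stat().st_dev == Path(tmp).stat().st_dev   # same filesystem
    os.link(src, dst)                                      # create the hardlink
    assert src.stat().st_ino == dst.stat().st_ino          # verification check
    print("hardlink verified, link count:", src.stat().st_nlink)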
class SymlinkMigrationStrategy: class SymlinkMigrationStrategy:
"""Create symbolic links to files"""
def __init__( def __init__(self, logger: Optional[ProgressLogger]=None, absolute_links: bool=True):
self,
logger: Optional[ProgressLogger] = None,
absolute_links: bool = True
):
"""Initialize symlink migration strategy
Args:
logger: Optional progress logger
absolute_links: Whether to create absolute symlinks
"""
self.logger = logger self.logger = logger
self.absolute_links = absolute_links self.absolute_links = absolute_links
def migrate( def migrate(self, source: Path, destination: Path, verify: bool=True) -> bool:
self,
source: Path,
destination: Path,
verify: bool = True
) -> bool:
"""Migrate file by creating symlink
Args:
source: Source file path
destination: Destination file path
verify: Whether to verify the operation
Returns:
True if migration successful
"""
if not source.exists(): if not source.exists():
if self.logger: if self.logger:
self.logger.error(f"Source file does not exist: {source}") self.logger.error(f'Source file does not exist: {source}')
return False return False
# Create destination directory
destination.parent.mkdir(parents=True, exist_ok=True) destination.parent.mkdir(parents=True, exist_ok=True)
try: try:
# Determine link target
if self.absolute_links: if self.absolute_links:
target = source.resolve() target = source.resolve()
else: else:
# Create relative symlink
target = os.path.relpath(source, destination.parent) target = os.path.relpath(source, destination.parent)
# Create symlink
destination.symlink_to(target) destination.symlink_to(target)
# Verify if requested
if verify: if verify:
if not self._verify_symlink(destination, source): if not self._verify_symlink(destination, source):
if self.logger: if self.logger:
self.logger.error(f"Verification failed: {source} -> {destination}") self.logger.error(f'Verification failed: {source} -> {destination}')
destination.unlink() destination.unlink()
return False return False
return True return True
except FileExistsError: except FileExistsError:
if self.logger: if self.logger:
self.logger.warning(f"Destination already exists: {destination}") self.logger.warning(f'Destination already exists: {destination}')
return False return False
except Exception as e: except Exception as e:
if self.logger: if self.logger:
self.logger.error(f"Symlink failed: {source} -> {destination}: {e}") self.logger.error(f'Symlink failed: {source} -> {destination}: {e}')
return False return False
def _verify_symlink(self, symlink: Path, expected_target: Path) -> bool: def _verify_symlink(self, symlink: Path, expected_target: Path) -> bool:
"""Verify symlink
Args:
symlink: Symlink path
expected_target: Expected target path
Returns:
True if verification successful
"""
try: try:
# Check if it's a symlink
if not symlink.is_symlink(): if not symlink.is_symlink():
return False return False
# Resolve and compare
resolved = symlink.resolve() resolved = symlink.resolve()
expected = expected_target.resolve() expected = expected_target.resolve()
return resolved == expected return resolved == expected
except Exception: except Exception:
return False return False
def can_migrate(self, source: Path, destination: Path) -> bool: def can_migrate(self, source: Path, destination: Path) -> bool:
"""Check if migration is possible
Args:
source: Source file path
destination: Destination file path
Returns:
True if migration is possible
"""
if not source.exists(): if not source.exists():
return False return False
# Check if destination directory is writable
dest_dir = destination.parent dest_dir = destination.parent
if dest_dir.exists(): if dest_dir.exists():
return os.access(dest_dir, os.W_OK) return os.access(dest_dir, os.W_OK)
return True return True
def estimate_time(self, source: Path) -> float: def estimate_time(self, source: Path) -> float:
"""Estimate migration time in seconds return 0.01
Args:
source: Source file path
Returns:
Estimated time in seconds (symlinks are instant)
"""
return 0.01 # Symlinks are instant
def cleanup(self, source: Path) -> bool: def cleanup(self, source: Path) -> bool:
"""Cleanup source file after successful migration
Note: For symlinks, we don't remove the source as the
symlink points to it.
Args:
source: Source file path
Returns:
True (no cleanup needed for symlinks)
"""
# For symlinks, we don't remove the source
return True return True
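A small sketch contrasting the absolute_links modes: the same source linked once with an absolute target (source.resolve()) and once with a relative one (os.path.relpath), both resolving to the same file:

# Demonstrates absolute vs relative symlink targets on throwaway paths.
import os
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    source = Path(tmp) / "data" / "file.txt"
    source.parent.mkdir(parents=True)
    source.write_text("hello")

    absolute_link = Path(tmp) / "links" / "abs.txt"
    relative_link = Path(tmp) / "links" / "rel.txt"
    absolute_link.parent.mkdir(parents=True)

    absolute_link.symlink_to(source.resolve())                              # absolute target
    relative_link.symlink_to(os.path.relpath(source, relative_link.parent)) # relative target

    assert absolute_link.resolve() == relative_link.resolve() == source.resolve()
    print(os.readlink(absolute_link), "|", os.readlink(relative_link))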
class DedupHardlinkStrategy(HardlinkMigrationStrategy): class DedupHardlinkStrategy(HardlinkMigrationStrategy):
"""Hardlink strategy for deduplication
Creates hardlinks for duplicate files to save space. def __init__(self, logger: Optional[ProgressLogger]=None):
"""
def __init__(self, logger: Optional[ProgressLogger] = None):
"""Initialize dedup hardlink strategy"""
super().__init__(logger=logger) super().__init__(logger=logger)
def deduplicate( def deduplicate(self, canonical: Path, duplicate: Path) -> bool:
self,
canonical: Path,
duplicate: Path
) -> bool:
"""Replace duplicate with hardlink to canonical
Args:
canonical: Canonical file path
duplicate: Duplicate file path
Returns:
True if deduplication successful
"""
if not canonical.exists(): if not canonical.exists():
if self.logger: if self.logger:
self.logger.error(f"Canonical file does not exist: {canonical}") self.logger.error(f'Canonical file does not exist: {canonical}')
return False return False
if not duplicate.exists(): if not duplicate.exists():
if self.logger: if self.logger:
self.logger.error(f"Duplicate file does not exist: {duplicate}") self.logger.error(f'Duplicate file does not exist: {duplicate}')
return False return False
# Check if already hardlinked
if self._verify_hardlink(canonical, duplicate): if self._verify_hardlink(canonical, duplicate):
return True return True
# Check if on same filesystem
if not self._same_filesystem(canonical, duplicate): if not self._same_filesystem(canonical, duplicate):
if self.logger: if self.logger:
self.logger.warning( self.logger.warning(f'Cannot hardlink across filesystems: {canonical} -> {duplicate}')
f"Cannot hardlink across filesystems: {canonical} -> {duplicate}"
)
return False return False
try: try:
# Create temporary backup
backup = duplicate.with_suffix(duplicate.suffix + '.bak') backup = duplicate.with_suffix(duplicate.suffix + '.bak')
duplicate.rename(backup) duplicate.rename(backup)
# Create hardlink
os.link(canonical, duplicate) os.link(canonical, duplicate)
# Remove backup
backup.unlink() backup.unlink()
return True return True
except Exception as e: except Exception as e:
if self.logger: if self.logger:
self.logger.error(f"Deduplication failed: {duplicate}: {e}") self.logger.error(f'Deduplication failed: {duplicate}: {e}')
# Restore from backup
if backup.exists(): if backup.exists():
backup.rename(duplicate) backup.rename(duplicate)
return False return False
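A hypothetical use of DedupHardlinkStrategy showing the backup-rename, hardlink, unlink-backup sequence implemented in deduplicate(); the file names are made up and the class is assumed importable from the module above:

# Replace a byte-identical duplicate with a hardlink to the canonical copy.
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    canonical = Path(tmp) / "keep.bin"
    duplicate = Path(tmp) / "dupe.bin"
    canonical.write_bytes(b"same bytes")
    duplicate.write_bytes(b"same bytes")

    strategy = DedupHardlinkStrategy()
    if strategy.deduplicate(canonical, duplicate):
        assert canonical.stat().st_ino == duplicate.stat().st_ino
        print("deduplicated:", duplicate.stat().st_size, "bytes now shared")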