"""Deduplication engine""" from pathlib import Path from typing import Optional, Callable from concurrent.futures import ThreadPoolExecutor, as_completed import psycopg2 from .chunker import compute_file_signature, hash_file from .store import HashStore from ..shared.models import FileRecord, ProcessingStats from ..shared.config import DatabaseConfig, ProcessingConfig from ..shared.logger import ProgressLogger class DeduplicationEngine: """Engine for deduplicating files""" def __init__( self, db_config: DatabaseConfig, processing_config: ProcessingConfig, logger: ProgressLogger ): """Initialize deduplication engine Args: db_config: Database configuration processing_config: Processing configuration logger: Progress logger """ self.db_config = db_config self.processing_config = processing_config self.logger = logger self.hash_store = HashStore(db_config) self._connection = None def _get_connection(self): """Get or create database connection""" if self._connection is None or self._connection.closed: self._connection = psycopg2.connect( host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password ) return self._connection def deduplicate_all( self, disk: Optional[str] = None, use_chunks: bool = True, progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None ) -> ProcessingStats: """Deduplicate all files in database Args: disk: Optional disk filter use_chunks: Whether to use chunk-level deduplication progress_callback: Optional callback for progress updates Returns: ProcessingStats with deduplication statistics """ self.logger.section("Starting Deduplication") conn = self._get_connection() cursor = conn.cursor() # Get files without checksums if disk: cursor.execute(""" SELECT path, size FROM files WHERE disk_label = %s AND checksum IS NULL ORDER BY size DESC """, (disk,)) else: cursor.execute(""" SELECT path, size FROM files WHERE checksum IS NULL ORDER BY size DESC """) files_to_process = cursor.fetchall() total_files = len(files_to_process) self.logger.info(f"Found {total_files} files to process") stats = ProcessingStats() # Process files with thread pool with ThreadPoolExecutor(max_workers=self.processing_config.parallel_workers) as executor: futures = {} for path_str, size in files_to_process: path = Path(path_str) future = executor.submit(self._process_file, path, use_chunks) futures[future] = (path, size) # Process completed futures for future in as_completed(futures): path, size = futures[future] try: checksum, duplicate_of = future.result() if checksum: # Update database cursor.execute(""" UPDATE files SET checksum = %s, duplicate_of = %s WHERE path = %s """, (checksum, duplicate_of, str(path))) stats.files_succeeded += 1 stats.bytes_processed += size stats.files_processed += 1 # Commit periodically if stats.files_processed % self.processing_config.commit_interval == 0: conn.commit() # Progress callback if progress_callback: progress_callback(stats.files_processed, total_files, stats) # Log progress self.logger.progress( stats.files_processed, total_files, prefix="Files processed", bytes_processed=stats.bytes_processed, elapsed_seconds=stats.elapsed_seconds ) except Exception as e: self.logger.warning(f"Failed to process {path}: {e}") stats.files_failed += 1 stats.files_processed += 1 # Final commit conn.commit() cursor.close() self.logger.info( f"Deduplication complete: {stats.files_succeeded}/{total_files} files, " f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s" 
        )

        return stats

    def _process_file(
        self,
        path: Path,
        use_chunks: bool
    ) -> tuple[Optional[str], Optional[str]]:
        """Process a single file for deduplication

        Args:
            path: Path to file
            use_chunks: Whether to use chunk-level deduplication

        Returns:
            Tuple of (checksum, duplicate_of_path)
        """
        if not path.exists():
            return None, None

        try:
            if use_chunks:
                # Compute file signature with chunks
                checksum, chunk_hashes = compute_file_signature(
                    path,
                    use_rabin=True,
                    avg_chunk_size=self.processing_config.chunk_size
                )
            else:
                # Just compute file hash
                checksum = hash_file(
                    path,
                    algorithm=self.processing_config.hash_algorithm
                )
                chunk_hashes = None

            # Check if hash exists
            if self.hash_store.exists(checksum):
                # Duplicate found
                canonical_path = self.hash_store.get_canonical(checksum)
                return checksum, canonical_path
            else:
                # New unique file
                size = path.stat().st_size
                self.hash_store.store_canonical(
                    checksum, path, size, chunk_hashes
                )
                return checksum, None

        except Exception as e:
            self.logger.debug(f"Error processing {path}: {e}")
            raise

    def find_duplicates(
        self,
        disk: Optional[str] = None
    ) -> dict[str, list[str]]:
        """Find all duplicate files

        Args:
            disk: Optional disk filter

        Returns:
            Dictionary mapping canonical path to list of duplicate paths
        """
        self.logger.subsection("Finding Duplicates")

        conn = self._get_connection()
        cursor = conn.cursor()

        # Query for duplicates
        if disk:
            cursor.execute("""
                SELECT checksum, array_agg(path ORDER BY path) AS paths
                FROM files
                WHERE disk_label = %s AND checksum IS NOT NULL
                GROUP BY checksum
                HAVING COUNT(*) > 1
            """, (disk,))
        else:
            cursor.execute("""
                SELECT checksum, array_agg(path ORDER BY path) AS paths
                FROM files
                WHERE checksum IS NOT NULL
                GROUP BY checksum
                HAVING COUNT(*) > 1
            """)

        duplicates = {}
        for checksum, paths in cursor.fetchall():
            canonical = paths[0]
            duplicates[canonical] = paths[1:]

        cursor.close()

        self.logger.info(f"Found {len(duplicates)} sets of duplicates")
        return duplicates

    def get_deduplication_stats(self) -> dict:
        """Get deduplication statistics

        Returns:
            Dictionary with statistics
        """
        conn = self._get_connection()
        cursor = conn.cursor()

        stats = {}

        # Total files
        cursor.execute("SELECT COUNT(*) FROM files WHERE checksum IS NOT NULL")
        stats['total_files'] = cursor.fetchone()[0]

        # Unique files
        cursor.execute("SELECT COUNT(DISTINCT checksum) FROM files WHERE checksum IS NOT NULL")
        stats['unique_files'] = cursor.fetchone()[0]

        # Duplicate files
        stats['duplicate_files'] = stats['total_files'] - stats['unique_files']

        # Total size
        cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE checksum IS NOT NULL")
        stats['total_size'] = cursor.fetchone()[0]

        # Unique size
        cursor.execute("""
            SELECT COALESCE(SUM(size), 0) FROM (
                SELECT DISTINCT ON (checksum) size
                FROM files
                WHERE checksum IS NOT NULL
            ) AS unique_files
        """)
        stats['unique_size'] = cursor.fetchone()[0]

        # Wasted space
        stats['wasted_space'] = stats['total_size'] - stats['unique_size']

        # Deduplication ratio
        if stats['total_size'] > 0:
            stats['dedup_ratio'] = stats['unique_size'] / stats['total_size']
        else:
            stats['dedup_ratio'] = 1.0

        # Space saved percentage
        if stats['total_size'] > 0:
            stats['space_saved_percent'] = (stats['wasted_space'] / stats['total_size']) * 100
        else:
            stats['space_saved_percent'] = 0.0

        cursor.close()
        return stats

    def mark_canonical_files(self) -> int:
        """Mark canonical (first occurrence) files in database

        Returns:
            Number of canonical files marked
        """
        self.logger.subsection("Marking Canonical Files")

        conn = self._get_connection()
        cursor = conn.cursor()

        # Find first occurrence of each checksum and mark it as canonical
        cursor.execute("""
            WITH canonical AS (
                SELECT DISTINCT ON (checksum) path, checksum
                FROM files
                WHERE checksum IS NOT NULL
                ORDER BY checksum, path
            )
            UPDATE files
            SET duplicate_of = NULL
            WHERE path IN (SELECT path FROM canonical)
        """)

        count = cursor.rowcount
        conn.commit()
        cursor.close()

        self.logger.info(f"Marked {count} canonical files")
        return count

    def close(self):
        """Close connections"""
        self.hash_store.close()
        if self._connection and not self._connection.closed:
            self._connection.close()

    def __enter__(self):
        """Context manager entry"""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit"""
        self.close()
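

# --- Usage sketch ------------------------------------------------------------
# Illustrative only: the keyword arguments used to construct DatabaseConfig,
# ProcessingConfig, and ProgressLogger below are assumptions about those
# ..shared classes, not their confirmed constructors; adjust to the real
# signatures. Run with `python -m <package>.engine` so the relative imports
# at the top of this module resolve.
if __name__ == "__main__":
    db_config = DatabaseConfig(
        host="localhost", port=5432, database="catalog",   # hypothetical values
        user="dedup", password="secret"
    )
    processing_config = ProcessingConfig(
        parallel_workers=4, commit_interval=100,            # assumed fields
        chunk_size=64 * 1024, hash_algorithm="sha256"
    )
    logger = ProgressLogger()

    # The engine is a context manager, so database connections are closed on exit.
    with DeduplicationEngine(db_config, processing_config, logger) as engine:
        engine.deduplicate_all(use_chunks=True)
        engine.mark_canonical_files()
        print(engine.get_deduplication_stats())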