# defrag/app/deduplication/engine.py
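"""Deduplication engine for the defrag catalogue.

Hashes files recorded in the ``files`` table (optionally with Rabin
content-defined chunking), registers the first copy of each checksum as
canonical in a HashStore, and links later copies via ``duplicate_of``.
"""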
from pathlib import Path
from typing import Optional, Callable
from concurrent.futures import ThreadPoolExecutor, as_completed

import psycopg2

from .chunker import compute_file_signature, hash_file
from .store import HashStore
from ..shared.models import FileRecord, ProcessingStats
from ..shared.config import DatabaseConfig, ProcessingConfig
from ..shared.logger import ProgressLogger


class DeduplicationEngine:
    """Hashes files and records duplicate relationships in PostgreSQL."""

    def __init__(self, db_config: DatabaseConfig, processing_config: ProcessingConfig, logger: ProgressLogger):
        self.db_config = db_config
        self.processing_config = processing_config
        self.logger = logger
        self.hash_store = HashStore(db_config)
        self._connection = None

    def _get_connection(self):
        # Open lazily; reconnect transparently if the connection was closed.
        if self._connection is None or self._connection.closed:
            self._connection = psycopg2.connect(
                host=self.db_config.host,
                port=self.db_config.port,
                database=self.db_config.database,
                user=self.db_config.user,
                password=self.db_config.password,
            )
        return self._connection

    def deduplicate_all(self, disk: Optional[str] = None, use_chunks: bool = True,
                        progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None) -> ProcessingStats:
        """Hash every file that still has no checksum and record duplicates."""
        self.logger.section('Starting Deduplication')
        conn = self._get_connection()
        cursor = conn.cursor()
        # Largest files first, so the longest-running hashes start early.
        if disk:
            cursor.execute("""
                SELECT path, size
                FROM files
                WHERE disk_label = %s AND checksum IS NULL
                ORDER BY size DESC
            """, (disk,))
        else:
            cursor.execute("""
                SELECT path, size
                FROM files
                WHERE checksum IS NULL
                ORDER BY size DESC
            """)
        files_to_process = cursor.fetchall()
        total_files = len(files_to_process)
        self.logger.info(f'Found {total_files} files to process')
        stats = ProcessingStats()
        with ThreadPoolExecutor(max_workers=self.processing_config.parallel_workers) as executor:
            # Hash in worker threads; all database writes stay on this thread.
            futures = {}
            for path_str, size in files_to_process:
                path = Path(path_str)
                future = executor.submit(self._process_file, path, use_chunks)
                futures[future] = (path, size)
            for future in as_completed(futures):
                path, size = futures[future]
                try:
                    checksum, duplicate_of = future.result()
                    if checksum:
                        cursor.execute("""
                            UPDATE files
                            SET checksum = %s, duplicate_of = %s
                            WHERE path = %s
                        """, (checksum, duplicate_of, str(path)))
                        stats.files_succeeded += 1
                        stats.bytes_processed += size
                    stats.files_processed += 1
                    # Commit periodically so a crash does not lose all progress.
                    if stats.files_processed % self.processing_config.commit_interval == 0:
                        conn.commit()
                    if progress_callback:
                        progress_callback(stats.files_processed, total_files, stats)
                    self.logger.progress(stats.files_processed, total_files, prefix='Files processed',
                                         bytes_processed=stats.bytes_processed,
                                         elapsed_seconds=stats.elapsed_seconds)
                except Exception as e:
                    self.logger.warning(f'Failed to process {path}: {e}')
                    stats.files_failed += 1
                    stats.files_processed += 1
        conn.commit()
        cursor.close()
        self.logger.info(f'Deduplication complete: {stats.files_succeeded}/{total_files} files, '
                         f'{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s')
        return stats

    def _process_file(self, path: Path, use_chunks: bool) -> tuple[Optional[str], Optional[str]]:
        """Hash one file; return (checksum, canonical_path), where
        canonical_path is None if this is the first copy seen."""
        if not path.exists():
            return (None, None)
        try:
            if use_chunks:
                # Content-defined (Rabin) chunking: whole-file checksum plus per-chunk hashes.
                checksum, chunk_hashes = compute_file_signature(
                    path, use_rabin=True, avg_chunk_size=self.processing_config.chunk_size)
            else:
                checksum = hash_file(path, algorithm=self.processing_config.hash_algorithm)
                chunk_hashes = None
            # Runs in worker threads, so this exists/store pair assumes HashStore
            # is safe for concurrent use.
            if self.hash_store.exists(checksum):
                canonical_path = self.hash_store.get_canonical(checksum)
                return (checksum, canonical_path)
            else:
                size = path.stat().st_size
                self.hash_store.store_canonical(checksum, path, size, chunk_hashes)
                return (checksum, None)
        except Exception as e:
            self.logger.debug(f'Error processing {path}: {e}')
            raise

    def find_duplicates(self, disk: Optional[str] = None) -> dict[str, list[str]]:
        """Map each canonical path to the list of its duplicate paths."""
        self.logger.subsection('Finding Duplicates')
        conn = self._get_connection()
        cursor = conn.cursor()
        if disk:
            cursor.execute("""
                SELECT checksum, array_agg(path ORDER BY path) AS paths
                FROM files
                WHERE disk_label = %s AND checksum IS NOT NULL
                GROUP BY checksum
                HAVING COUNT(*) > 1
            """, (disk,))
        else:
            cursor.execute("""
                SELECT checksum, array_agg(path ORDER BY path) AS paths
                FROM files
                WHERE checksum IS NOT NULL
                GROUP BY checksum
                HAVING COUNT(*) > 1
            """)
        duplicates = {}
        for checksum, paths in cursor.fetchall():
            canonical = paths[0]  # paths are sorted, so the first is canonical
            duplicates[canonical] = paths[1:]
        cursor.close()
        self.logger.info(f'Found {len(duplicates)} sets of duplicates')
        return duplicates

    def get_deduplication_stats(self) -> dict:
        """Summarise totals, unique counts, wasted space, and dedup ratio."""
        conn = self._get_connection()
        cursor = conn.cursor()
        stats = {}
        cursor.execute('SELECT COUNT(*) FROM files WHERE checksum IS NOT NULL')
        stats['total_files'] = cursor.fetchone()[0]
        cursor.execute('SELECT COUNT(DISTINCT checksum) FROM files WHERE checksum IS NOT NULL')
        stats['unique_files'] = cursor.fetchone()[0]
        stats['duplicate_files'] = stats['total_files'] - stats['unique_files']
        cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files WHERE checksum IS NOT NULL')
        stats['total_size'] = cursor.fetchone()[0]
        # Sum the size of one representative file per checksum.
        cursor.execute("""
            SELECT COALESCE(SUM(size), 0)
            FROM (
                SELECT DISTINCT ON (checksum) size
                FROM files
                WHERE checksum IS NOT NULL
            ) AS unique_files
        """)
        stats['unique_size'] = cursor.fetchone()[0]
        stats['wasted_space'] = stats['total_size'] - stats['unique_size']
        if stats['total_size'] > 0:
            stats['dedup_ratio'] = stats['unique_size'] / stats['total_size']
            stats['space_saved_percent'] = stats['wasted_space'] / stats['total_size'] * 100
        else:
            stats['dedup_ratio'] = 1.0
            stats['space_saved_percent'] = 0.0
        cursor.close()
        return stats

    def mark_canonical_files(self) -> int:
        """Clear duplicate_of on the first path per checksum, making it canonical."""
        self.logger.subsection('Marking Canonical Files')
        conn = self._get_connection()
        cursor = conn.cursor()
        cursor.execute("""
            WITH canonical AS (
                SELECT DISTINCT ON (checksum) path, checksum
                FROM files
                WHERE checksum IS NOT NULL
                ORDER BY checksum, path
            )
            UPDATE files
            SET duplicate_of = NULL
            WHERE path IN (SELECT path FROM canonical)
        """)
        count = cursor.rowcount
        conn.commit()
        cursor.close()
        self.logger.info(f'Marked {count} canonical files')
        return count

    def close(self):
        self.hash_store.close()
        if self._connection and not self._connection.closed:
            self._connection.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
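
# Usage sketch (illustrative, not part of the original module). The exact
# constructor signatures of DatabaseConfig, ProcessingConfig, and
# ProgressLogger live in defrag.app.shared and are assumed here; the field
# names match the attributes this module actually reads:
#
#     db_cfg = DatabaseConfig(host='localhost', port=5432, database='defrag',
#                             user='defrag', password='secret')
#     proc_cfg = ProcessingConfig(parallel_workers=4, commit_interval=500,
#                                 chunk_size=64 * 1024, hash_algorithm='sha256')
#     with DeduplicationEngine(db_cfg, proc_cfg, ProgressLogger()) as engine:
#         engine.deduplicate_all(disk='disk01')
#         engine.mark_canonical_files()
#         print(engine.get_deduplication_stats())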