"""Deduplication engine"""
|
|
from pathlib import Path
|
|
from typing import Optional, Callable
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
import psycopg2
|
|
|
|
from .chunker import compute_file_signature, hash_file
|
|
from .store import HashStore
|
|
from ..shared.models import FileRecord, ProcessingStats
|
|
from ..shared.config import DatabaseConfig, ProcessingConfig
|
|
from ..shared.logger import ProgressLogger
|
|
|
|
|
|
class DeduplicationEngine:
|
|
"""Engine for deduplicating files"""
|
|
|
|
def __init__(
|
|
self,
|
|
db_config: DatabaseConfig,
|
|
processing_config: ProcessingConfig,
|
|
logger: ProgressLogger
|
|
):
|
|
"""Initialize deduplication engine
|
|
|
|
Args:
|
|
db_config: Database configuration
|
|
processing_config: Processing configuration
|
|
logger: Progress logger
|
|
"""
|
|
self.db_config = db_config
|
|
self.processing_config = processing_config
|
|
self.logger = logger
|
|
self.hash_store = HashStore(db_config)
|
|
self._connection = None
|
|
|
|
def _get_connection(self):
|
|
"""Get or create database connection"""
|
|
if self._connection is None or self._connection.closed:
|
|
self._connection = psycopg2.connect(
|
|
host=self.db_config.host,
|
|
port=self.db_config.port,
|
|
database=self.db_config.database,
|
|
user=self.db_config.user,
|
|
password=self.db_config.password
|
|
)
|
|
return self._connection

    def deduplicate_all(
        self,
        disk: Optional[str] = None,
        use_chunks: bool = True,
        progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
    ) -> ProcessingStats:
        """Deduplicate all files in database

        Args:
            disk: Optional disk filter
            use_chunks: Whether to use chunk-level deduplication
            progress_callback: Optional callback for progress updates

        Returns:
            ProcessingStats with deduplication statistics
        """
        self.logger.section("Starting Deduplication")

        conn = self._get_connection()
        cursor = conn.cursor()

        # Get files without checksums
        if disk:
            cursor.execute("""
                SELECT path, size
                FROM files
                WHERE disk_label = %s AND checksum IS NULL
                ORDER BY size DESC
            """, (disk,))
        else:
            cursor.execute("""
                SELECT path, size
                FROM files
                WHERE checksum IS NULL
                ORDER BY size DESC
            """)

        files_to_process = cursor.fetchall()
        total_files = len(files_to_process)

        self.logger.info(f"Found {total_files} files to process")

        stats = ProcessingStats()

        # Process files with thread pool
        with ThreadPoolExecutor(max_workers=self.processing_config.parallel_workers) as executor:
            futures = {}

            for path_str, size in files_to_process:
                path = Path(path_str)
                future = executor.submit(self._process_file, path, use_chunks)
                futures[future] = (path, size)

            # Process completed futures
            for future in as_completed(futures):
                path, size = futures[future]

                try:
                    checksum, duplicate_of = future.result()

                    if checksum:
                        # Update database
                        cursor.execute("""
                            UPDATE files
                            SET checksum = %s, duplicate_of = %s
                            WHERE path = %s
                        """, (checksum, duplicate_of, str(path)))

                        stats.files_succeeded += 1
                        stats.bytes_processed += size

                    stats.files_processed += 1

                    # Commit periodically
                    if stats.files_processed % self.processing_config.commit_interval == 0:
                        conn.commit()

                    # Progress callback
                    if progress_callback:
                        progress_callback(stats.files_processed, total_files, stats)

                    # Log progress
                    self.logger.progress(
                        stats.files_processed,
                        total_files,
                        prefix="Files processed",
                        bytes_processed=stats.bytes_processed,
                        elapsed_seconds=stats.elapsed_seconds
                    )

                except Exception as e:
                    self.logger.warning(f"Failed to process {path}: {e}")
                    stats.files_failed += 1
                    stats.files_processed += 1

        # Final commit
        conn.commit()
        cursor.close()

        self.logger.info(
            f"Deduplication complete: {stats.files_succeeded}/{total_files} files, "
            f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
        )

        return stats
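
    # The progress_callback passed to deduplicate_all() is invoked as
    # callback(files_processed, total_files, stats).  A minimal sketch of a
    # compatible callback (the print format is only an illustration):
    #
    #     def report(done: int, total: int, stats: ProcessingStats) -> None:
    #         print(f"{done}/{total} files, {stats.bytes_processed:,} bytes")
    #
    #     engine.deduplicate_all(progress_callback=report)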

    def _process_file(
        self,
        path: Path,
        use_chunks: bool
    ) -> tuple[Optional[str], Optional[str]]:
        """Process a single file for deduplication

        Args:
            path: Path to file
            use_chunks: Whether to use chunk-level deduplication

        Returns:
            Tuple of (checksum, duplicate_of_path)
        """
        if not path.exists():
            return None, None

        try:
            if use_chunks:
                # Compute file signature with chunks
                checksum, chunk_hashes = compute_file_signature(
                    path,
                    use_rabin=True,
                    avg_chunk_size=self.processing_config.chunk_size
                )
            else:
                # Just compute file hash
                checksum = hash_file(
                    path,
                    algorithm=self.processing_config.hash_algorithm
                )
                chunk_hashes = None

            # Check if hash exists
            if self.hash_store.exists(checksum):
                # Duplicate found
                canonical_path = self.hash_store.get_canonical(checksum)
                return checksum, canonical_path
            else:
                # New unique file
                size = path.stat().st_size
                self.hash_store.store_canonical(
                    checksum,
                    path,
                    size,
                    chunk_hashes
                )
                return checksum, None

        except Exception as e:
            self.logger.debug(f"Error processing {path}: {e}")
            raise

    def find_duplicates(
        self,
        disk: Optional[str] = None
    ) -> dict[str, list[str]]:
        """Find all duplicate files

        Args:
            disk: Optional disk filter

        Returns:
            Dictionary mapping canonical path to list of duplicate paths
        """
        self.logger.subsection("Finding Duplicates")

        conn = self._get_connection()
        cursor = conn.cursor()

        # Query for duplicates
        if disk:
            cursor.execute("""
                SELECT checksum, array_agg(path ORDER BY path) as paths
                FROM files
                WHERE disk_label = %s AND checksum IS NOT NULL
                GROUP BY checksum
                HAVING COUNT(*) > 1
            """, (disk,))
        else:
            cursor.execute("""
                SELECT checksum, array_agg(path ORDER BY path) as paths
                FROM files
                WHERE checksum IS NOT NULL
                GROUP BY checksum
                HAVING COUNT(*) > 1
            """)

        duplicates = {}
        for checksum, paths in cursor.fetchall():
            canonical = paths[0]
            duplicates[canonical] = paths[1:]

        cursor.close()

        self.logger.info(f"Found {len(duplicates)} sets of duplicates")

        return duplicates
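
    # find_duplicates() returns a mapping shaped like the illustrative sketch
    # below (paths are hypothetical; the first path in each checksum group,
    # in lexical order, is treated as the canonical copy):
    #
    #     {
    #         "/mnt/disk01/photos/a.jpg": [
    #             "/mnt/disk01/photos/copy_of_a.jpg",
    #             "/mnt/disk02/backup/a.jpg",
    #         ],
    #     }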

    def get_deduplication_stats(self) -> dict:
        """Get deduplication statistics

        Returns:
            Dictionary with statistics
        """
        conn = self._get_connection()
        cursor = conn.cursor()

        stats = {}

        # Total files
        cursor.execute("SELECT COUNT(*) FROM files WHERE checksum IS NOT NULL")
        stats['total_files'] = cursor.fetchone()[0]

        # Unique files
        cursor.execute("SELECT COUNT(DISTINCT checksum) FROM files WHERE checksum IS NOT NULL")
        stats['unique_files'] = cursor.fetchone()[0]

        # Duplicate files
        stats['duplicate_files'] = stats['total_files'] - stats['unique_files']

        # Total size
        cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE checksum IS NOT NULL")
        stats['total_size'] = cursor.fetchone()[0]

        # Unique size
        cursor.execute("""
            SELECT COALESCE(SUM(size), 0)
            FROM (
                SELECT DISTINCT ON (checksum) size
                FROM files
                WHERE checksum IS NOT NULL
            ) AS unique_files
        """)
        stats['unique_size'] = cursor.fetchone()[0]

        # Wasted space
        stats['wasted_space'] = stats['total_size'] - stats['unique_size']

        # Deduplication ratio
        if stats['total_size'] > 0:
            stats['dedup_ratio'] = stats['unique_size'] / stats['total_size']
        else:
            stats['dedup_ratio'] = 1.0

        # Space saved percentage
        if stats['total_size'] > 0:
            stats['space_saved_percent'] = (stats['wasted_space'] / stats['total_size']) * 100
        else:
            stats['space_saved_percent'] = 0.0

        cursor.close()

        return stats

    def mark_canonical_files(self) -> int:
        """Mark canonical (first occurrence) files in database

        Returns:
            Number of canonical files marked
        """
        self.logger.subsection("Marking Canonical Files")

        conn = self._get_connection()
        cursor = conn.cursor()

        # Find first occurrence of each checksum and mark as canonical
        cursor.execute("""
            WITH canonical AS (
                SELECT DISTINCT ON (checksum) path, checksum
                FROM files
                WHERE checksum IS NOT NULL
                ORDER BY checksum, path
            )
            UPDATE files
            SET duplicate_of = NULL
            WHERE path IN (SELECT path FROM canonical)
        """)

        count = cursor.rowcount
        conn.commit()
        cursor.close()

        self.logger.info(f"Marked {count} canonical files")

        return count

    def close(self):
        """Close connections"""
        self.hash_store.close()
        if self._connection and not self._connection.closed:
            self._connection.close()

    def __enter__(self):
        """Context manager entry"""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit"""
        self.close()
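

# Example usage (a hedged sketch: DatabaseConfig, ProcessingConfig, and
# ProgressLogger come from ..shared and their constructor arguments are
# assumed here; run via ``python -m <package>.<module>`` because of the
# relative imports at the top of this file):
#
#     with DeduplicationEngine(db_config, processing_config, logger) as engine:
#         stats = engine.deduplicate_all(disk="disk01", use_chunks=True)
#         duplicates = engine.find_duplicates()
#         print(engine.get_deduplication_stats())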