initial

app/deduplication/engine.py (new file, 353 lines)

@@ -0,0 +1,353 @@
"""Deduplication engine"""

from pathlib import Path
from typing import Optional, Callable
from concurrent.futures import ThreadPoolExecutor, as_completed

import psycopg2

from .chunker import compute_file_signature, hash_file
from .store import HashStore
from ..shared.models import FileRecord, ProcessingStats
from ..shared.config import DatabaseConfig, ProcessingConfig
from ..shared.logger import ProgressLogger


class DeduplicationEngine:
    """Engine for deduplicating files"""

    def __init__(
        self,
        db_config: DatabaseConfig,
        processing_config: ProcessingConfig,
        logger: ProgressLogger
    ):
        """Initialize the deduplication engine

        Args:
            db_config: Database configuration
            processing_config: Processing configuration
            logger: Progress logger
        """
        self.db_config = db_config
        self.processing_config = processing_config
        self.logger = logger
        self.hash_store = HashStore(db_config)
        self._connection = None

    def _get_connection(self):
        """Get or create the database connection, reconnecting if it was closed"""
        if self._connection is None or self._connection.closed:
            self._connection = psycopg2.connect(
                host=self.db_config.host,
                port=self.db_config.port,
                database=self.db_config.database,
                user=self.db_config.user,
                password=self.db_config.password
            )
        return self._connection

    def deduplicate_all(
        self,
        disk: Optional[str] = None,
        use_chunks: bool = True,
        progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
    ) -> ProcessingStats:
        """Deduplicate all files in the database

        Args:
            disk: Optional disk label to restrict processing to
            use_chunks: Whether to use chunk-level deduplication
            progress_callback: Optional callback for progress updates

        Returns:
            ProcessingStats with deduplication statistics
        """
        self.logger.section("Starting Deduplication")

        conn = self._get_connection()
        cursor = conn.cursor()

        # Get files that have no checksum yet, largest first
        if disk:
            cursor.execute("""
                SELECT path, size
                FROM files
                WHERE disk_label = %s AND checksum IS NULL
                ORDER BY size DESC
            """, (disk,))
        else:
            cursor.execute("""
                SELECT path, size
                FROM files
                WHERE checksum IS NULL
                ORDER BY size DESC
            """)

        files_to_process = cursor.fetchall()
        total_files = len(files_to_process)

        self.logger.info(f"Found {total_files} files to process")

        stats = ProcessingStats()

        # Hash files in a thread pool; database writes stay on this thread
        with ThreadPoolExecutor(max_workers=self.processing_config.parallel_workers) as executor:
            futures = {}

            for path_str, size in files_to_process:
                path = Path(path_str)
                future = executor.submit(self._process_file, path, use_chunks)
                futures[future] = (path, size)

            # Handle results as workers finish
            for future in as_completed(futures):
                path, size = futures[future]

                try:
                    checksum, duplicate_of = future.result()

                    if checksum:
                        # Record the checksum and, if any, the canonical copy
                        cursor.execute("""
                            UPDATE files
                            SET checksum = %s, duplicate_of = %s
                            WHERE path = %s
                        """, (checksum, duplicate_of, str(path)))

                        stats.files_succeeded += 1
                        stats.bytes_processed += size

                    stats.files_processed += 1

                    # Commit periodically so a crash loses at most one batch
                    if stats.files_processed % self.processing_config.commit_interval == 0:
                        conn.commit()

                    # Progress callback
                    if progress_callback:
                        progress_callback(stats.files_processed, total_files, stats)

                    # Log progress
                    self.logger.progress(
                        stats.files_processed,
                        total_files,
                        prefix="Files processed",
                        bytes_processed=stats.bytes_processed,
                        elapsed_seconds=stats.elapsed_seconds
                    )

                except Exception as e:
                    self.logger.warning(f"Failed to process {path}: {e}")
                    stats.files_failed += 1
                    stats.files_processed += 1

        # Final commit
        conn.commit()
        cursor.close()

        self.logger.info(
            f"Deduplication complete: {stats.files_succeeded}/{total_files} files, "
            f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
        )

        return stats
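
    # Concurrency note: worker threads only hash file contents and talk to
    # HashStore; all writes on `conn`/`cursor` happen on the main thread in
    # the as_completed loop above. This assumes HashStore is safe to call
    # from multiple threads (e.g. it guards or pools its own connection).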

    def _process_file(
        self,
        path: Path,
        use_chunks: bool
    ) -> tuple[Optional[str], Optional[str]]:
        """Process a single file for deduplication

        Args:
            path: Path to the file
            use_chunks: Whether to use chunk-level deduplication

        Returns:
            Tuple of (checksum, duplicate_of_path); both are None if the file
            no longer exists
        """
        if not path.exists():
            return None, None

        try:
            if use_chunks:
                # Compute the file checksum plus content-defined chunk hashes
                checksum, chunk_hashes = compute_file_signature(
                    path,
                    use_rabin=True,
                    avg_chunk_size=self.processing_config.chunk_size
                )
            else:
                # Just compute the whole-file hash
                checksum = hash_file(
                    path,
                    algorithm=self.processing_config.hash_algorithm
                )
                chunk_hashes = None

            # Check if this hash has been seen before
            if self.hash_store.exists(checksum):
                # Duplicate found: point it at the canonical copy
                canonical_path = self.hash_store.get_canonical(checksum)
                return checksum, canonical_path
            else:
                # New unique file: record it as the canonical copy
                size = path.stat().st_size
                self.hash_store.store_canonical(
                    checksum,
                    path,
                    size,
                    chunk_hashes
                )
                return checksum, None

        except Exception as e:
            self.logger.debug(f"Error processing {path}: {e}")
            raise
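
    # Note: with use_chunks=True, the per-chunk hashes from Rabin
    # content-defined chunking are handed to HashStore alongside the file
    # checksum, presumably so chunk-level overlap between files can be
    # exploited later; with use_chunks=False only exact whole-file
    # duplicates are detected.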

    def find_duplicates(
        self,
        disk: Optional[str] = None
    ) -> dict[str, list[str]]:
        """Find all duplicate files

        Args:
            disk: Optional disk label to restrict the search to

        Returns:
            Dictionary mapping each canonical path to its list of duplicate paths
        """
        self.logger.subsection("Finding Duplicates")

        conn = self._get_connection()
        cursor = conn.cursor()

        # Group by checksum and keep only groups with more than one file
        if disk:
            cursor.execute("""
                SELECT checksum, array_agg(path ORDER BY path) AS paths
                FROM files
                WHERE disk_label = %s AND checksum IS NOT NULL
                GROUP BY checksum
                HAVING COUNT(*) > 1
            """, (disk,))
        else:
            cursor.execute("""
                SELECT checksum, array_agg(path ORDER BY path) AS paths
                FROM files
                WHERE checksum IS NOT NULL
                GROUP BY checksum
                HAVING COUNT(*) > 1
            """)

        duplicates = {}
        for checksum, paths in cursor.fetchall():
            # The first path in sorted order is treated as the canonical copy
            canonical = paths[0]
            duplicates[canonical] = paths[1:]

        cursor.close()

        self.logger.info(f"Found {len(duplicates)} sets of duplicates")

        return duplicates
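
    # Illustrative return value of find_duplicates (hypothetical paths):
    # {
    #     "/mnt/disk1/photos/img_0001.jpg": [
    #         "/mnt/disk1/photos/copy/img_0001.jpg",
    #         "/mnt/disk2/backup/img_0001.jpg",
    #     ],
    # }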

    def get_deduplication_stats(self) -> dict:
        """Get deduplication statistics

        Returns:
            Dictionary with file counts, sizes, wasted space, dedup ratio and
            space-saved percentage
        """
        conn = self._get_connection()
        cursor = conn.cursor()

        stats = {}

        # Total files with a checksum
        cursor.execute("SELECT COUNT(*) FROM files WHERE checksum IS NOT NULL")
        stats['total_files'] = cursor.fetchone()[0]

        # Unique files (distinct checksums)
        cursor.execute("SELECT COUNT(DISTINCT checksum) FROM files WHERE checksum IS NOT NULL")
        stats['unique_files'] = cursor.fetchone()[0]

        # Duplicate files
        stats['duplicate_files'] = stats['total_files'] - stats['unique_files']

        # Total size
        cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE checksum IS NOT NULL")
        stats['total_size'] = cursor.fetchone()[0]

        # Unique size: count each checksum's size only once
        cursor.execute("""
            SELECT COALESCE(SUM(size), 0)
            FROM (
                SELECT DISTINCT ON (checksum) size
                FROM files
                WHERE checksum IS NOT NULL
            ) AS unique_files
        """)
        stats['unique_size'] = cursor.fetchone()[0]

        # Wasted space
        stats['wasted_space'] = stats['total_size'] - stats['unique_size']

        # Deduplication ratio: fraction of total bytes that are unique
        if stats['total_size'] > 0:
            stats['dedup_ratio'] = stats['unique_size'] / stats['total_size']
        else:
            stats['dedup_ratio'] = 1.0

        # Space saved percentage
        if stats['total_size'] > 0:
            stats['space_saved_percent'] = (stats['wasted_space'] / stats['total_size']) * 100
        else:
            stats['space_saved_percent'] = 0.0

        cursor.close()

        return stats
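
    # Worked example (hypothetical numbers): with total_size = 1000 GB and
    # unique_size = 600 GB, wasted_space = 400 GB, dedup_ratio = 0.6 and
    # space_saved_percent = 40.0, i.e. removing duplicates would free 40%
    # of the occupied space.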

    def mark_canonical_files(self) -> int:
        """Mark canonical (first occurrence) files in the database

        Returns:
            Number of canonical files marked
        """
        self.logger.subsection("Marking Canonical Files")

        conn = self._get_connection()
        cursor = conn.cursor()

        # Pick the first path (in sort order) for each checksum and clear its
        # duplicate_of pointer, marking it as the canonical copy
        cursor.execute("""
            WITH canonical AS (
                SELECT DISTINCT ON (checksum) path, checksum
                FROM files
                WHERE checksum IS NOT NULL
                ORDER BY checksum, path
            )
            UPDATE files
            SET duplicate_of = NULL
            WHERE path IN (SELECT path FROM canonical)
        """)

        count = cursor.rowcount
        conn.commit()
        cursor.close()

        self.logger.info(f"Marked {count} canonical files")

        return count

    def close(self):
        """Close connections"""
        self.hash_store.close()
        if self._connection and not self._connection.closed:
            self._connection.close()

    def __enter__(self):
        """Context manager entry"""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit"""
        self.close()
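

# Usage sketch: a minimal driver, not part of the module's committed API.
# The DatabaseConfig/ProcessingConfig field names below match the attributes
# the engine reads, but dataclass-style constructors and the no-arg
# ProgressLogger are assumptions; see app/shared/config.py and
# app/shared/logger.py for the real signatures.
if __name__ == "__main__":
    db_config = DatabaseConfig(
        host="localhost", port=5432, database="files",  # assumed constructor
        user="dedup", password="secret",
    )
    processing_config = ProcessingConfig(
        parallel_workers=4, commit_interval=100,        # assumed constructor
        chunk_size=64 * 1024, hash_algorithm="sha256",
    )
    logger = ProgressLogger()  # assumed no-arg constructor

    # Run via `python -m app.deduplication.engine` so relative imports resolve
    with DeduplicationEngine(db_config, processing_config, logger) as engine:
        stats = engine.deduplicate_all(use_chunks=True)
        engine.mark_canonical_files()
        print(engine.get_deduplication_stats())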