defrag/app/deduplication/engine.py
"""Deduplication engine"""
from pathlib import Path
from typing import Optional, Callable
from concurrent.futures import ThreadPoolExecutor, as_completed
import psycopg2
from .chunker import compute_file_signature, hash_file
from .store import HashStore
from ..shared.models import FileRecord, ProcessingStats
from ..shared.config import DatabaseConfig, ProcessingConfig
from ..shared.logger import ProgressLogger
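
# NOTE: the SQL below assumes a PostgreSQL "files" table exposing at least the
# columns referenced in this module: path, size, disk_label, checksum and
# duplicate_of. This is inferred from the queries here; the actual schema is
# defined elsewhere in the application.
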
class DeduplicationEngine:
    """Engine for deduplicating files"""

    def __init__(
        self,
        db_config: DatabaseConfig,
        processing_config: ProcessingConfig,
        logger: ProgressLogger
    ):
        """Initialize deduplication engine

        Args:
            db_config: Database configuration
            processing_config: Processing configuration
            logger: Progress logger
        """
        self.db_config = db_config
        self.processing_config = processing_config
        self.logger = logger
        self.hash_store = HashStore(db_config)
        self._connection = None

    def _get_connection(self):
        """Get or create database connection"""
        if self._connection is None or self._connection.closed:
            self._connection = psycopg2.connect(
                host=self.db_config.host,
                port=self.db_config.port,
                database=self.db_config.database,
                user=self.db_config.user,
                password=self.db_config.password
            )
        return self._connection
    def deduplicate_all(
        self,
        disk: Optional[str] = None,
        use_chunks: bool = True,
        progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
    ) -> ProcessingStats:
        """Deduplicate all files in database

        Args:
            disk: Optional disk filter
            use_chunks: Whether to use chunk-level deduplication
            progress_callback: Optional callback for progress updates

        Returns:
            ProcessingStats with deduplication statistics
        """
        self.logger.section("Starting Deduplication")
        conn = self._get_connection()
        cursor = conn.cursor()

        # Get files without checksums
        if disk:
            cursor.execute("""
                SELECT path, size
                FROM files
                WHERE disk_label = %s AND checksum IS NULL
                ORDER BY size DESC
            """, (disk,))
        else:
            cursor.execute("""
                SELECT path, size
                FROM files
                WHERE checksum IS NULL
                ORDER BY size DESC
            """)

        files_to_process = cursor.fetchall()
        total_files = len(files_to_process)
        self.logger.info(f"Found {total_files} files to process")

        stats = ProcessingStats()

        # Process files with thread pool
        with ThreadPoolExecutor(max_workers=self.processing_config.parallel_workers) as executor:
            futures = {}
            for path_str, size in files_to_process:
                path = Path(path_str)
                future = executor.submit(self._process_file, path, use_chunks)
                futures[future] = (path, size)

            # Process completed futures
            for future in as_completed(futures):
                path, size = futures[future]
                try:
                    checksum, duplicate_of = future.result()

                    if checksum:
                        # Update database
                        cursor.execute("""
                            UPDATE files
                            SET checksum = %s, duplicate_of = %s
                            WHERE path = %s
                        """, (checksum, duplicate_of, str(path)))
                        stats.files_succeeded += 1
                        stats.bytes_processed += size

                    stats.files_processed += 1

                    # Commit periodically
                    if stats.files_processed % self.processing_config.commit_interval == 0:
                        conn.commit()

                    # Progress callback
                    if progress_callback:
                        progress_callback(stats.files_processed, total_files, stats)

                    # Log progress
                    self.logger.progress(
                        stats.files_processed,
                        total_files,
                        prefix="Files processed",
                        bytes_processed=stats.bytes_processed,
                        elapsed_seconds=stats.elapsed_seconds
                    )

                except Exception as e:
                    self.logger.warning(f"Failed to process {path}: {e}")
                    stats.files_failed += 1
                    stats.files_processed += 1

        # Final commit
        conn.commit()
        cursor.close()

        self.logger.info(
            f"Deduplication complete: {stats.files_succeeded}/{total_files} files, "
            f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
        )
        return stats
    def _process_file(
        self,
        path: Path,
        use_chunks: bool
    ) -> tuple[Optional[str], Optional[str]]:
        """Process a single file for deduplication

        Args:
            path: Path to file
            use_chunks: Whether to use chunk-level deduplication

        Returns:
            Tuple of (checksum, duplicate_of_path)
        """
        if not path.exists():
            return None, None

        try:
            if use_chunks:
                # Compute file signature with chunks
                checksum, chunk_hashes = compute_file_signature(
                    path,
                    use_rabin=True,
                    avg_chunk_size=self.processing_config.chunk_size
                )
            else:
                # Just compute file hash
                checksum = hash_file(
                    path,
                    algorithm=self.processing_config.hash_algorithm
                )
                chunk_hashes = None

            # Check if hash exists
            if self.hash_store.exists(checksum):
                # Duplicate found
                canonical_path = self.hash_store.get_canonical(checksum)
                return checksum, canonical_path
            else:
                # New unique file
                size = path.stat().st_size
                self.hash_store.store_canonical(
                    checksum,
                    path,
                    size,
                    chunk_hashes
                )
                return checksum, None

        except Exception as e:
            self.logger.debug(f"Error processing {path}: {e}")
            raise
    def find_duplicates(
        self,
        disk: Optional[str] = None
    ) -> dict[str, list[str]]:
        """Find all duplicate files

        Args:
            disk: Optional disk filter

        Returns:
            Dictionary mapping canonical path to list of duplicate paths
        """
        self.logger.subsection("Finding Duplicates")
        conn = self._get_connection()
        cursor = conn.cursor()

        # Query for duplicates
        if disk:
            cursor.execute("""
                SELECT checksum, array_agg(path ORDER BY path) as paths
                FROM files
                WHERE disk_label = %s AND checksum IS NOT NULL
                GROUP BY checksum
                HAVING COUNT(*) > 1
            """, (disk,))
        else:
            cursor.execute("""
                SELECT checksum, array_agg(path ORDER BY path) as paths
                FROM files
                WHERE checksum IS NOT NULL
                GROUP BY checksum
                HAVING COUNT(*) > 1
            """)

        duplicates = {}
        for checksum, paths in cursor.fetchall():
            canonical = paths[0]
            duplicates[canonical] = paths[1:]

        cursor.close()
        self.logger.info(f"Found {len(duplicates)} sets of duplicates")
        return duplicates
    def get_deduplication_stats(self) -> dict:
        """Get deduplication statistics

        Returns:
            Dictionary with statistics
        """
        conn = self._get_connection()
        cursor = conn.cursor()
        stats = {}

        # Total files
        cursor.execute("SELECT COUNT(*) FROM files WHERE checksum IS NOT NULL")
        stats['total_files'] = cursor.fetchone()[0]

        # Unique files
        cursor.execute("SELECT COUNT(DISTINCT checksum) FROM files WHERE checksum IS NOT NULL")
        stats['unique_files'] = cursor.fetchone()[0]

        # Duplicate files
        stats['duplicate_files'] = stats['total_files'] - stats['unique_files']

        # Total size
        cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE checksum IS NOT NULL")
        stats['total_size'] = cursor.fetchone()[0]

        # Unique size
        cursor.execute("""
            SELECT COALESCE(SUM(size), 0)
            FROM (
                SELECT DISTINCT ON (checksum) size
                FROM files
                WHERE checksum IS NOT NULL
            ) AS unique_files
        """)
        stats['unique_size'] = cursor.fetchone()[0]

        # Wasted space
        stats['wasted_space'] = stats['total_size'] - stats['unique_size']

        # Deduplication ratio
        if stats['total_size'] > 0:
            stats['dedup_ratio'] = stats['unique_size'] / stats['total_size']
        else:
            stats['dedup_ratio'] = 1.0

        # Space saved percentage
        if stats['total_size'] > 0:
            stats['space_saved_percent'] = (stats['wasted_space'] / stats['total_size']) * 100
        else:
            stats['space_saved_percent'] = 0.0

        cursor.close()
        return stats
    def mark_canonical_files(self) -> int:
        """Mark canonical (first occurrence) files in database

        Returns:
            Number of canonical files marked
        """
        self.logger.subsection("Marking Canonical Files")
        conn = self._get_connection()
        cursor = conn.cursor()

        # Find first occurrence of each checksum and mark as canonical
        cursor.execute("""
            WITH canonical AS (
                SELECT DISTINCT ON (checksum) path, checksum
                FROM files
                WHERE checksum IS NOT NULL
                ORDER BY checksum, path
            )
            UPDATE files
            SET duplicate_of = NULL
            WHERE path IN (SELECT path FROM canonical)
        """)
        count = cursor.rowcount

        conn.commit()
        cursor.close()
        self.logger.info(f"Marked {count} canonical files")
        return count
    def close(self):
        """Close connections"""
        self.hash_store.close()
        if self._connection and not self._connection.closed:
            self._connection.close()

    def __enter__(self):
        """Context manager entry"""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit"""
        self.close()
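

# --- Usage sketch (illustrative only) ----------------------------------------
# A minimal sketch of how this engine is intended to be driven. It assumes that
# DatabaseConfig, ProcessingConfig and ProgressLogger instances come from the
# application's own configuration/logging layer; their constructors are not
# defined in this file, so they are left abstract here, and the disk label
# "disk01" is a placeholder.
#
#     def report(done: int, total: int, stats: ProcessingStats) -> None:
#         print(f"{done}/{total} files, {stats.bytes_processed:,} bytes hashed")
#
#     with DeduplicationEngine(db_config, processing_config, logger) as engine:
#         engine.deduplicate_all(disk="disk01", progress_callback=report)
#         engine.mark_canonical_files()
#         duplicates = engine.find_duplicates(disk="disk01")
#         summary = engine.get_deduplication_stats()
#         logger.info(f"Space saved: {summary['space_saved_percent']:.1f}%")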