commit 2b2c575385 by mike, 2025-12-13 11:56:06 +01:00
57 changed files with 6505 additions and 0 deletions

app/deduplication/__init__.py (new file, 21 lines)

@@ -0,0 +1,21 @@
"""Deduplication package exports"""
from .chunker import (
RabinChunker,
SimpleChunker,
hash_chunk,
hash_file,
compute_file_signature
)
from .store import HashStore, MemoryHashStore
from .engine import DeduplicationEngine
__all__ = [
'RabinChunker',
'SimpleChunker',
'hash_chunk',
'hash_file',
'compute_file_signature',
'HashStore',
'MemoryHashStore',
'DeduplicationEngine',
]
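
For orientation, a minimal usage sketch of the exported surface; the import path app.deduplication and the file name example.bin are placeholders, not part of this commit:

from pathlib import Path

from app.deduplication import SimpleChunker, hash_chunk, hash_file

path = Path("example.bin")  # placeholder input file
print("whole-file hash:", hash_file(path))

chunker = SimpleChunker(chunk_size=4096)
for i, chunk in enumerate(chunker.chunk_file(path)):
    # each fixed-size chunk gets its own content hash
    print(i, len(chunk), hash_chunk(chunk)[:16])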

app/deduplication/chunker.py (new file, 241 lines)

@@ -0,0 +1,241 @@
"""Rabin fingerprint chunker for content-defined chunking"""
import hashlib
from pathlib import Path
from typing import Iterator, Optional
class RabinChunker:
"""Content-defined chunking using Rabin fingerprinting
Uses a rolling hash to identify chunk boundaries based on content,
allowing for efficient deduplication even when data is modified.
"""
def __init__(
self,
avg_chunk_size: int = 8192,
min_chunk_size: Optional[int] = None,
max_chunk_size: Optional[int] = None,
window_size: int = 48
):
"""Initialize Rabin chunker
Args:
avg_chunk_size: Target average chunk size in bytes
min_chunk_size: Minimum chunk size (default: avg_chunk_size // 4)
max_chunk_size: Maximum chunk size (default: avg_chunk_size * 8)
window_size: Rolling hash window size
"""
self.avg_chunk_size = avg_chunk_size
self.min_chunk_size = min_chunk_size or (avg_chunk_size // 4)
self.max_chunk_size = max_chunk_size or (avg_chunk_size * 8)
self.window_size = window_size
        # Calculate mask for boundary detection: with bits = floor(log2(avg_chunk_size)),
        # (hash_value & mask) == 0 fires with probability ~1/avg_chunk_size per byte
        # (e.g. avg_chunk_size=8192 gives bits=13, mask=0x1FFF)
bits = 0
size = avg_chunk_size
while size > 1:
bits += 1
size >>= 1
self.mask = (1 << bits) - 1
        # Polynomial constant for a full Rabin rolling hash; kept for reference
        # but not used by the simplified hash in _rolling_hash()
        self.poly = 0x3DA3358B4DC173
def chunk_file(self, file_path: Path, chunk_size: Optional[int] = None) -> Iterator[bytes]:
"""Chunk a file using Rabin fingerprinting
Args:
file_path: Path to file to chunk
chunk_size: If provided, use fixed-size chunking instead
Yields:
Chunk data as bytes
"""
if chunk_size:
# Use fixed-size chunking
yield from self._chunk_fixed(file_path, chunk_size)
else:
# Use content-defined chunking
yield from self._chunk_rabin(file_path)
def _chunk_fixed(self, file_path: Path, chunk_size: int) -> Iterator[bytes]:
"""Fixed-size chunking
Args:
file_path: Path to file
chunk_size: Chunk size in bytes
Yields:
Fixed-size chunks
"""
with open(file_path, 'rb') as f:
while True:
chunk = f.read(chunk_size)
if not chunk:
break
yield chunk
def _chunk_rabin(self, file_path: Path) -> Iterator[bytes]:
"""Content-defined chunking using Rabin fingerprinting
Args:
file_path: Path to file
Yields:
Variable-size chunks based on content
"""
with open(file_path, 'rb') as f:
chunk_data = bytearray()
window = bytearray()
hash_value = 0
while True:
byte = f.read(1)
if not byte:
# End of file - yield remaining data
if chunk_data:
yield bytes(chunk_data)
break
chunk_data.extend(byte)
window.extend(byte)
# Maintain window size
if len(window) > self.window_size:
window.pop(0)
# Update rolling hash
hash_value = self._rolling_hash(window)
# Check if we should create a boundary
should_break = (
len(chunk_data) >= self.min_chunk_size and
(
(hash_value & self.mask) == 0 or
len(chunk_data) >= self.max_chunk_size
)
)
if should_break:
yield bytes(chunk_data)
chunk_data = bytearray()
window = bytearray()
hash_value = 0
def _rolling_hash(self, window: bytearray) -> int:
"""Calculate rolling hash for window
Args:
window: Byte window
Returns:
Hash value
"""
hash_value = 0
for byte in window:
hash_value = ((hash_value << 1) + byte) & 0xFFFFFFFFFFFFFFFF
return hash_value
class SimpleChunker:
"""Simple fixed-size chunker for comparison"""
def __init__(self, chunk_size: int = 8192):
"""Initialize simple chunker
Args:
chunk_size: Fixed chunk size in bytes
"""
self.chunk_size = chunk_size
def chunk_file(self, file_path: Path) -> Iterator[bytes]:
"""Chunk file into fixed-size pieces
Args:
file_path: Path to file
Yields:
Fixed-size chunks
"""
with open(file_path, 'rb') as f:
while True:
chunk = f.read(self.chunk_size)
if not chunk:
break
yield chunk
def hash_chunk(chunk: bytes, algorithm: str = 'sha256') -> str:
"""Hash a chunk of data
Args:
chunk: Chunk data
algorithm: Hash algorithm (default: sha256)
Returns:
Hex digest of hash
"""
hasher = hashlib.new(algorithm)
hasher.update(chunk)
return hasher.hexdigest()
def hash_file(file_path: Path, algorithm: str = 'sha256', chunk_size: int = 65536) -> str:
"""Hash entire file
Args:
file_path: Path to file
algorithm: Hash algorithm (default: sha256)
chunk_size: Size of chunks to read
Returns:
Hex digest of file hash
"""
hasher = hashlib.new(algorithm)
with open(file_path, 'rb') as f:
while True:
chunk = f.read(chunk_size)
if not chunk:
break
hasher.update(chunk)
return hasher.hexdigest()
def compute_file_signature(
file_path: Path,
use_rabin: bool = True,
avg_chunk_size: int = 8192
) -> tuple[str, list[str]]:
"""Compute file signature with chunk hashes
Args:
file_path: Path to file
use_rabin: Whether to use Rabin chunking (vs fixed-size)
avg_chunk_size: Average chunk size for Rabin or fixed size
Returns:
Tuple of (file_hash, list of chunk hashes)
"""
if use_rabin:
chunker = RabinChunker(avg_chunk_size=avg_chunk_size)
else:
chunker = SimpleChunker(chunk_size=avg_chunk_size)
chunk_hashes = []
file_hasher = hashlib.sha256()
for chunk in chunker.chunk_file(file_path):
# Hash individual chunk
chunk_hash = hash_chunk(chunk)
chunk_hashes.append(chunk_hash)
# Update file hash
file_hasher.update(chunk)
file_hash = file_hasher.hexdigest()
return file_hash, chunk_hashes
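
As a rough illustration of why content-defined chunking is preferred here (a sketch only: the import path, file names, helper function and data sizes are made up), a small insertion at the front of a file shifts every fixed-size chunk but leaves most Rabin-defined chunks unchanged:

import os
import tempfile
from pathlib import Path

from app.deduplication.chunker import compute_file_signature  # assumed module path


def shared_ratio(a: list[str], b: list[str]) -> float:
    """Fraction of distinct chunk hashes in a that also occur in b (helper for this sketch)."""
    return len(set(a) & set(b)) / max(len(set(a)), 1)


with tempfile.TemporaryDirectory() as tmp:
    original = Path(tmp) / "original.bin"
    edited = Path(tmp) / "edited.bin"
    data = os.urandom(128 * 1024)
    original.write_bytes(data)
    edited.write_bytes(b"inserted" + data)  # 8 bytes added at the front

    _, rabin_a = compute_file_signature(original, use_rabin=True)
    _, rabin_b = compute_file_signature(edited, use_rabin=True)
    _, fixed_a = compute_file_signature(original, use_rabin=False)
    _, fixed_b = compute_file_signature(edited, use_rabin=False)

    print("rabin chunk reuse:", shared_ratio(rabin_a, rabin_b))  # usually close to 1.0
    print("fixed chunk reuse:", shared_ratio(fixed_a, fixed_b))  # usually 0.0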

app/deduplication/engine.py (new file, 353 lines)

@@ -0,0 +1,353 @@
"""Deduplication engine"""
from pathlib import Path
from typing import Optional, Callable
from concurrent.futures import ThreadPoolExecutor, as_completed
import psycopg2
from .chunker import compute_file_signature, hash_file
from .store import HashStore
from ..shared.models import ProcessingStats
from ..shared.config import DatabaseConfig, ProcessingConfig
from ..shared.logger import ProgressLogger
class DeduplicationEngine:
"""Engine for deduplicating files"""
def __init__(
self,
db_config: DatabaseConfig,
processing_config: ProcessingConfig,
logger: ProgressLogger
):
"""Initialize deduplication engine
Args:
db_config: Database configuration
processing_config: Processing configuration
logger: Progress logger
"""
self.db_config = db_config
self.processing_config = processing_config
self.logger = logger
self.hash_store = HashStore(db_config)
self._connection = None
def _get_connection(self):
"""Get or create database connection"""
if self._connection is None or self._connection.closed:
self._connection = psycopg2.connect(
host=self.db_config.host,
port=self.db_config.port,
database=self.db_config.database,
user=self.db_config.user,
password=self.db_config.password
)
return self._connection
def deduplicate_all(
self,
disk: Optional[str] = None,
use_chunks: bool = True,
progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
) -> ProcessingStats:
"""Deduplicate all files in database
Args:
disk: Optional disk filter
use_chunks: Whether to use chunk-level deduplication
progress_callback: Optional callback for progress updates
Returns:
ProcessingStats with deduplication statistics
"""
self.logger.section("Starting Deduplication")
conn = self._get_connection()
cursor = conn.cursor()
# Get files without checksums
if disk:
cursor.execute("""
SELECT path, size
FROM files
WHERE disk_label = %s AND checksum IS NULL
ORDER BY size DESC
""", (disk,))
else:
cursor.execute("""
SELECT path, size
FROM files
WHERE checksum IS NULL
ORDER BY size DESC
""")
files_to_process = cursor.fetchall()
total_files = len(files_to_process)
self.logger.info(f"Found {total_files} files to process")
stats = ProcessingStats()
# Process files with thread pool
with ThreadPoolExecutor(max_workers=self.processing_config.parallel_workers) as executor:
futures = {}
for path_str, size in files_to_process:
path = Path(path_str)
future = executor.submit(self._process_file, path, use_chunks)
futures[future] = (path, size)
# Process completed futures
for future in as_completed(futures):
path, size = futures[future]
try:
checksum, duplicate_of = future.result()
if checksum:
# Update database
cursor.execute("""
UPDATE files
SET checksum = %s, duplicate_of = %s
WHERE path = %s
""", (checksum, duplicate_of, str(path)))
stats.files_succeeded += 1
stats.bytes_processed += size
stats.files_processed += 1
# Commit periodically
if stats.files_processed % self.processing_config.commit_interval == 0:
conn.commit()
# Progress callback
if progress_callback:
progress_callback(stats.files_processed, total_files, stats)
# Log progress
self.logger.progress(
stats.files_processed,
total_files,
prefix="Files processed",
bytes_processed=stats.bytes_processed,
elapsed_seconds=stats.elapsed_seconds
)
except Exception as e:
self.logger.warning(f"Failed to process {path}: {e}")
stats.files_failed += 1
stats.files_processed += 1
# Final commit
conn.commit()
cursor.close()
self.logger.info(
f"Deduplication complete: {stats.files_succeeded}/{total_files} files, "
f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
)
return stats
def _process_file(
self,
path: Path,
use_chunks: bool
) -> tuple[Optional[str], Optional[str]]:
"""Process a single file for deduplication
Args:
path: Path to file
use_chunks: Whether to use chunk-level deduplication
Returns:
Tuple of (checksum, duplicate_of_path)
"""
if not path.exists():
return None, None
try:
if use_chunks:
# Compute file signature with chunks
checksum, chunk_hashes = compute_file_signature(
path,
use_rabin=True,
avg_chunk_size=self.processing_config.chunk_size
)
else:
# Just compute file hash
checksum = hash_file(
path,
algorithm=self.processing_config.hash_algorithm
)
chunk_hashes = None
# Check if hash exists
if self.hash_store.exists(checksum):
# Duplicate found
canonical_path = self.hash_store.get_canonical(checksum)
return checksum, canonical_path
else:
# New unique file
size = path.stat().st_size
self.hash_store.store_canonical(
checksum,
path,
size,
chunk_hashes
)
return checksum, None
except Exception as e:
self.logger.debug(f"Error processing {path}: {e}")
raise
def find_duplicates(
self,
disk: Optional[str] = None
) -> dict[str, list[str]]:
"""Find all duplicate files
Args:
disk: Optional disk filter
Returns:
Dictionary mapping canonical path to list of duplicate paths
"""
self.logger.subsection("Finding Duplicates")
conn = self._get_connection()
cursor = conn.cursor()
# Query for duplicates
if disk:
cursor.execute("""
SELECT checksum, array_agg(path ORDER BY path) as paths
FROM files
WHERE disk_label = %s AND checksum IS NOT NULL
GROUP BY checksum
HAVING COUNT(*) > 1
""", (disk,))
else:
cursor.execute("""
SELECT checksum, array_agg(path ORDER BY path) as paths
FROM files
WHERE checksum IS NOT NULL
GROUP BY checksum
HAVING COUNT(*) > 1
""")
duplicates = {}
for checksum, paths in cursor.fetchall():
canonical = paths[0]
duplicates[canonical] = paths[1:]
cursor.close()
self.logger.info(f"Found {len(duplicates)} sets of duplicates")
return duplicates
def get_deduplication_stats(self) -> dict:
"""Get deduplication statistics
Returns:
Dictionary with statistics
"""
conn = self._get_connection()
cursor = conn.cursor()
stats = {}
# Total files
cursor.execute("SELECT COUNT(*) FROM files WHERE checksum IS NOT NULL")
stats['total_files'] = cursor.fetchone()[0]
# Unique files
cursor.execute("SELECT COUNT(DISTINCT checksum) FROM files WHERE checksum IS NOT NULL")
stats['unique_files'] = cursor.fetchone()[0]
# Duplicate files
stats['duplicate_files'] = stats['total_files'] - stats['unique_files']
# Total size
cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE checksum IS NOT NULL")
stats['total_size'] = cursor.fetchone()[0]
# Unique size
cursor.execute("""
SELECT COALESCE(SUM(size), 0)
FROM (
SELECT DISTINCT ON (checksum) size
FROM files
WHERE checksum IS NOT NULL
) AS unique_files
""")
stats['unique_size'] = cursor.fetchone()[0]
# Wasted space
stats['wasted_space'] = stats['total_size'] - stats['unique_size']
# Deduplication ratio
if stats['total_size'] > 0:
stats['dedup_ratio'] = stats['unique_size'] / stats['total_size']
else:
stats['dedup_ratio'] = 1.0
# Space saved percentage
if stats['total_size'] > 0:
stats['space_saved_percent'] = (stats['wasted_space'] / stats['total_size']) * 100
else:
stats['space_saved_percent'] = 0.0
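        # Illustrative numbers: total_size 100 GB with unique_size 70 GB gives
        # wasted_space 30 GB, dedup_ratio 0.7, space_saved_percent 30.0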
cursor.close()
return stats
def mark_canonical_files(self) -> int:
"""Mark canonical (first occurrence) files in database
Returns:
Number of canonical files marked
"""
self.logger.subsection("Marking Canonical Files")
conn = self._get_connection()
cursor = conn.cursor()
# Find first occurrence of each checksum and mark as canonical
cursor.execute("""
WITH canonical AS (
SELECT DISTINCT ON (checksum) path, checksum
FROM files
WHERE checksum IS NOT NULL
ORDER BY checksum, path
)
UPDATE files
SET duplicate_of = NULL
WHERE path IN (SELECT path FROM canonical)
""")
count = cursor.rowcount
conn.commit()
cursor.close()
self.logger.info(f"Marked {count} canonical files")
return count
def close(self):
"""Close connections"""
self.hash_store.close()
if self._connection and not self._connection.closed:
self._connection.close()
def __enter__(self):
"""Context manager entry"""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit"""
self.close()
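
A hedged usage sketch of the engine as a context manager. The DatabaseConfig, ProcessingConfig and ProgressLogger constructors are not part of this file, so the import paths and keyword arguments below are assumptions based only on the attribute names the engine reads:

from app.shared.config import DatabaseConfig, ProcessingConfig  # assumed paths
from app.shared.logger import ProgressLogger
from app.deduplication import DeduplicationEngine

# Field names taken from the attributes used above; constructor signatures assumed
db_config = DatabaseConfig(host="localhost", port=5432, database="catalog",
                           user="catalog", password="secret")
processing_config = ProcessingConfig(parallel_workers=4, commit_interval=100,
                                     chunk_size=8192, hash_algorithm="sha256")

with DeduplicationEngine(db_config, processing_config, ProgressLogger()) as engine:
    stats = engine.deduplicate_all(use_chunks=True)
    duplicates = engine.find_duplicates()
    summary = engine.get_deduplication_stats()
    print(f"{summary['duplicate_files']} duplicate files, "
          f"{summary['space_saved_percent']:.1f}% of scanned bytes reclaimable")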

app/deduplication/store.py (new file, 412 lines)

@@ -0,0 +1,412 @@
"""Hash store for deduplication with optional Redis support"""
from typing import Optional, Dict, Set
from pathlib import Path
import psycopg2
from psycopg2.extras import execute_batch
from ..shared.config import DatabaseConfig
class HashStore:
"""PostgreSQL-based hash store for deduplication"""
def __init__(self, db_config: DatabaseConfig):
"""Initialize hash store
Args:
db_config: Database configuration
"""
self.db_config = db_config
self._connection = None
def _get_connection(self):
"""Get or create database connection"""
if self._connection is None or self._connection.closed:
self._connection = psycopg2.connect(
host=self.db_config.host,
port=self.db_config.port,
database=self.db_config.database,
user=self.db_config.user,
password=self.db_config.password
)
return self._connection
def _ensure_tables(self):
"""Ensure hash store tables exist"""
conn = self._get_connection()
cursor = conn.cursor()
# Create hashes table for file-level deduplication
cursor.execute("""
CREATE TABLE IF NOT EXISTS file_hashes (
checksum TEXT PRIMARY KEY,
canonical_path TEXT NOT NULL,
size BIGINT NOT NULL,
first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
ref_count INTEGER DEFAULT 1
)
""")
# Create chunk hashes table for chunk-level deduplication
cursor.execute("""
CREATE TABLE IF NOT EXISTS chunk_hashes (
chunk_hash TEXT PRIMARY KEY,
size INTEGER NOT NULL,
first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
ref_count INTEGER DEFAULT 1
)
""")
# Create file-chunk mapping table
cursor.execute("""
CREATE TABLE IF NOT EXISTS file_chunks (
id SERIAL PRIMARY KEY,
file_checksum TEXT NOT NULL,
chunk_hash TEXT NOT NULL,
chunk_index INTEGER NOT NULL,
FOREIGN KEY (file_checksum) REFERENCES file_hashes(checksum),
FOREIGN KEY (chunk_hash) REFERENCES chunk_hashes(chunk_hash),
UNIQUE (file_checksum, chunk_index)
)
""")
# Create indexes
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_file_chunks_file
ON file_chunks(file_checksum)
""")
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_file_chunks_chunk
ON file_chunks(chunk_hash)
""")
conn.commit()
cursor.close()
def exists(self, checksum: str) -> bool:
"""Check if hash exists in store
Args:
checksum: File hash to check
Returns:
True if hash exists
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute(
"SELECT 1 FROM file_hashes WHERE checksum = %s LIMIT 1",
(checksum,)
)
exists = cursor.fetchone() is not None
cursor.close()
return exists
def get_canonical(self, checksum: str) -> Optional[str]:
"""Get canonical path for a hash
Args:
checksum: File hash
Returns:
Canonical file path or None if not found
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute(
"SELECT canonical_path FROM file_hashes WHERE checksum = %s",
(checksum,)
)
result = cursor.fetchone()
cursor.close()
return result[0] if result else None
def store_canonical(
self,
checksum: str,
path: Path,
size: int,
chunk_hashes: Optional[list[str]] = None
) -> None:
"""Store canonical reference for a hash
Args:
checksum: File hash
path: Canonical file path
size: File size in bytes
chunk_hashes: Optional list of chunk hashes
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
try:
# Store file hash
cursor.execute("""
INSERT INTO file_hashes (checksum, canonical_path, size)
VALUES (%s, %s, %s)
ON CONFLICT (checksum) DO UPDATE SET
ref_count = file_hashes.ref_count + 1
""", (checksum, str(path), size))
# Store chunk hashes if provided
if chunk_hashes:
                # Insert chunk hashes (chunk sizes are not tracked at this point,
                # so 0 is stored as a placeholder)
                chunk_data = [(chunk_hash, 0) for chunk_hash in chunk_hashes]
execute_batch(cursor, """
INSERT INTO chunk_hashes (chunk_hash, size)
VALUES (%s, %s)
ON CONFLICT (chunk_hash) DO UPDATE SET
ref_count = chunk_hashes.ref_count + 1
""", chunk_data, page_size=1000)
# Create file-chunk mappings
mapping_data = [
(checksum, chunk_hash, idx)
for idx, chunk_hash in enumerate(chunk_hashes)
]
execute_batch(cursor, """
INSERT INTO file_chunks (file_checksum, chunk_hash, chunk_index)
VALUES (%s, %s, %s)
ON CONFLICT (file_checksum, chunk_index) DO NOTHING
""", mapping_data, page_size=1000)
conn.commit()
except Exception as e:
conn.rollback()
raise
finally:
cursor.close()
def get_chunk_hashes(self, checksum: str) -> list[str]:
"""Get chunk hashes for a file
Args:
checksum: File hash
Returns:
List of chunk hashes in order
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute("""
SELECT chunk_hash
FROM file_chunks
WHERE file_checksum = %s
ORDER BY chunk_index
""", (checksum,))
chunk_hashes = [row[0] for row in cursor.fetchall()]
cursor.close()
return chunk_hashes
def get_duplicates(self) -> Dict[str, list[str]]:
"""Get all duplicate file groups
Returns:
Dictionary mapping canonical path to list of duplicate paths
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
# Get all files with their hashes
cursor.execute("""
SELECT f.path, f.checksum
FROM files f
WHERE f.checksum IS NOT NULL
""")
# Group by checksum
hash_to_paths: Dict[str, list[str]] = {}
for path, checksum in cursor.fetchall():
if checksum not in hash_to_paths:
hash_to_paths[checksum] = []
hash_to_paths[checksum].append(path)
cursor.close()
# Filter to only duplicates (more than one file)
duplicates = {
paths[0]: paths[1:]
for checksum, paths in hash_to_paths.items()
if len(paths) > 1
}
return duplicates
def get_stats(self) -> Dict[str, int]:
"""Get hash store statistics
Returns:
Dictionary with statistics
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
stats = {}
# Count unique file hashes
cursor.execute("SELECT COUNT(*) FROM file_hashes")
stats['unique_files'] = cursor.fetchone()[0]
# Count unique chunk hashes
cursor.execute("SELECT COUNT(*) FROM chunk_hashes")
stats['unique_chunks'] = cursor.fetchone()[0]
# Count total references
cursor.execute("SELECT COALESCE(SUM(ref_count), 0) FROM file_hashes")
stats['total_file_refs'] = cursor.fetchone()[0]
# Count total chunk references
cursor.execute("SELECT COALESCE(SUM(ref_count), 0) FROM chunk_hashes")
stats['total_chunk_refs'] = cursor.fetchone()[0]
# Calculate deduplication ratio
if stats['total_file_refs'] > 0:
stats['dedup_ratio'] = stats['unique_files'] / stats['total_file_refs']
else:
stats['dedup_ratio'] = 1.0
cursor.close()
return stats
def find_similar_files(self, checksum: str, threshold: float = 0.8) -> list[tuple[str, float]]:
"""Find files similar to given hash based on chunk overlap
Args:
checksum: File hash to compare
threshold: Similarity threshold (0.0 to 1.0)
Returns:
List of tuples (other_checksum, similarity_score)
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
# Get chunks for the target file
target_chunks = set(self.get_chunk_hashes(checksum))
if not target_chunks:
cursor.close()
return []
# Find files sharing chunks
cursor.execute("""
SELECT DISTINCT fc.file_checksum
FROM file_chunks fc
WHERE fc.chunk_hash = ANY(%s)
AND fc.file_checksum != %s
""", (list(target_chunks), checksum))
similar_files = []
for (other_checksum,) in cursor.fetchall():
other_chunks = set(self.get_chunk_hashes(other_checksum))
# Calculate Jaccard similarity
intersection = len(target_chunks & other_chunks)
union = len(target_chunks | other_chunks)
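            # Jaccard similarity; e.g. 6 shared chunks out of 8 distinct overall -> 0.75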
if union > 0:
similarity = intersection / union
if similarity >= threshold:
similar_files.append((other_checksum, similarity))
cursor.close()
# Sort by similarity descending
similar_files.sort(key=lambda x: x[1], reverse=True)
return similar_files
def close(self):
"""Close database connection"""
if self._connection and not self._connection.closed:
self._connection.close()
def __enter__(self):
"""Context manager entry"""
self._ensure_tables()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit"""
self.close()
class MemoryHashStore:
"""In-memory hash store for testing and small datasets"""
def __init__(self):
"""Initialize in-memory hash store"""
self.hashes: Dict[str, tuple[str, int]] = {}
self.chunks: Dict[str, int] = {}
self.file_chunks: Dict[str, list[str]] = {}
def exists(self, checksum: str) -> bool:
"""Check if hash exists"""
return checksum in self.hashes
def get_canonical(self, checksum: str) -> Optional[str]:
"""Get canonical path"""
return self.hashes.get(checksum, (None, 0))[0]
def store_canonical(
self,
checksum: str,
path: Path,
size: int,
chunk_hashes: Optional[list[str]] = None
) -> None:
"""Store canonical reference"""
self.hashes[checksum] = (str(path), size)
if chunk_hashes:
self.file_chunks[checksum] = chunk_hashes
for chunk_hash in chunk_hashes:
self.chunks[chunk_hash] = self.chunks.get(chunk_hash, 0) + 1
def get_chunk_hashes(self, checksum: str) -> list[str]:
"""Get chunk hashes"""
return self.file_chunks.get(checksum, [])
def get_stats(self) -> Dict[str, int]:
"""Get statistics"""
return {
'unique_files': len(self.hashes),
'unique_chunks': len(self.chunks),
'total_file_refs': len(self.hashes),
'total_chunk_refs': sum(self.chunks.values()),
'dedup_ratio': 1.0
}
def close(self):
"""No-op for compatibility"""
pass
def __enter__(self):
"""Context manager entry"""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit"""
pass
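
MemoryHashStore mirrors the subset of the HashStore API that the engine relies on, which makes it convenient for tests and small runs. A minimal sketch (only the import path is assumed; the checksum, path, and chunk hashes are fabricated for illustration):

from pathlib import Path

from app.deduplication import MemoryHashStore  # assumed import path

with MemoryHashStore() as store:
    checksum = "ab" * 32  # stand-in for a sha256 hex digest
    if not store.exists(checksum):
        store.store_canonical(checksum, Path("/data/a.bin"), size=1024,
                              chunk_hashes=["c1", "c2", "c3"])
    # a later file with the same checksum resolves to the canonical path
    assert store.get_canonical(checksum) == str(Path("/data/a.bin"))
    assert store.get_chunk_hashes(checksum) == ["c1", "c2", "c3"]
    print(store.get_stats())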