clean up code

mike
2025-12-13 12:00:34 +01:00
parent 2b2c575385
commit 7ce8c8c73d
10 changed files with 158 additions and 1471 deletions

View File

@@ -1,21 +1,4 @@
"""Deduplication package exports"""
from .chunker import (
RabinChunker,
SimpleChunker,
hash_chunk,
hash_file,
compute_file_signature
)
from .chunker import RabinChunker, SimpleChunker, hash_chunk, hash_file, compute_file_signature
from .store import HashStore, MemoryHashStore
from .engine import DeduplicationEngine
__all__ = [
'RabinChunker',
'SimpleChunker',
'hash_chunk',
'hash_file',
'compute_file_signature',
'HashStore',
'MemoryHashStore',
'DeduplicationEngine',
]
__all__ = ['RabinChunker', 'SimpleChunker', 'hash_chunk', 'hash_file', 'compute_file_signature', 'HashStore', 'MemoryHashStore', 'DeduplicationEngine']
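The condensed export list above is the package's public surface. A minimal usage sketch, assuming the package is importable as `dedup` (the diff does not show the actual package path):

# Illustrative only: the top-level package name `dedup` and the input file are assumptions.
from pathlib import Path
from dedup import compute_file_signature, hash_file

path = Path('example.bin')                               # hypothetical input file
print(hash_file(path))                                   # whole-file sha256 digest
file_hash, chunk_hashes = compute_file_signature(path)   # file hash plus per-chunk hashes
print(file_hash, len(chunk_hashes))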

View File

@@ -0,0 +1 @@

View File

@@ -1,75 +1,29 @@
"""Rabin fingerprint chunker for content-defined chunking"""
import hashlib
from pathlib import Path
from typing import Iterator, Optional
class RabinChunker:
"""Content-defined chunking using Rabin fingerprinting
Uses a rolling hash to identify chunk boundaries based on content,
allowing for efficient deduplication even when data is modified.
"""
def __init__(
self,
avg_chunk_size: int = 8192,
min_chunk_size: Optional[int] = None,
max_chunk_size: Optional[int] = None,
window_size: int = 48
):
"""Initialize Rabin chunker
Args:
avg_chunk_size: Target average chunk size in bytes
min_chunk_size: Minimum chunk size (default: avg_chunk_size // 4)
max_chunk_size: Maximum chunk size (default: avg_chunk_size * 8)
window_size: Rolling hash window size
"""
def __init__(self, avg_chunk_size: int=8192, min_chunk_size: Optional[int]=None, max_chunk_size: Optional[int]=None, window_size: int=48):
self.avg_chunk_size = avg_chunk_size
self.min_chunk_size = min_chunk_size or (avg_chunk_size // 4)
self.max_chunk_size = max_chunk_size or (avg_chunk_size * 8)
self.min_chunk_size = min_chunk_size or avg_chunk_size // 4
self.max_chunk_size = max_chunk_size or avg_chunk_size * 8
self.window_size = window_size
# Calculate mask for boundary detection
# For avg_chunk_size, we want boundaries at 1/avg_chunk_size probability
bits = 0
size = avg_chunk_size
while size > 1:
bits += 1
size >>= 1
self.mask = (1 << bits) - 1
self.poly = 17349423945073011
# Polynomial for rolling hash (prime number)
self.poly = 0x3DA3358B4DC173
def chunk_file(self, file_path: Path, chunk_size: Optional[int] = None) -> Iterator[bytes]:
"""Chunk a file using Rabin fingerprinting
Args:
file_path: Path to file to chunk
chunk_size: If provided, use fixed-size chunking instead
Yields:
Chunk data as bytes
"""
def chunk_file(self, file_path: Path, chunk_size: Optional[int]=None) -> Iterator[bytes]:
if chunk_size:
# Use fixed-size chunking
yield from self._chunk_fixed(file_path, chunk_size)
else:
# Use content-defined chunking
yield from self._chunk_rabin(file_path)
def _chunk_fixed(self, file_path: Path, chunk_size: int) -> Iterator[bytes]:
"""Fixed-size chunking
Args:
file_path: Path to file
chunk_size: Chunk size in bytes
Yields:
Fixed-size chunks
"""
with open(file_path, 'rb') as f:
while True:
chunk = f.read(chunk_size)
@@ -78,46 +32,22 @@ class RabinChunker:
yield chunk
def _chunk_rabin(self, file_path: Path) -> Iterator[bytes]:
"""Content-defined chunking using Rabin fingerprinting
Args:
file_path: Path to file
Yields:
Variable-size chunks based on content
"""
with open(file_path, 'rb') as f:
chunk_data = bytearray()
window = bytearray()
hash_value = 0
while True:
byte = f.read(1)
if not byte:
# End of file - yield remaining data
if chunk_data:
yield bytes(chunk_data)
break
chunk_data.extend(byte)
window.extend(byte)
# Maintain window size
if len(window) > self.window_size:
window.pop(0)
# Update rolling hash
hash_value = self._rolling_hash(window)
# Check if we should create a boundary
should_break = (
len(chunk_data) >= self.min_chunk_size and
(
(hash_value & self.mask) == 0 or
len(chunk_data) >= self.max_chunk_size
)
)
should_break = len(chunk_data) >= self.min_chunk_size and (hash_value & self.mask == 0 or len(chunk_data) >= self.max_chunk_size)
if should_break:
yield bytes(chunk_data)
chunk_data = bytearray()
@@ -125,40 +55,17 @@ class RabinChunker:
hash_value = 0
def _rolling_hash(self, window: bytearray) -> int:
"""Calculate rolling hash for window
Args:
window: Byte window
Returns:
Hash value
"""
hash_value = 0
for byte in window:
hash_value = ((hash_value << 1) + byte) & 0xFFFFFFFFFFFFFFFF
hash_value = (hash_value << 1) + byte & 18446744073709551615
return hash_value
class SimpleChunker:
"""Simple fixed-size chunker for comparison"""
def __init__(self, chunk_size: int = 8192):
"""Initialize simple chunker
Args:
chunk_size: Fixed chunk size in bytes
"""
def __init__(self, chunk_size: int=8192):
self.chunk_size = chunk_size
def chunk_file(self, file_path: Path) -> Iterator[bytes]:
"""Chunk file into fixed-size pieces
Args:
file_path: Path to file
Yields:
Fixed-size chunks
"""
with open(file_path, 'rb') as f:
while True:
chunk = f.read(self.chunk_size)
@@ -166,76 +73,31 @@ class SimpleChunker:
break
yield chunk
def hash_chunk(chunk: bytes, algorithm: str = 'sha256') -> str:
"""Hash a chunk of data
Args:
chunk: Chunk data
algorithm: Hash algorithm (default: sha256)
Returns:
Hex digest of hash
"""
def hash_chunk(chunk: bytes, algorithm: str='sha256') -> str:
hasher = hashlib.new(algorithm)
hasher.update(chunk)
return hasher.hexdigest()
def hash_file(file_path: Path, algorithm: str = 'sha256', chunk_size: int = 65536) -> str:
"""Hash entire file
Args:
file_path: Path to file
algorithm: Hash algorithm (default: sha256)
chunk_size: Size of chunks to read
Returns:
Hex digest of file hash
"""
def hash_file(file_path: Path, algorithm: str='sha256', chunk_size: int=65536) -> str:
hasher = hashlib.new(algorithm)
with open(file_path, 'rb') as f:
while True:
chunk = f.read(chunk_size)
if not chunk:
break
hasher.update(chunk)
return hasher.hexdigest()
def compute_file_signature(
file_path: Path,
use_rabin: bool = True,
avg_chunk_size: int = 8192
) -> tuple[str, list[str]]:
"""Compute file signature with chunk hashes
Args:
file_path: Path to file
use_rabin: Whether to use Rabin chunking (vs fixed-size)
avg_chunk_size: Average chunk size for Rabin or fixed size
Returns:
Tuple of (file_hash, list of chunk hashes)
"""
def compute_file_signature(file_path: Path, use_rabin: bool=True, avg_chunk_size: int=8192) -> tuple[str, list[str]]:
if use_rabin:
chunker = RabinChunker(avg_chunk_size=avg_chunk_size)
else:
chunker = SimpleChunker(chunk_size=avg_chunk_size)
chunk_hashes = []
file_hasher = hashlib.sha256()
for chunk in chunker.chunk_file(file_path):
# Hash individual chunk
chunk_hash = hash_chunk(chunk)
chunk_hashes.append(chunk_hash)
# Update file hash
file_hasher.update(chunk)
file_hash = file_hasher.hexdigest()
return file_hash, chunk_hashes
return (file_hash, chunk_hashes)
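A short sketch of the chunker API above, illustrating the boundary-mask reasoning: for avg_chunk_size = 8192 the constructor derives a 13-bit mask, so a cut point fires with probability roughly 1/8192 per byte once min_chunk_size is reached. The `dedup.chunker` import path and the throwaway temp file are assumptions:

# Sketch only: the `dedup.chunker` import path and the temp-file setup are assumptions.
import os
import tempfile
from pathlib import Path

from dedup.chunker import RabinChunker, compute_file_signature, hash_chunk

fd, name = tempfile.mkstemp()
os.write(fd, os.urandom(200_000))   # random content so boundaries are spread out
os.close(fd)
path = Path(name)

chunker = RabinChunker(avg_chunk_size=8192)
# 8192 == 2**13, so the boundary mask has 13 bits; a chunk ends when the low
# 13 bits of the rolling hash are zero (and min_chunk_size has been reached).
assert chunker.mask == (1 << 13) - 1

chunks = list(chunker.chunk_file(path))
sizes = [len(c) for c in chunks]
print(f'{len(chunks)} chunks, average {sum(sizes) // len(sizes)} bytes')

# compute_file_signature re-chunks deterministically, so its chunk hashes
# line up with hashing the chunks yielded above.
file_hash, chunk_hashes = compute_file_signature(path, use_rabin=True, avg_chunk_size=8192)
assert chunk_hashes == [hash_chunk(c) for c in chunks]

path.unlink()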

View File

@@ -1,32 +1,16 @@
"""Deduplication engine"""
from pathlib import Path
from typing import Optional, Callable
from concurrent.futures import ThreadPoolExecutor, as_completed
import psycopg2
from .chunker import compute_file_signature, hash_file
from .store import HashStore
from ..shared.models import FileRecord, ProcessingStats
from ..shared.config import DatabaseConfig, ProcessingConfig
from ..shared.logger import ProgressLogger
class DeduplicationEngine:
"""Engine for deduplicating files"""
def __init__(
self,
db_config: DatabaseConfig,
processing_config: ProcessingConfig,
logger: ProgressLogger
):
"""Initialize deduplication engine
Args:
db_config: Database configuration
processing_config: Processing configuration
logger: Progress logger
"""
def __init__(self, db_config: DatabaseConfig, processing_config: ProcessingConfig, logger: ProgressLogger):
self.db_config = db_config
self.processing_config = processing_config
self.logger = logger
@@ -34,320 +18,130 @@ class DeduplicationEngine:
self._connection = None
def _get_connection(self):
"""Get or create database connection"""
if self._connection is None or self._connection.closed:
self._connection = psycopg2.connect(
host=self.db_config.host,
port=self.db_config.port,
database=self.db_config.database,
user=self.db_config.user,
password=self.db_config.password
)
self._connection = psycopg2.connect(host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password)
return self._connection
def deduplicate_all(
self,
disk: Optional[str] = None,
use_chunks: bool = True,
progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
) -> ProcessingStats:
"""Deduplicate all files in database
Args:
disk: Optional disk filter
use_chunks: Whether to use chunk-level deduplication
progress_callback: Optional callback for progress updates
Returns:
ProcessingStats with deduplication statistics
"""
self.logger.section("Starting Deduplication")
def deduplicate_all(self, disk: Optional[str]=None, use_chunks: bool=True, progress_callback: Optional[Callable[[int, int, ProcessingStats], None]]=None) -> ProcessingStats:
self.logger.section('Starting Deduplication')
conn = self._get_connection()
cursor = conn.cursor()
# Get files without checksums
if disk:
cursor.execute("""
SELECT path, size
FROM files
WHERE disk_label = %s AND checksum IS NULL
ORDER BY size DESC
""", (disk,))
cursor.execute('\n SELECT path, size\n FROM files\n WHERE disk_label = %s AND checksum IS NULL\n ORDER BY size DESC\n ', (disk,))
else:
cursor.execute("""
SELECT path, size
FROM files
WHERE checksum IS NULL
ORDER BY size DESC
""")
cursor.execute('\n SELECT path, size\n FROM files\n WHERE checksum IS NULL\n ORDER BY size DESC\n ')
files_to_process = cursor.fetchall()
total_files = len(files_to_process)
self.logger.info(f"Found {total_files} files to process")
self.logger.info(f'Found {total_files} files to process')
stats = ProcessingStats()
# Process files with thread pool
with ThreadPoolExecutor(max_workers=self.processing_config.parallel_workers) as executor:
futures = {}
for path_str, size in files_to_process:
path = Path(path_str)
future = executor.submit(self._process_file, path, use_chunks)
futures[future] = (path, size)
# Process completed futures
for future in as_completed(futures):
path, size = futures[future]
try:
checksum, duplicate_of = future.result()
if checksum:
# Update database
cursor.execute("""
UPDATE files
SET checksum = %s, duplicate_of = %s
WHERE path = %s
""", (checksum, duplicate_of, str(path)))
cursor.execute('\n UPDATE files\n SET checksum = %s, duplicate_of = %s\n WHERE path = %s\n ', (checksum, duplicate_of, str(path)))
stats.files_succeeded += 1
stats.bytes_processed += size
stats.files_processed += 1
# Commit periodically
if stats.files_processed % self.processing_config.commit_interval == 0:
conn.commit()
# Progress callback
if progress_callback:
progress_callback(stats.files_processed, total_files, stats)
# Log progress
self.logger.progress(
stats.files_processed,
total_files,
prefix="Files processed",
bytes_processed=stats.bytes_processed,
elapsed_seconds=stats.elapsed_seconds
)
self.logger.progress(stats.files_processed, total_files, prefix='Files processed', bytes_processed=stats.bytes_processed, elapsed_seconds=stats.elapsed_seconds)
except Exception as e:
self.logger.warning(f"Failed to process {path}: {e}")
self.logger.warning(f'Failed to process {path}: {e}')
stats.files_failed += 1
stats.files_processed += 1
# Final commit
conn.commit()
cursor.close()
self.logger.info(
f"Deduplication complete: {stats.files_succeeded}/{total_files} files, "
f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
)
self.logger.info(f'Deduplication complete: {stats.files_succeeded}/{total_files} files, {stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s')
return stats
def _process_file(
self,
path: Path,
use_chunks: bool
) -> tuple[Optional[str], Optional[str]]:
"""Process a single file for deduplication
Args:
path: Path to file
use_chunks: Whether to use chunk-level deduplication
Returns:
Tuple of (checksum, duplicate_of_path)
"""
def _process_file(self, path: Path, use_chunks: bool) -> tuple[Optional[str], Optional[str]]:
if not path.exists():
return None, None
return (None, None)
try:
if use_chunks:
# Compute file signature with chunks
checksum, chunk_hashes = compute_file_signature(
path,
use_rabin=True,
avg_chunk_size=self.processing_config.chunk_size
)
checksum, chunk_hashes = compute_file_signature(path, use_rabin=True, avg_chunk_size=self.processing_config.chunk_size)
else:
# Just compute file hash
checksum = hash_file(
path,
algorithm=self.processing_config.hash_algorithm
)
checksum = hash_file(path, algorithm=self.processing_config.hash_algorithm)
chunk_hashes = None
# Check if hash exists
if self.hash_store.exists(checksum):
# Duplicate found
canonical_path = self.hash_store.get_canonical(checksum)
return checksum, canonical_path
return (checksum, canonical_path)
else:
# New unique file
size = path.stat().st_size
self.hash_store.store_canonical(
checksum,
path,
size,
chunk_hashes
)
return checksum, None
self.hash_store.store_canonical(checksum, path, size, chunk_hashes)
return (checksum, None)
except Exception as e:
self.logger.debug(f"Error processing {path}: {e}")
self.logger.debug(f'Error processing {path}: {e}')
raise
def find_duplicates(
self,
disk: Optional[str] = None
) -> dict[str, list[str]]:
"""Find all duplicate files
Args:
disk: Optional disk filter
Returns:
Dictionary mapping canonical path to list of duplicate paths
"""
self.logger.subsection("Finding Duplicates")
def find_duplicates(self, disk: Optional[str]=None) -> dict[str, list[str]]:
self.logger.subsection('Finding Duplicates')
conn = self._get_connection()
cursor = conn.cursor()
# Query for duplicates
if disk:
cursor.execute("""
SELECT checksum, array_agg(path ORDER BY path) as paths
FROM files
WHERE disk_label = %s AND checksum IS NOT NULL
GROUP BY checksum
HAVING COUNT(*) > 1
""", (disk,))
cursor.execute('\n SELECT checksum, array_agg(path ORDER BY path) as paths\n FROM files\n WHERE disk_label = %s AND checksum IS NOT NULL\n GROUP BY checksum\n HAVING COUNT(*) > 1\n ', (disk,))
else:
cursor.execute("""
SELECT checksum, array_agg(path ORDER BY path) as paths
FROM files
WHERE checksum IS NOT NULL
GROUP BY checksum
HAVING COUNT(*) > 1
""")
cursor.execute('\n SELECT checksum, array_agg(path ORDER BY path) as paths\n FROM files\n WHERE checksum IS NOT NULL\n GROUP BY checksum\n HAVING COUNT(*) > 1\n ')
duplicates = {}
for checksum, paths in cursor.fetchall():
canonical = paths[0]
duplicates[canonical] = paths[1:]
cursor.close()
self.logger.info(f"Found {len(duplicates)} sets of duplicates")
self.logger.info(f'Found {len(duplicates)} sets of duplicates')
return duplicates
def get_deduplication_stats(self) -> dict:
"""Get deduplication statistics
Returns:
Dictionary with statistics
"""
conn = self._get_connection()
cursor = conn.cursor()
stats = {}
# Total files
cursor.execute("SELECT COUNT(*) FROM files WHERE checksum IS NOT NULL")
cursor.execute('SELECT COUNT(*) FROM files WHERE checksum IS NOT NULL')
stats['total_files'] = cursor.fetchone()[0]
# Unique files
cursor.execute("SELECT COUNT(DISTINCT checksum) FROM files WHERE checksum IS NOT NULL")
cursor.execute('SELECT COUNT(DISTINCT checksum) FROM files WHERE checksum IS NOT NULL')
stats['unique_files'] = cursor.fetchone()[0]
# Duplicate files
stats['duplicate_files'] = stats['total_files'] - stats['unique_files']
# Total size
cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE checksum IS NOT NULL")
cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files WHERE checksum IS NOT NULL')
stats['total_size'] = cursor.fetchone()[0]
# Unique size
cursor.execute("""
SELECT COALESCE(SUM(size), 0)
FROM (
SELECT DISTINCT ON (checksum) size
FROM files
WHERE checksum IS NOT NULL
) AS unique_files
""")
cursor.execute('\n SELECT COALESCE(SUM(size), 0)\n FROM (\n SELECT DISTINCT ON (checksum) size\n FROM files\n WHERE checksum IS NOT NULL\n ) AS unique_files\n ')
stats['unique_size'] = cursor.fetchone()[0]
# Wasted space
stats['wasted_space'] = stats['total_size'] - stats['unique_size']
# Deduplication ratio
if stats['total_size'] > 0:
stats['dedup_ratio'] = stats['unique_size'] / stats['total_size']
else:
stats['dedup_ratio'] = 1.0
# Space saved percentage
if stats['total_size'] > 0:
stats['space_saved_percent'] = (stats['wasted_space'] / stats['total_size']) * 100
stats['space_saved_percent'] = stats['wasted_space'] / stats['total_size'] * 100
else:
stats['space_saved_percent'] = 0.0
cursor.close()
return stats
def mark_canonical_files(self) -> int:
"""Mark canonical (first occurrence) files in database
Returns:
Number of canonical files marked
"""
self.logger.subsection("Marking Canonical Files")
self.logger.subsection('Marking Canonical Files')
conn = self._get_connection()
cursor = conn.cursor()
# Find first occurrence of each checksum and mark as canonical
cursor.execute("""
WITH canonical AS (
SELECT DISTINCT ON (checksum) path, checksum
FROM files
WHERE checksum IS NOT NULL
ORDER BY checksum, path
)
UPDATE files
SET duplicate_of = NULL
WHERE path IN (SELECT path FROM canonical)
""")
cursor.execute('\n WITH canonical AS (\n SELECT DISTINCT ON (checksum) path, checksum\n FROM files\n WHERE checksum IS NOT NULL\n ORDER BY checksum, path\n )\n UPDATE files\n SET duplicate_of = NULL\n WHERE path IN (SELECT path FROM canonical)\n ')
count = cursor.rowcount
conn.commit()
cursor.close()
self.logger.info(f"Marked {count} canonical files")
self.logger.info(f'Marked {count} canonical files')
return count
def close(self):
"""Close connections"""
self.hash_store.close()
if self._connection and not self._connection.closed:
if self._connection and (not self._connection.closed):
self._connection.close()
def __enter__(self):
"""Context manager entry"""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit"""
self.close()
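For context, a hypothetical wiring of the engine above. The constructors of DatabaseConfig, ProcessingConfig and ProgressLogger and their import paths are assumptions (the shared package is not part of this diff); only the DeduplicationEngine calls are taken from the code:

# Hypothetical setup: config/logger constructors and import paths are assumed.
from dedup.engine import DeduplicationEngine
from shared.config import DatabaseConfig, ProcessingConfig   # path assumed
from shared.logger import ProgressLogger                     # path assumed

db_config = DatabaseConfig(host='localhost', port=5432, database='catalog',
                           user='dedup', password='secret')  # field names match _get_connection()
processing_config = ProcessingConfig()                       # defaults assumed
logger = ProgressLogger()                                     # constructor assumed

with DeduplicationEngine(db_config, processing_config, logger) as engine:
    # Hash every file whose checksum IS NULL, with chunk-level deduplication.
    stats = engine.deduplicate_all(use_chunks=True)

    # Canonical path -> list of duplicate paths.
    duplicates = engine.find_duplicates()

    # space_saved_percent = wasted_space / total_size * 100,
    # where wasted_space = total_size - unique_size.
    summary = engine.get_deduplication_stats()
    print(f"{summary['space_saved_percent']:.1f}% of stored bytes are duplicates")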

View File

@@ -1,412 +1,174 @@
"""Hash store for deduplication with optional Redis support"""
from typing import Optional, Dict, Set
from pathlib import Path
import psycopg2
from psycopg2.extras import execute_batch
from ..shared.config import DatabaseConfig
class HashStore:
"""PostgreSQL-based hash store for deduplication"""
def __init__(self, db_config: DatabaseConfig):
"""Initialize hash store
Args:
db_config: Database configuration
"""
self.db_config = db_config
self._connection = None
def _get_connection(self):
"""Get or create database connection"""
if self._connection is None or self._connection.closed:
self._connection = psycopg2.connect(
host=self.db_config.host,
port=self.db_config.port,
database=self.db_config.database,
user=self.db_config.user,
password=self.db_config.password
)
self._connection = psycopg2.connect(host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password)
return self._connection
def _ensure_tables(self):
"""Ensure hash store tables exist"""
conn = self._get_connection()
cursor = conn.cursor()
# Create hashes table for file-level deduplication
cursor.execute("""
CREATE TABLE IF NOT EXISTS file_hashes (
checksum TEXT PRIMARY KEY,
canonical_path TEXT NOT NULL,
size BIGINT NOT NULL,
first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
ref_count INTEGER DEFAULT 1
)
""")
# Create chunk hashes table for chunk-level deduplication
cursor.execute("""
CREATE TABLE IF NOT EXISTS chunk_hashes (
chunk_hash TEXT PRIMARY KEY,
size INTEGER NOT NULL,
first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
ref_count INTEGER DEFAULT 1
)
""")
# Create file-chunk mapping table
cursor.execute("""
CREATE TABLE IF NOT EXISTS file_chunks (
id SERIAL PRIMARY KEY,
file_checksum TEXT NOT NULL,
chunk_hash TEXT NOT NULL,
chunk_index INTEGER NOT NULL,
FOREIGN KEY (file_checksum) REFERENCES file_hashes(checksum),
FOREIGN KEY (chunk_hash) REFERENCES chunk_hashes(chunk_hash),
UNIQUE (file_checksum, chunk_index)
)
""")
# Create indexes
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_file_chunks_file
ON file_chunks(file_checksum)
""")
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_file_chunks_chunk
ON file_chunks(chunk_hash)
""")
cursor.execute('\n CREATE TABLE IF NOT EXISTS file_hashes (\n checksum TEXT PRIMARY KEY,\n canonical_path TEXT NOT NULL,\n size BIGINT NOT NULL,\n first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n ref_count INTEGER DEFAULT 1\n )\n ')
cursor.execute('\n CREATE TABLE IF NOT EXISTS chunk_hashes (\n chunk_hash TEXT PRIMARY KEY,\n size INTEGER NOT NULL,\n first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n ref_count INTEGER DEFAULT 1\n )\n ')
cursor.execute('\n CREATE TABLE IF NOT EXISTS file_chunks (\n id SERIAL PRIMARY KEY,\n file_checksum TEXT NOT NULL,\n chunk_hash TEXT NOT NULL,\n chunk_index INTEGER NOT NULL,\n FOREIGN KEY (file_checksum) REFERENCES file_hashes(checksum),\n FOREIGN KEY (chunk_hash) REFERENCES chunk_hashes(chunk_hash),\n UNIQUE (file_checksum, chunk_index)\n )\n ')
cursor.execute('\n CREATE INDEX IF NOT EXISTS idx_file_chunks_file\n ON file_chunks(file_checksum)\n ')
cursor.execute('\n CREATE INDEX IF NOT EXISTS idx_file_chunks_chunk\n ON file_chunks(chunk_hash)\n ')
conn.commit()
cursor.close()
def exists(self, checksum: str) -> bool:
"""Check if hash exists in store
Args:
checksum: File hash to check
Returns:
True if hash exists
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute(
"SELECT 1 FROM file_hashes WHERE checksum = %s LIMIT 1",
(checksum,)
)
cursor.execute('SELECT 1 FROM file_hashes WHERE checksum = %s LIMIT 1', (checksum,))
exists = cursor.fetchone() is not None
cursor.close()
return exists
def get_canonical(self, checksum: str) -> Optional[str]:
"""Get canonical path for a hash
Args:
checksum: File hash
Returns:
Canonical file path or None if not found
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute(
"SELECT canonical_path FROM file_hashes WHERE checksum = %s",
(checksum,)
)
cursor.execute('SELECT canonical_path FROM file_hashes WHERE checksum = %s', (checksum,))
result = cursor.fetchone()
cursor.close()
return result[0] if result else None
def store_canonical(
self,
checksum: str,
path: Path,
size: int,
chunk_hashes: Optional[list[str]] = None
) -> None:
"""Store canonical reference for a hash
Args:
checksum: File hash
path: Canonical file path
size: File size in bytes
chunk_hashes: Optional list of chunk hashes
"""
def store_canonical(self, checksum: str, path: Path, size: int, chunk_hashes: Optional[list[str]]=None) -> None:
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
try:
# Store file hash
cursor.execute("""
INSERT INTO file_hashes (checksum, canonical_path, size)
VALUES (%s, %s, %s)
ON CONFLICT (checksum) DO UPDATE SET
ref_count = file_hashes.ref_count + 1
""", (checksum, str(path), size))
# Store chunk hashes if provided
cursor.execute('\n INSERT INTO file_hashes (checksum, canonical_path, size)\n VALUES (%s, %s, %s)\n ON CONFLICT (checksum) DO UPDATE SET\n ref_count = file_hashes.ref_count + 1\n ', (checksum, str(path), size))
if chunk_hashes:
# Insert chunk hashes
chunk_data = [(chunk_hash, 0) for chunk_hash in chunk_hashes]
execute_batch(cursor, """
INSERT INTO chunk_hashes (chunk_hash, size)
VALUES (%s, %s)
ON CONFLICT (chunk_hash) DO UPDATE SET
ref_count = chunk_hashes.ref_count + 1
""", chunk_data, page_size=1000)
# Create file-chunk mappings
mapping_data = [
(checksum, chunk_hash, idx)
for idx, chunk_hash in enumerate(chunk_hashes)
]
execute_batch(cursor, """
INSERT INTO file_chunks (file_checksum, chunk_hash, chunk_index)
VALUES (%s, %s, %s)
ON CONFLICT (file_checksum, chunk_index) DO NOTHING
""", mapping_data, page_size=1000)
execute_batch(cursor, '\n INSERT INTO chunk_hashes (chunk_hash, size)\n VALUES (%s, %s)\n ON CONFLICT (chunk_hash) DO UPDATE SET\n ref_count = chunk_hashes.ref_count + 1\n ', chunk_data, page_size=1000)
mapping_data = [(checksum, chunk_hash, idx) for idx, chunk_hash in enumerate(chunk_hashes)]
execute_batch(cursor, '\n INSERT INTO file_chunks (file_checksum, chunk_hash, chunk_index)\n VALUES (%s, %s, %s)\n ON CONFLICT (file_checksum, chunk_index) DO NOTHING\n ', mapping_data, page_size=1000)
conn.commit()
except Exception as e:
conn.rollback()
raise
finally:
cursor.close()
def get_chunk_hashes(self, checksum: str) -> list[str]:
"""Get chunk hashes for a file
Args:
checksum: File hash
Returns:
List of chunk hashes in order
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute("""
SELECT chunk_hash
FROM file_chunks
WHERE file_checksum = %s
ORDER BY chunk_index
""", (checksum,))
cursor.execute('\n SELECT chunk_hash\n FROM file_chunks\n WHERE file_checksum = %s\n ORDER BY chunk_index\n ', (checksum,))
chunk_hashes = [row[0] for row in cursor.fetchall()]
cursor.close()
return chunk_hashes
def get_duplicates(self) -> Dict[str, list[str]]:
"""Get all duplicate file groups
Returns:
Dictionary mapping canonical path to list of duplicate paths
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
# Get all files with their hashes
cursor.execute("""
SELECT f.path, f.checksum
FROM files f
WHERE f.checksum IS NOT NULL
""")
# Group by checksum
cursor.execute('\n SELECT f.path, f.checksum\n FROM files f\n WHERE f.checksum IS NOT NULL\n ')
hash_to_paths: Dict[str, list[str]] = {}
for path, checksum in cursor.fetchall():
if checksum not in hash_to_paths:
hash_to_paths[checksum] = []
hash_to_paths[checksum].append(path)
cursor.close()
# Filter to only duplicates (more than one file)
duplicates = {
paths[0]: paths[1:]
for checksum, paths in hash_to_paths.items()
if len(paths) > 1
}
duplicates = {paths[0]: paths[1:] for checksum, paths in hash_to_paths.items() if len(paths) > 1}
return duplicates
def get_stats(self) -> Dict[str, int]:
"""Get hash store statistics
Returns:
Dictionary with statistics
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
stats = {}
# Count unique file hashes
cursor.execute("SELECT COUNT(*) FROM file_hashes")
cursor.execute('SELECT COUNT(*) FROM file_hashes')
stats['unique_files'] = cursor.fetchone()[0]
# Count unique chunk hashes
cursor.execute("SELECT COUNT(*) FROM chunk_hashes")
cursor.execute('SELECT COUNT(*) FROM chunk_hashes')
stats['unique_chunks'] = cursor.fetchone()[0]
# Count total references
cursor.execute("SELECT COALESCE(SUM(ref_count), 0) FROM file_hashes")
cursor.execute('SELECT COALESCE(SUM(ref_count), 0) FROM file_hashes')
stats['total_file_refs'] = cursor.fetchone()[0]
# Count total chunk references
cursor.execute("SELECT COALESCE(SUM(ref_count), 0) FROM chunk_hashes")
cursor.execute('SELECT COALESCE(SUM(ref_count), 0) FROM chunk_hashes')
stats['total_chunk_refs'] = cursor.fetchone()[0]
# Calculate deduplication ratio
if stats['total_file_refs'] > 0:
stats['dedup_ratio'] = stats['unique_files'] / stats['total_file_refs']
else:
stats['dedup_ratio'] = 1.0
cursor.close()
return stats
def find_similar_files(self, checksum: str, threshold: float = 0.8) -> list[tuple[str, float]]:
"""Find files similar to given hash based on chunk overlap
Args:
checksum: File hash to compare
threshold: Similarity threshold (0.0 to 1.0)
Returns:
List of tuples (other_checksum, similarity_score)
"""
def find_similar_files(self, checksum: str, threshold: float=0.8) -> list[tuple[str, float]]:
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
# Get chunks for the target file
target_chunks = set(self.get_chunk_hashes(checksum))
if not target_chunks:
cursor.close()
return []
# Find files sharing chunks
cursor.execute("""
SELECT DISTINCT fc.file_checksum
FROM file_chunks fc
WHERE fc.chunk_hash = ANY(%s)
AND fc.file_checksum != %s
""", (list(target_chunks), checksum))
cursor.execute('\n SELECT DISTINCT fc.file_checksum\n FROM file_chunks fc\n WHERE fc.chunk_hash = ANY(%s)\n AND fc.file_checksum != %s\n ', (list(target_chunks), checksum))
similar_files = []
for (other_checksum,) in cursor.fetchall():
for other_checksum, in cursor.fetchall():
other_chunks = set(self.get_chunk_hashes(other_checksum))
# Calculate Jaccard similarity
intersection = len(target_chunks & other_chunks)
union = len(target_chunks | other_chunks)
if union > 0:
similarity = intersection / union
if similarity >= threshold:
similar_files.append((other_checksum, similarity))
cursor.close()
# Sort by similarity descending
similar_files.sort(key=lambda x: x[1], reverse=True)
return similar_files
def close(self):
"""Close database connection"""
if self._connection and not self._connection.closed:
if self._connection and (not self._connection.closed):
self._connection.close()
def __enter__(self):
"""Context manager entry"""
self._ensure_tables()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit"""
self.close()
class MemoryHashStore:
"""In-memory hash store for testing and small datasets"""
def __init__(self):
"""Initialize in-memory hash store"""
self.hashes: Dict[str, tuple[str, int]] = {}
self.chunks: Dict[str, int] = {}
self.file_chunks: Dict[str, list[str]] = {}
def exists(self, checksum: str) -> bool:
"""Check if hash exists"""
return checksum in self.hashes
def get_canonical(self, checksum: str) -> Optional[str]:
"""Get canonical path"""
return self.hashes.get(checksum, (None, 0))[0]
def store_canonical(
self,
checksum: str,
path: Path,
size: int,
chunk_hashes: Optional[list[str]] = None
) -> None:
"""Store canonical reference"""
def store_canonical(self, checksum: str, path: Path, size: int, chunk_hashes: Optional[list[str]]=None) -> None:
self.hashes[checksum] = (str(path), size)
if chunk_hashes:
self.file_chunks[checksum] = chunk_hashes
for chunk_hash in chunk_hashes:
self.chunks[chunk_hash] = self.chunks.get(chunk_hash, 0) + 1
def get_chunk_hashes(self, checksum: str) -> list[str]:
"""Get chunk hashes"""
return self.file_chunks.get(checksum, [])
def get_stats(self) -> Dict[str, int]:
"""Get statistics"""
return {
'unique_files': len(self.hashes),
'unique_chunks': len(self.chunks),
'total_file_refs': len(self.hashes),
'total_chunk_refs': sum(self.chunks.values()),
'dedup_ratio': 1.0
}
return {'unique_files': len(self.hashes), 'unique_chunks': len(self.chunks), 'total_file_refs': len(self.hashes), 'total_chunk_refs': sum(self.chunks.values()), 'dedup_ratio': 1.0}
def close(self):
"""No-op for compatibility"""
pass
def __enter__(self):
"""Context manager entry"""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit"""
pass
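Finally, a small sketch using the in-memory store, which needs no PostgreSQL, together with the same Jaccard overlap that HashStore.find_similar_files computes. The hashes and paths are made up for illustration, and the `dedup.store` import path is an assumption:

# Illustration only: hashes, paths and the `dedup.store` import path are made up.
from pathlib import Path

from dedup.store import MemoryHashStore

store = MemoryHashStore()
store.store_canonical('aaa111', Path('/data/report.pdf'), 4096,
                      chunk_hashes=['c1', 'c2', 'c3', 'c4'])

assert store.exists('aaa111')
assert store.get_canonical('aaa111') == str(Path('/data/report.pdf'))

# Same Jaccard similarity that HashStore.find_similar_files uses:
# |intersection| / |union| of the two files' chunk-hash sets.
target = set(store.get_chunk_hashes('aaa111'))
other = {'c1', 'c2', 'c3', 'c9'}          # a second file sharing 3 of 4 chunks
similarity = len(target & other) / len(target | other)
print(f'chunk overlap (Jaccard): {similarity:.2f}')   # 3 / 5 = 0.60

print(store.get_stats())
# {'unique_files': 1, 'unique_chunks': 4, 'total_file_refs': 1,
#  'total_chunk_refs': 4, 'dedup_ratio': 1.0}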