clean up code

mike
2025-12-13 12:00:34 +01:00
parent 2b2c575385
commit 7ce8c8c73d
10 changed files with 158 additions and 1471 deletions


@@ -1,75 +1,29 @@
"""Rabin fingerprint chunker for content-defined chunking"""
import hashlib
from pathlib import Path
from typing import Iterator, Optional
class RabinChunker:
"""Content-defined chunking using Rabin fingerprinting
Uses a rolling hash to identify chunk boundaries based on content,
allowing for efficient deduplication even when data is modified.
"""
def __init__(
self,
avg_chunk_size: int = 8192,
min_chunk_size: Optional[int] = None,
max_chunk_size: Optional[int] = None,
window_size: int = 48
):
"""Initialize Rabin chunker
Args:
avg_chunk_size: Target average chunk size in bytes
min_chunk_size: Minimum chunk size (default: avg_chunk_size // 4)
max_chunk_size: Maximum chunk size (default: avg_chunk_size * 8)
window_size: Rolling hash window size
"""
def __init__(self, avg_chunk_size: int=8192, min_chunk_size: Optional[int]=None, max_chunk_size: Optional[int]=None, window_size: int=48):
self.avg_chunk_size = avg_chunk_size
self.min_chunk_size = min_chunk_size or (avg_chunk_size // 4)
self.max_chunk_size = max_chunk_size or (avg_chunk_size * 8)
self.min_chunk_size = min_chunk_size or avg_chunk_size // 4
self.max_chunk_size = max_chunk_size or avg_chunk_size * 8
self.window_size = window_size
# Calculate mask for boundary detection
# For avg_chunk_size, we want boundaries at 1/avg_chunk_size probability
bits = 0
size = avg_chunk_size
while size > 1:
bits += 1
size >>= 1
self.mask = (1 << bits) - 1
self.poly = 17349423945073011
# Polynomial for rolling hash (prime number)
self.poly = 0x3DA3358B4DC173
def chunk_file(self, file_path: Path, chunk_size: Optional[int] = None) -> Iterator[bytes]:
"""Chunk a file using Rabin fingerprinting
Args:
file_path: Path to file to chunk
chunk_size: If provided, use fixed-size chunking instead
Yields:
Chunk data as bytes
"""
def chunk_file(self, file_path: Path, chunk_size: Optional[int]=None) -> Iterator[bytes]:
if chunk_size:
# Use fixed-size chunking
yield from self._chunk_fixed(file_path, chunk_size)
else:
# Use content-defined chunking
yield from self._chunk_rabin(file_path)
def _chunk_fixed(self, file_path: Path, chunk_size: int) -> Iterator[bytes]:
"""Fixed-size chunking
Args:
file_path: Path to file
chunk_size: Chunk size in bytes
Yields:
Fixed-size chunks
"""
with open(file_path, 'rb') as f:
while True:
chunk = f.read(chunk_size)
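With the default avg_chunk_size of 8192 (2^13), the mask derivation above sets bits = 13 and self.mask = 0x1FFF, so a boundary fires when the low 13 bits of the rolling hash are zero, i.e. roughly once every 8192 bytes for uniformly distributed hash values. A standalone check of that arithmetic (a sketch, not part of the commit):

avg_chunk_size = 8192
# Count how many times avg_chunk_size halves down to 1, then build a
# mask with that many low bits set, mirroring the constructor above.
bits = 0
size = avg_chunk_size
while size > 1:
    bits += 1
    size >>= 1
mask = (1 << bits) - 1
assert bits == 13 and mask == 0x1FFF
# Expected boundary rate: 1 in 2**13 == 1 in 8192 bytes.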
@@ -78,46 +32,22 @@ class RabinChunker:
                 yield chunk
     def _chunk_rabin(self, file_path: Path) -> Iterator[bytes]:
-        """Content-defined chunking using Rabin fingerprinting
-        Args:
-            file_path: Path to file
-        Yields:
-            Variable-size chunks based on content
-        """
         with open(file_path, 'rb') as f:
             chunk_data = bytearray()
             window = bytearray()
             hash_value = 0
             while True:
                 byte = f.read(1)
                 if not byte:
-                    # End of file - yield remaining data
                     if chunk_data:
                         yield bytes(chunk_data)
                     break
                 chunk_data.extend(byte)
                 window.extend(byte)
-                # Maintain window size
                 if len(window) > self.window_size:
                     window.pop(0)
-                # Update rolling hash
                 hash_value = self._rolling_hash(window)
-                # Check if we should create a boundary
-                should_break = (
-                    len(chunk_data) >= self.min_chunk_size and
-                    (
-                        (hash_value & self.mask) == 0 or
-                        len(chunk_data) >= self.max_chunk_size
-                    )
-                )
+                should_break = len(chunk_data) >= self.min_chunk_size and (hash_value & self.mask == 0 or len(chunk_data) >= self.max_chunk_size)
                 if should_break:
                     yield bytes(chunk_data)
                     chunk_data = bytearray()
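The class docstring's deduplication claim is that an edit only disturbs the chunks near the edit, because boundaries depend on a 48-byte window of content rather than on byte offsets. A standalone sketch exercising _chunk_rabin, assuming the file above is importable as a module named chunker (a hypothetical name, not part of the commit):

import os
import tempfile
from pathlib import Path
from chunker import RabinChunker, hash_chunk  # hypothetical module name

def chunk_hashes(raw: bytes) -> set[str]:
    # Write the bytes to a temp file because chunk_file takes a path.
    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(raw)
        path = Path(f.name)
    try:
        return {hash_chunk(c) for c in RabinChunker().chunk_file(path)}
    finally:
        os.unlink(path)

data = os.urandom(256 * 1024)
edited = data[:100_000] + b"X" + data[100_000:]  # one inserted byte
a, b = chunk_hashes(data), chunk_hashes(edited)
# Only the chunks around offset 100_000 should differ; a fixed-size
# chunker would instead change every chunk after the insertion point.
print(f"shared chunks: {len(a & b)} of {len(a)}")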
@@ -125,40 +55,17 @@ class RabinChunker:
                     hash_value = 0
     def _rolling_hash(self, window: bytearray) -> int:
-        """Calculate rolling hash for window
-        Args:
-            window: Byte window
-        Returns:
-            Hash value
-        """
         hash_value = 0
         for byte in window:
-            hash_value = ((hash_value << 1) + byte) & 0xFFFFFFFFFFFFFFFF
+            hash_value = (hash_value << 1) + byte & 18446744073709551615
         return hash_value
 class SimpleChunker:
     """Simple fixed-size chunker for comparison"""
-    def __init__(self, chunk_size: int = 8192):
-        """Initialize simple chunker
-        Args:
-            chunk_size: Fixed chunk size in bytes
-        """
+    def __init__(self, chunk_size: int=8192):
         self.chunk_size = chunk_size
     def chunk_file(self, file_path: Path) -> Iterator[bytes]:
-        """Chunk file into fixed-size pieces
-        Args:
-            file_path: Path to file
-        Yields:
-            Fixed-size chunks
-        """
         with open(file_path, 'rb') as f:
             while True:
                 chunk = f.read(self.chunk_size)
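As committed, _rolling_hash rescans the entire window for every input byte, costing O(window_size) per byte, and the self.poly field is never actually used in the hash. The same shift-and-add hash admits an O(1) sliding update, since all arithmetic is linear modulo 2^64. A sketch of that equivalent update, assuming a full 48-byte window (an optimization sketch, not the commit's code):

import os

MASK = (1 << 64) - 1
WINDOW = 48  # matches the default window_size

def full_hash(window: bytes) -> int:
    # The full recomputation used by the commit, for cross-checking.
    h = 0
    for b in window:
        h = ((h << 1) + b) & MASK
    return h

def roll(h: int, outgoing: int, incoming: int) -> int:
    # Drop the oldest byte's contribution (it has been shifted left
    # WINDOW - 1 times by now), then shift and append the new byte.
    h = (h - (outgoing << (WINDOW - 1))) & MASK
    return ((h << 1) + incoming) & MASK

buf = os.urandom(WINDOW + 1)
h = full_hash(buf[:WINDOW])
assert roll(h, buf[0], buf[WINDOW]) == full_hash(buf[1:WINDOW + 1])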
@@ -166,76 +73,31 @@ class SimpleChunker:
                     break
                 yield chunk
-def hash_chunk(chunk: bytes, algorithm: str = 'sha256') -> str:
-    """Hash a chunk of data
-    Args:
-        chunk: Chunk data
-        algorithm: Hash algorithm (default: sha256)
-    Returns:
-        Hex digest of hash
-    """
+def hash_chunk(chunk: bytes, algorithm: str='sha256') -> str:
     hasher = hashlib.new(algorithm)
     hasher.update(chunk)
     return hasher.hexdigest()
-def hash_file(file_path: Path, algorithm: str = 'sha256', chunk_size: int = 65536) -> str:
-    """Hash entire file
-    Args:
-        file_path: Path to file
-        algorithm: Hash algorithm (default: sha256)
-        chunk_size: Size of chunks to read
-    Returns:
-        Hex digest of file hash
-    """
+def hash_file(file_path: Path, algorithm: str='sha256', chunk_size: int=65536) -> str:
     hasher = hashlib.new(algorithm)
     with open(file_path, 'rb') as f:
         while True:
             chunk = f.read(chunk_size)
             if not chunk:
                 break
             hasher.update(chunk)
     return hasher.hexdigest()
-def compute_file_signature(
-    file_path: Path,
-    use_rabin: bool = True,
-    avg_chunk_size: int = 8192
-) -> tuple[str, list[str]]:
-    """Compute file signature with chunk hashes
-    Args:
-        file_path: Path to file
-        use_rabin: Whether to use Rabin chunking (vs fixed-size)
-        avg_chunk_size: Average chunk size for Rabin or fixed size
-    Returns:
-        Tuple of (file_hash, list of chunk hashes)
-    """
+def compute_file_signature(file_path: Path, use_rabin: bool=True, avg_chunk_size: int=8192) -> tuple[str, list[str]]:
     if use_rabin:
         chunker = RabinChunker(avg_chunk_size=avg_chunk_size)
     else:
         chunker = SimpleChunker(chunk_size=avg_chunk_size)
     chunk_hashes = []
     file_hasher = hashlib.sha256()
     for chunk in chunker.chunk_file(file_path):
-        # Hash individual chunk
         chunk_hash = hash_chunk(chunk)
         chunk_hashes.append(chunk_hash)
-        # Update file hash
         file_hasher.update(chunk)
     file_hash = file_hasher.hexdigest()
-    return file_hash, chunk_hashes
+    return (file_hash, chunk_hashes)
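Because the chunks partition the file in order, file_hasher ends up hashing exactly the file's bytes, so compute_file_signature yields a whole-file SHA-256 plus per-chunk hashes in a single pass. Example usage (the data.bin path and the chunker module name are illustrative assumptions, not part of the commit):

from pathlib import Path
from chunker import compute_file_signature  # hypothetical module name

file_hash, chunk_hashes = compute_file_signature(Path("data.bin"))
print(file_hash)          # SHA-256 of the entire file
print(len(chunk_hashes))  # number of content-defined chunks
# Files that share runs of content share chunk hashes, which is the
# basis for chunk-level deduplication between file versions.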