# defrag/app/deduplication/chunker.py

"""Rabin fingerprint chunker for content-defined chunking"""
import hashlib
from pathlib import Path
from typing import Iterator, Optional


class RabinChunker:
    """Content-defined chunking driven by a rolling hash

    Chunk boundaries are chosen from the content itself: a hash of the most
    recent ``window_size`` bytes of the current chunk is computed and a
    boundary is declared when its low bits are all zero. This lets
    deduplication keep matching chunks even when data is inserted or removed
    earlier in the file.

    Note: the hash used here is a 64-bit shift-and-add hash (see
    ``_rolling_hash``); ``self.poly`` is stored but is not used by it.
    """

    # Hash values are truncated to 64 bits after every update.
    _HASH_MASK = 0xFFFFFFFFFFFFFFFF

    # Number of bytes requested from the file per read call.
    _READ_SIZE = 65536

    def __init__(
        self,
        avg_chunk_size: int = 8192,
        min_chunk_size: Optional[int] = None,
        max_chunk_size: Optional[int] = None,
        window_size: int = 48
    ):
        """Initialize Rabin chunker

        Args:
            avg_chunk_size: Target average chunk size in bytes
            min_chunk_size: Minimum chunk size (default: avg_chunk_size // 4).
                An explicit 0 is honored and means "no minimum".
            max_chunk_size: Maximum chunk size (default: avg_chunk_size * 8)
            window_size: Rolling hash window size
        """
        self.avg_chunk_size = avg_chunk_size
        # Compare against None so that an explicit 0 is not silently
        # replaced by the default.
        self.min_chunk_size = (
            avg_chunk_size // 4 if min_chunk_size is None else min_chunk_size
        )
        self.max_chunk_size = (
            avg_chunk_size * 8 if max_chunk_size is None else max_chunk_size
        )
        self.window_size = window_size

        # Calculate mask for boundary detection: floor(log2(avg_chunk_size))
        # low bits set, so a boundary test succeeds for roughly one hash
        # value in every avg_chunk_size.
        bits = 0
        size = avg_chunk_size
        while size > 1:
            bits += 1
            size >>= 1
        self.mask = (1 << bits) - 1

        # Polynomial constant. Not used by the hash in this class; kept
        # because it is a public attribute.
        self.poly = 0x3DA3358B4DC173

    def chunk_file(self, file_path: Path, chunk_size: Optional[int] = None) -> Iterator[bytes]:
        """Chunk a file using Rabin fingerprinting

        Args:
            file_path: Path to file to chunk
            chunk_size: If provided (and non-zero), use fixed-size chunking
                instead

        Yields:
            Chunk data as bytes
        """
        if chunk_size:
            # Use fixed-size chunking
            yield from self._chunk_fixed(file_path, chunk_size)
        else:
            # Use content-defined chunking
            yield from self._chunk_rabin(file_path)

    def _chunk_fixed(self, file_path: Path, chunk_size: int) -> Iterator[bytes]:
        """Fixed-size chunking

        Args:
            file_path: Path to file
            chunk_size: Chunk size in bytes

        Yields:
            Fixed-size chunks (the last one may be shorter)
        """
        with open(file_path, 'rb') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                yield chunk

    def _chunk_rabin(self, file_path: Path) -> Iterator[bytes]:
        """Content-defined chunking using a rolling hash

        The file is read in blocks and the window hash is updated
        incrementally in O(1) per byte. The value obtained is the same as
        calling ``_rolling_hash`` on the last ``window_size`` bytes of the
        current chunk, so boundaries match a byte-by-byte recomputation.

        Args:
            file_path: Path to file

        Yields:
            Variable-size chunks based on content
        """
        hash_mask = self._HASH_MASK
        read_size = self._READ_SIZE
        boundary_mask = self.mask
        min_size = self.min_chunk_size
        max_size = self.max_chunk_size
        # A non-positive window never retains any byte, so its hash is 0.
        window_size = max(self.window_size, 0)

        with open(file_path, 'rb') as f:
            chunk_data = bytearray()
            hash_value = 0

            while True:
                block = f.read(read_size)
                if not block:
                    break

                for byte in block:
                    chunk_data.append(byte)
                    size = len(chunk_data)

                    # Shift the new byte in ...
                    hash_value = ((hash_value << 1) + byte) & hash_mask
                    # ... and, once the window is full, cancel the byte that
                    # just left it. That byte has been shifted left
                    # window_size times since it entered.
                    if size > window_size:
                        leaving = chunk_data[size - 1 - window_size]
                        hash_value = (
                            hash_value - (leaving << window_size)
                        ) & hash_mask

                    # Check if we should create a boundary
                    if size >= min_size and (
                        (hash_value & boundary_mask) == 0 or size >= max_size
                    ):
                        yield bytes(chunk_data)
                        # The window restarts with every chunk.
                        chunk_data.clear()
                        hash_value = 0

            # End of file - yield remaining data
            if chunk_data:
                yield bytes(chunk_data)

    def _rolling_hash(self, window: bytearray) -> int:
        """Calculate the hash of a whole window from scratch

        This is the reference definition of the window hash;
        ``_chunk_rabin`` maintains the same value incrementally.

        Args:
            window: Byte window

        Returns:
            Hash value (64-bit)
        """
        hash_value = 0
        for byte in window:
            hash_value = ((hash_value << 1) + byte) & self._HASH_MASK
        return hash_value


class SimpleChunker:
    """Baseline chunker that cuts a file into equal-length pieces"""

    def __init__(self, chunk_size: int = 8192):
        """Store the piece length

        Args:
            chunk_size: Length of every piece in bytes
        """
        self.chunk_size = chunk_size

    def chunk_file(self, file_path: Path) -> Iterator[bytes]:
        """Read a file sequentially and emit it piece by piece

        Args:
            file_path: Path to file

        Yields:
            Consecutive pieces of ``chunk_size`` bytes; the final piece may
            be shorter
        """
        with open(file_path, 'rb') as source:
            # An empty read signals end of file and ends the loop.
            while piece := source.read(self.chunk_size):
                yield piece


def hash_chunk(chunk: bytes, algorithm: str = 'sha256') -> str:
    """Return the hex digest of one chunk

    Args:
        chunk: Chunk data
        algorithm: Name of a hashlib algorithm (default: sha256)

    Returns:
        Hex digest of hash
    """
    # hashlib.new accepts the initial data directly, which is equivalent to
    # constructing the hasher and calling update() once.
    return hashlib.new(algorithm, chunk).hexdigest()


def hash_file(file_path: Path, algorithm: str = 'sha256', chunk_size: int = 65536) -> str:
    """Return the hex digest of a file's full contents

    The file is fed to the hasher in blocks so it never has to be held in
    memory all at once.

    Args:
        file_path: Path to file
        algorithm: Name of a hashlib algorithm (default: sha256)
        chunk_size: Number of bytes requested per read

    Returns:
        Hex digest of file hash
    """
    digest = hashlib.new(algorithm)

    with open(file_path, 'rb') as source:
        # iter() keeps calling read() until it returns the empty sentinel.
        for block in iter(lambda: source.read(chunk_size), b''):
            digest.update(block)

    return digest.hexdigest()


def compute_file_signature(
    file_path: Path,
    use_rabin: bool = True,
    avg_chunk_size: int = 8192
) -> tuple[str, list[str]]:
    """Build a signature for a file: whole-file hash plus per-chunk hashes

    The file is chunked once; each chunk is hashed on its own with
    ``hash_chunk`` and also fed into a running SHA-256 over the chunks in
    order.

    Args:
        file_path: Path to file
        use_rabin: Use RabinChunker when true, SimpleChunker otherwise
        avg_chunk_size: Average chunk size for Rabin chunking, or the exact
            chunk size for fixed-size chunking

    Returns:
        Tuple of (file_hash, list of chunk hashes)
    """
    splitter = (
        RabinChunker(avg_chunk_size=avg_chunk_size)
        if use_rabin
        else SimpleChunker(chunk_size=avg_chunk_size)
    )

    whole_file = hashlib.sha256()
    digests = []

    for piece in splitter.chunk_file(file_path):
        # Feed the running file hash and record this chunk's own digest.
        whole_file.update(piece)
        digests.append(hash_chunk(piece))

    return whole_file.hexdigest(), digests