"""Rabin fingerprint chunker for content-defined chunking"""
import hashlib
from collections import deque
from pathlib import Path
from typing import Iterator, Optional

# 64-bit wrap-around mask for the rolling hash arithmetic.
_HASH64 = 0xFFFFFFFFFFFFFFFF

# Buffer size for reading input files; reading large buffers and iterating
# them in memory is vastly faster than the byte-at-a-time f.read(1) pattern.
_READ_BUF = 65536


class RabinChunker:
    """Content-defined chunking using Rabin fingerprinting

    Uses a rolling hash to identify chunk boundaries based on content,
    allowing for efficient deduplication even when data is modified.
    """

    def __init__(
        self,
        avg_chunk_size: int = 8192,
        min_chunk_size: Optional[int] = None,
        max_chunk_size: Optional[int] = None,
        window_size: int = 48
    ):
        """Initialize Rabin chunker

        Args:
            avg_chunk_size: Target average chunk size in bytes
            min_chunk_size: Minimum chunk size (default: avg_chunk_size // 4)
            max_chunk_size: Maximum chunk size (default: avg_chunk_size * 8)
            window_size: Rolling hash window size
        """
        self.avg_chunk_size = avg_chunk_size
        # NOTE: `or` means an explicit 0 also falls back to the default.
        self.min_chunk_size = min_chunk_size or (avg_chunk_size // 4)
        self.max_chunk_size = max_chunk_size or (avg_chunk_size * 8)
        self.window_size = window_size

        # Mask with floor(log2(avg_chunk_size)) low bits set, so a boundary
        # (hash & mask == 0) fires with probability ~1/avg_chunk_size.
        bits = 0
        size = avg_chunk_size
        while size > 1:
            bits += 1
            size >>= 1
        self.mask = (1 << bits) - 1

        # NOTE(review): this polynomial is never used by the hash below; it is
        # kept only for interface compatibility.
        self.poly = 0x3DA3358B4DC173

    def chunk_file(self, file_path: Path, chunk_size: Optional[int] = None) -> Iterator[bytes]:
        """Chunk a file using Rabin fingerprinting

        Args:
            file_path: Path to file to chunk
            chunk_size: If provided (truthy), use fixed-size chunking instead

        Yields:
            Chunk data as bytes
        """
        if chunk_size:
            # Use fixed-size chunking
            yield from self._chunk_fixed(file_path, chunk_size)
        else:
            # Use content-defined chunking
            yield from self._chunk_rabin(file_path)

    def _chunk_fixed(self, file_path: Path, chunk_size: int) -> Iterator[bytes]:
        """Fixed-size chunking

        Args:
            file_path: Path to file
            chunk_size: Chunk size in bytes

        Yields:
            Fixed-size chunks (the final chunk may be shorter)
        """
        with open(file_path, 'rb') as f:
            while chunk := f.read(chunk_size):
                yield chunk

    def _chunk_rabin(self, file_path: Path) -> Iterator[bytes]:
        """Content-defined chunking using Rabin fingerprinting

        Reads the file in large buffers (not one byte per read() call) and
        maintains the window hash incrementally in O(1) per byte instead of
        re-folding the whole window.  For h = sum(b_i << (w-1-i)) mod 2**64,
        appending `new` and evicting `old` gives exactly
        h' = ((h << 1) + new - (old << w)) mod 2**64, so the chunk boundaries
        produced are identical to a full per-byte window re-hash.

        Args:
            file_path: Path to file

        Yields:
            Variable-size chunks based on content
        """
        w = self.window_size
        min_sz = self.min_chunk_size
        max_sz = self.max_chunk_size
        mask = self.mask

        with open(file_path, 'rb') as f:
            chunk_data = bytearray()
            window: deque = deque()   # last <= w bytes; O(1) popleft
            hash_value = 0
            while buf := f.read(_READ_BUF):
                for byte in buf:
                    chunk_data.append(byte)
                    window.append(byte)
                    if len(window) > w:
                        # Window full: shift in the new byte, drop the evicted
                        # byte's contribution (weight 2**w after the shift).
                        old = window.popleft()
                        hash_value = ((hash_value << 1) + byte - (old << w)) & _HASH64
                    else:
                        # Window still filling: plain shift-and-add fold.
                        hash_value = ((hash_value << 1) + byte) & _HASH64

                    # Boundary: hash hits the mask (content-defined) or the
                    # chunk reached the hard size cap; never below the minimum.
                    if len(chunk_data) >= min_sz and (
                        (hash_value & mask) == 0 or len(chunk_data) >= max_sz
                    ):
                        yield bytes(chunk_data)
                        chunk_data = bytearray()
                        window.clear()
                        hash_value = 0
            # End of file - yield remaining data
            if chunk_data:
                yield bytes(chunk_data)

    def _rolling_hash(self, window) -> int:
        """Calculate the window hash by folding all bytes (reference form)

        Kept for compatibility; `_chunk_rabin` now maintains the same value
        incrementally.

        Args:
            window: Byte window (any iterable of ints)

        Returns:
            Hash value (64-bit)
        """
        hash_value = 0
        for byte in window:
            hash_value = ((hash_value << 1) + byte) & _HASH64
        return hash_value


class SimpleChunker:
    """Simple fixed-size chunker for comparison"""

    def __init__(self, chunk_size: int = 8192):
        """Initialize simple chunker

        Args:
            chunk_size: Fixed chunk size in bytes
        """
        self.chunk_size = chunk_size

    def chunk_file(self, file_path: Path) -> Iterator[bytes]:
        """Chunk file into fixed-size pieces

        Args:
            file_path: Path to file

        Yields:
            Fixed-size chunks (the final chunk may be shorter)
        """
        with open(file_path, 'rb') as f:
            while chunk := f.read(self.chunk_size):
                yield chunk


def hash_chunk(chunk: bytes, algorithm: str = 'sha256') -> str:
    """Hash a chunk of data

    Args:
        chunk: Chunk data
        algorithm: Hash algorithm (default: sha256)

    Returns:
        Hex digest of hash
    """
    hasher = hashlib.new(algorithm)
    hasher.update(chunk)
    return hasher.hexdigest()


def hash_file(file_path: Path, algorithm: str = 'sha256', chunk_size: int = 65536) -> str:
    """Hash entire file

    Args:
        file_path: Path to file
        algorithm: Hash algorithm (default: sha256)
        chunk_size: Size of chunks to read

    Returns:
        Hex digest of file hash
    """
    hasher = hashlib.new(algorithm)
    with open(file_path, 'rb') as f:
        while chunk := f.read(chunk_size):
            hasher.update(chunk)
    return hasher.hexdigest()


def compute_file_signature(
    file_path: Path,
    use_rabin: bool = True,
    avg_chunk_size: int = 8192
) -> tuple[str, list[str]]:
    """Compute file signature with chunk hashes

    Args:
        file_path: Path to file
        use_rabin: Whether to use Rabin chunking (vs fixed-size)
        avg_chunk_size: Average chunk size for Rabin or fixed size

    Returns:
        Tuple of (file_hash, list of chunk hashes)
    """
    if use_rabin:
        chunker = RabinChunker(avg_chunk_size=avg_chunk_size)
    else:
        chunker = SimpleChunker(chunk_size=avg_chunk_size)

    chunk_hashes = []
    file_hasher = hashlib.sha256()

    for chunk in chunker.chunk_file(file_path):
        # Hash individual chunk
        chunk_hashes.append(hash_chunk(chunk))
        # Update file hash
        file_hasher.update(chunk)

    return file_hasher.hexdigest(), chunk_hashes