clean up code
This commit is contained in:
@@ -1,75 +1,29 @@
|
||||
"""Rabin fingerprint chunker for content-defined chunking"""
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from typing import Iterator, Optional
|
||||
|
||||
|
||||
class RabinChunker:
|
||||
"""Content-defined chunking using Rabin fingerprinting
|
||||
|
||||
Uses a rolling hash to identify chunk boundaries based on content,
|
||||
allowing for efficient deduplication even when data is modified.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
avg_chunk_size: int = 8192,
|
||||
min_chunk_size: Optional[int] = None,
|
||||
max_chunk_size: Optional[int] = None,
|
||||
window_size: int = 48
|
||||
):
|
||||
"""Initialize Rabin chunker
|
||||
|
||||
Args:
|
||||
avg_chunk_size: Target average chunk size in bytes
|
||||
min_chunk_size: Minimum chunk size (default: avg_chunk_size // 4)
|
||||
max_chunk_size: Maximum chunk size (default: avg_chunk_size * 8)
|
||||
window_size: Rolling hash window size
|
||||
"""
|
||||
def __init__(self, avg_chunk_size: int=8192, min_chunk_size: Optional[int]=None, max_chunk_size: Optional[int]=None, window_size: int=48):
|
||||
self.avg_chunk_size = avg_chunk_size
|
||||
self.min_chunk_size = min_chunk_size or (avg_chunk_size // 4)
|
||||
self.max_chunk_size = max_chunk_size or (avg_chunk_size * 8)
|
||||
self.min_chunk_size = min_chunk_size or avg_chunk_size // 4
|
||||
self.max_chunk_size = max_chunk_size or avg_chunk_size * 8
|
||||
self.window_size = window_size
|
||||
|
||||
# Calculate mask for boundary detection
|
||||
# For avg_chunk_size, we want boundaries at 1/avg_chunk_size probability
|
||||
bits = 0
|
||||
size = avg_chunk_size
|
||||
while size > 1:
|
||||
bits += 1
|
||||
size >>= 1
|
||||
self.mask = (1 << bits) - 1
|
||||
self.poly = 17349423945073011
|
||||
|
||||
# Polynomial for rolling hash (prime number)
|
||||
self.poly = 0x3DA3358B4DC173
|
||||
|
||||
def chunk_file(self, file_path: Path, chunk_size: Optional[int] = None) -> Iterator[bytes]:
|
||||
"""Chunk a file using Rabin fingerprinting
|
||||
|
||||
Args:
|
||||
file_path: Path to file to chunk
|
||||
chunk_size: If provided, use fixed-size chunking instead
|
||||
|
||||
Yields:
|
||||
Chunk data as bytes
|
||||
"""
|
||||
def chunk_file(self, file_path: Path, chunk_size: Optional[int]=None) -> Iterator[bytes]:
|
||||
if chunk_size:
|
||||
# Use fixed-size chunking
|
||||
yield from self._chunk_fixed(file_path, chunk_size)
|
||||
else:
|
||||
# Use content-defined chunking
|
||||
yield from self._chunk_rabin(file_path)
|
||||
|
||||
def _chunk_fixed(self, file_path: Path, chunk_size: int) -> Iterator[bytes]:
|
||||
"""Fixed-size chunking
|
||||
|
||||
Args:
|
||||
file_path: Path to file
|
||||
chunk_size: Chunk size in bytes
|
||||
|
||||
Yields:
|
||||
Fixed-size chunks
|
||||
"""
|
||||
with open(file_path, 'rb') as f:
|
||||
while True:
|
||||
chunk = f.read(chunk_size)
|
||||
@@ -78,46 +32,22 @@ class RabinChunker:
|
||||
yield chunk
|
||||
|
||||
def _chunk_rabin(self, file_path: Path) -> Iterator[bytes]:
|
||||
"""Content-defined chunking using Rabin fingerprinting
|
||||
|
||||
Args:
|
||||
file_path: Path to file
|
||||
|
||||
Yields:
|
||||
Variable-size chunks based on content
|
||||
"""
|
||||
with open(file_path, 'rb') as f:
|
||||
chunk_data = bytearray()
|
||||
window = bytearray()
|
||||
hash_value = 0
|
||||
|
||||
while True:
|
||||
byte = f.read(1)
|
||||
if not byte:
|
||||
# End of file - yield remaining data
|
||||
if chunk_data:
|
||||
yield bytes(chunk_data)
|
||||
break
|
||||
|
||||
chunk_data.extend(byte)
|
||||
window.extend(byte)
|
||||
|
||||
# Maintain window size
|
||||
if len(window) > self.window_size:
|
||||
window.pop(0)
|
||||
|
||||
# Update rolling hash
|
||||
hash_value = self._rolling_hash(window)
|
||||
|
||||
# Check if we should create a boundary
|
||||
should_break = (
|
||||
len(chunk_data) >= self.min_chunk_size and
|
||||
(
|
||||
(hash_value & self.mask) == 0 or
|
||||
len(chunk_data) >= self.max_chunk_size
|
||||
)
|
||||
)
|
||||
|
||||
should_break = len(chunk_data) >= self.min_chunk_size and (hash_value & self.mask == 0 or len(chunk_data) >= self.max_chunk_size)
|
||||
if should_break:
|
||||
yield bytes(chunk_data)
|
||||
chunk_data = bytearray()
|
||||
@@ -125,40 +55,17 @@ class RabinChunker:
|
||||
hash_value = 0
|
||||
|
||||
def _rolling_hash(self, window: bytearray) -> int:
|
||||
"""Calculate rolling hash for window
|
||||
|
||||
Args:
|
||||
window: Byte window
|
||||
|
||||
Returns:
|
||||
Hash value
|
||||
"""
|
||||
hash_value = 0
|
||||
for byte in window:
|
||||
hash_value = ((hash_value << 1) + byte) & 0xFFFFFFFFFFFFFFFF
|
||||
hash_value = (hash_value << 1) + byte & 18446744073709551615
|
||||
return hash_value
|
||||
|
||||
|
||||
class SimpleChunker:
|
||||
"""Simple fixed-size chunker for comparison"""
|
||||
|
||||
def __init__(self, chunk_size: int = 8192):
|
||||
"""Initialize simple chunker
|
||||
|
||||
Args:
|
||||
chunk_size: Fixed chunk size in bytes
|
||||
"""
|
||||
def __init__(self, chunk_size: int=8192):
|
||||
self.chunk_size = chunk_size
|
||||
|
||||
def chunk_file(self, file_path: Path) -> Iterator[bytes]:
|
||||
"""Chunk file into fixed-size pieces
|
||||
|
||||
Args:
|
||||
file_path: Path to file
|
||||
|
||||
Yields:
|
||||
Fixed-size chunks
|
||||
"""
|
||||
with open(file_path, 'rb') as f:
|
||||
while True:
|
||||
chunk = f.read(self.chunk_size)
|
||||
@@ -166,76 +73,31 @@ class SimpleChunker:
|
||||
break
|
||||
yield chunk
|
||||
|
||||
|
||||
def hash_chunk(chunk: bytes, algorithm: str = 'sha256') -> str:
    """Hash a chunk of data.

    Args:
        chunk: Chunk data.
        algorithm: Name of a hash algorithm accepted by ``hashlib.new``
            (default: sha256).

    Returns:
        Hex digest of the hash.

    Raises:
        ValueError: If ``hashlib`` does not support ``algorithm``.
    """
    # hashlib.new accepts the initial data directly, so no separate
    # update() call is needed.
    return hashlib.new(algorithm, chunk).hexdigest()
|
||||
|
||||
|
||||
def hash_file(file_path: Path, algorithm: str = 'sha256', chunk_size: int = 65536) -> str:
    """Hash an entire file, reading it incrementally.

    Args:
        file_path: Path to the file.
        algorithm: Name of a hash algorithm accepted by ``hashlib.new``
            (default: sha256).
        chunk_size: Number of bytes to read per iteration.

    Returns:
        Hex digest of the file hash.

    Raises:
        ValueError: If ``hashlib`` does not support ``algorithm``.
        OSError: If the file cannot be opened or read.
    """
    hasher = hashlib.new(algorithm)

    with open(file_path, 'rb') as f:
        # An empty read marks end of file and ends the loop.
        while chunk := f.read(chunk_size):
            hasher.update(chunk)

    return hasher.hexdigest()
|
||||
|
||||
|
||||
def compute_file_signature(
    file_path: Path,
    use_rabin: bool = True,
    avg_chunk_size: int = 8192
) -> tuple[str, list[str]]:
    """Compute a file signature together with its per-chunk hashes.

    Args:
        file_path: Path to the file.
        use_rabin: Whether to use Rabin chunking (vs fixed-size).
        avg_chunk_size: Average chunk size for Rabin chunking, or the
            fixed chunk size when ``use_rabin`` is false.

    Returns:
        Tuple of (file_hash, list of chunk hashes). The file hash is a
        SHA-256 hex digest; chunk hashes use ``hash_chunk``'s default
        algorithm.
    """
    if use_rabin:
        chunker = RabinChunker(avg_chunk_size=avg_chunk_size)
    else:
        chunker = SimpleChunker(chunk_size=avg_chunk_size)

    chunk_hashes = []
    file_hasher = hashlib.sha256()

    for chunk in chunker.chunk_file(file_path):
        # Hash the individual chunk.
        chunk_hashes.append(hash_chunk(chunk))

        # Feed the same bytes into the running file hash.
        # NOTE(review): this equals the whole-file digest only if the
        # chunker yields consecutive, non-overlapping chunks in order.
        file_hasher.update(chunk)

    return file_hasher.hexdigest(), chunk_hashes
|
||||
|
||||
Reference in New Issue
Block a user