"""Hash store for deduplication with optional Redis support""" from typing import Optional, Dict, Set from pathlib import Path import psycopg2 from psycopg2.extras import execute_batch from ..shared.config import DatabaseConfig class HashStore: """PostgreSQL-based hash store for deduplication""" def __init__(self, db_config: DatabaseConfig): """Initialize hash store Args: db_config: Database configuration """ self.db_config = db_config self._connection = None def _get_connection(self): """Get or create database connection""" if self._connection is None or self._connection.closed: self._connection = psycopg2.connect( host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password ) return self._connection def _ensure_tables(self): """Ensure hash store tables exist""" conn = self._get_connection() cursor = conn.cursor() # Create hashes table for file-level deduplication cursor.execute(""" CREATE TABLE IF NOT EXISTS file_hashes ( checksum TEXT PRIMARY KEY, canonical_path TEXT NOT NULL, size BIGINT NOT NULL, first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP, ref_count INTEGER DEFAULT 1 ) """) # Create chunk hashes table for chunk-level deduplication cursor.execute(""" CREATE TABLE IF NOT EXISTS chunk_hashes ( chunk_hash TEXT PRIMARY KEY, size INTEGER NOT NULL, first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP, ref_count INTEGER DEFAULT 1 ) """) # Create file-chunk mapping table cursor.execute(""" CREATE TABLE IF NOT EXISTS file_chunks ( id SERIAL PRIMARY KEY, file_checksum TEXT NOT NULL, chunk_hash TEXT NOT NULL, chunk_index INTEGER NOT NULL, FOREIGN KEY (file_checksum) REFERENCES file_hashes(checksum), FOREIGN KEY (chunk_hash) REFERENCES chunk_hashes(chunk_hash), UNIQUE (file_checksum, chunk_index) ) """) # Create indexes cursor.execute(""" CREATE INDEX IF NOT EXISTS idx_file_chunks_file ON file_chunks(file_checksum) """) cursor.execute(""" CREATE INDEX IF NOT EXISTS idx_file_chunks_chunk ON file_chunks(chunk_hash) """) conn.commit() cursor.close() def exists(self, checksum: str) -> bool: """Check if hash exists in store Args: checksum: File hash to check Returns: True if hash exists """ self._ensure_tables() conn = self._get_connection() cursor = conn.cursor() cursor.execute( "SELECT 1 FROM file_hashes WHERE checksum = %s LIMIT 1", (checksum,) ) exists = cursor.fetchone() is not None cursor.close() return exists def get_canonical(self, checksum: str) -> Optional[str]: """Get canonical path for a hash Args: checksum: File hash Returns: Canonical file path or None if not found """ self._ensure_tables() conn = self._get_connection() cursor = conn.cursor() cursor.execute( "SELECT canonical_path FROM file_hashes WHERE checksum = %s", (checksum,) ) result = cursor.fetchone() cursor.close() return result[0] if result else None def store_canonical( self, checksum: str, path: Path, size: int, chunk_hashes: Optional[list[str]] = None ) -> None: """Store canonical reference for a hash Args: checksum: File hash path: Canonical file path size: File size in bytes chunk_hashes: Optional list of chunk hashes """ self._ensure_tables() conn = self._get_connection() cursor = conn.cursor() try: # Store file hash cursor.execute(""" INSERT INTO file_hashes (checksum, canonical_path, size) VALUES (%s, %s, %s) ON CONFLICT (checksum) DO UPDATE SET ref_count = file_hashes.ref_count + 1 """, (checksum, str(path), size)) # Store chunk hashes if provided if chunk_hashes: # Insert chunk hashes chunk_data = [(chunk_hash, 0) for chunk_hash in chunk_hashes] execute_batch(cursor, """ INSERT INTO chunk_hashes (chunk_hash, size) VALUES (%s, %s) ON CONFLICT (chunk_hash) DO UPDATE SET ref_count = chunk_hashes.ref_count + 1 """, chunk_data, page_size=1000) # Create file-chunk mappings mapping_data = [ (checksum, chunk_hash, idx) for idx, chunk_hash in enumerate(chunk_hashes) ] execute_batch(cursor, """ INSERT INTO file_chunks (file_checksum, chunk_hash, chunk_index) VALUES (%s, %s, %s) ON CONFLICT (file_checksum, chunk_index) DO NOTHING """, mapping_data, page_size=1000) conn.commit() except Exception as e: conn.rollback() raise finally: cursor.close() def get_chunk_hashes(self, checksum: str) -> list[str]: """Get chunk hashes for a file Args: checksum: File hash Returns: List of chunk hashes in order """ self._ensure_tables() conn = self._get_connection() cursor = conn.cursor() cursor.execute(""" SELECT chunk_hash FROM file_chunks WHERE file_checksum = %s ORDER BY chunk_index """, (checksum,)) chunk_hashes = [row[0] for row in cursor.fetchall()] cursor.close() return chunk_hashes def get_duplicates(self) -> Dict[str, list[str]]: """Get all duplicate file groups Returns: Dictionary mapping canonical path to list of duplicate paths """ self._ensure_tables() conn = self._get_connection() cursor = conn.cursor() # Get all files with their hashes cursor.execute(""" SELECT f.path, f.checksum FROM files f WHERE f.checksum IS NOT NULL """) # Group by checksum hash_to_paths: Dict[str, list[str]] = {} for path, checksum in cursor.fetchall(): if checksum not in hash_to_paths: hash_to_paths[checksum] = [] hash_to_paths[checksum].append(path) cursor.close() # Filter to only duplicates (more than one file) duplicates = { paths[0]: paths[1:] for checksum, paths in hash_to_paths.items() if len(paths) > 1 } return duplicates def get_stats(self) -> Dict[str, int]: """Get hash store statistics Returns: Dictionary with statistics """ self._ensure_tables() conn = self._get_connection() cursor = conn.cursor() stats = {} # Count unique file hashes cursor.execute("SELECT COUNT(*) FROM file_hashes") stats['unique_files'] = cursor.fetchone()[0] # Count unique chunk hashes cursor.execute("SELECT COUNT(*) FROM chunk_hashes") stats['unique_chunks'] = cursor.fetchone()[0] # Count total references cursor.execute("SELECT COALESCE(SUM(ref_count), 0) FROM file_hashes") stats['total_file_refs'] = cursor.fetchone()[0] # Count total chunk references cursor.execute("SELECT COALESCE(SUM(ref_count), 0) FROM chunk_hashes") stats['total_chunk_refs'] = cursor.fetchone()[0] # Calculate deduplication ratio if stats['total_file_refs'] > 0: stats['dedup_ratio'] = stats['unique_files'] / stats['total_file_refs'] else: stats['dedup_ratio'] = 1.0 cursor.close() return stats def find_similar_files(self, checksum: str, threshold: float = 0.8) -> list[tuple[str, float]]: """Find files similar to given hash based on chunk overlap Args: checksum: File hash to compare threshold: Similarity threshold (0.0 to 1.0) Returns: List of tuples (other_checksum, similarity_score) """ self._ensure_tables() conn = self._get_connection() cursor = conn.cursor() # Get chunks for the target file target_chunks = set(self.get_chunk_hashes(checksum)) if not target_chunks: cursor.close() return [] # Find files sharing chunks cursor.execute(""" SELECT DISTINCT fc.file_checksum FROM file_chunks fc WHERE fc.chunk_hash = ANY(%s) AND fc.file_checksum != %s """, (list(target_chunks), checksum)) similar_files = [] for (other_checksum,) in cursor.fetchall(): other_chunks = set(self.get_chunk_hashes(other_checksum)) # Calculate Jaccard similarity intersection = len(target_chunks & other_chunks) union = len(target_chunks | other_chunks) if union > 0: similarity = intersection / union if similarity >= threshold: similar_files.append((other_checksum, similarity)) cursor.close() # Sort by similarity descending similar_files.sort(key=lambda x: x[1], reverse=True) return similar_files def close(self): """Close database connection""" if self._connection and not self._connection.closed: self._connection.close() def __enter__(self): """Context manager entry""" self._ensure_tables() return self def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit""" self.close() class MemoryHashStore: """In-memory hash store for testing and small datasets""" def __init__(self): """Initialize in-memory hash store""" self.hashes: Dict[str, tuple[str, int]] = {} self.chunks: Dict[str, int] = {} self.file_chunks: Dict[str, list[str]] = {} def exists(self, checksum: str) -> bool: """Check if hash exists""" return checksum in self.hashes def get_canonical(self, checksum: str) -> Optional[str]: """Get canonical path""" return self.hashes.get(checksum, (None, 0))[0] def store_canonical( self, checksum: str, path: Path, size: int, chunk_hashes: Optional[list[str]] = None ) -> None: """Store canonical reference""" self.hashes[checksum] = (str(path), size) if chunk_hashes: self.file_chunks[checksum] = chunk_hashes for chunk_hash in chunk_hashes: self.chunks[chunk_hash] = self.chunks.get(chunk_hash, 0) + 1 def get_chunk_hashes(self, checksum: str) -> list[str]: """Get chunk hashes""" return self.file_chunks.get(checksum, []) def get_stats(self) -> Dict[str, int]: """Get statistics""" return { 'unique_files': len(self.hashes), 'unique_chunks': len(self.chunks), 'total_file_refs': len(self.hashes), 'total_chunk_refs': sum(self.chunks.values()), 'dedup_ratio': 1.0 } def close(self): """No-op for compatibility""" pass def __enter__(self): """Context manager entry""" return self def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit""" pass