from typing import Optional, Dict, Set
from pathlib import Path

import psycopg2
from psycopg2.extras import execute_batch

from ..shared.config import DatabaseConfig


class HashStore:
    """PostgreSQL-backed store of file and chunk hashes for deduplication.

    Tracks whole-file checksums (``file_hashes``), chunk hashes
    (``chunk_hashes``) and the ordered file-to-chunk mapping
    (``file_chunks``), with reference counts for dedup statistics.
    """

    def __init__(self, db_config: DatabaseConfig):
        self.db_config = db_config
        self._connection = None
        # The DDL in _ensure_tables() is idempotent but not free: previously
        # it ran on every public call. Run it at most once per instance.
        self._tables_ensured = False

    def _get_connection(self):
        """Return the cached psycopg2 connection, reconnecting if it closed."""
        if self._connection is None or self._connection.closed:
            self._connection = psycopg2.connect(
                host=self.db_config.host,
                port=self.db_config.port,
                database=self.db_config.database,
                user=self.db_config.user,
                password=self.db_config.password,
            )
        return self._connection

    def _ensure_tables(self) -> None:
        """Create the hash tables and indexes if they do not exist.

        Idempotent; short-circuits after the first successful run so the
        DDL round-trips are not repeated on every query.
        """
        if self._tables_ensured:
            return
        conn = self._get_connection()
        # Context-managed cursor: closed even if an execute raises.
        with conn.cursor() as cursor:
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS file_hashes (
                    checksum TEXT PRIMARY KEY,
                    canonical_path TEXT NOT NULL,
                    size BIGINT NOT NULL,
                    first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    ref_count INTEGER DEFAULT 1
                )
            """)
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS chunk_hashes (
                    chunk_hash TEXT PRIMARY KEY,
                    size INTEGER NOT NULL,
                    first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    ref_count INTEGER DEFAULT 1
                )
            """)
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS file_chunks (
                    id SERIAL PRIMARY KEY,
                    file_checksum TEXT NOT NULL,
                    chunk_hash TEXT NOT NULL,
                    chunk_index INTEGER NOT NULL,
                    FOREIGN KEY (file_checksum) REFERENCES file_hashes(checksum),
                    FOREIGN KEY (chunk_hash) REFERENCES chunk_hashes(chunk_hash),
                    UNIQUE (file_checksum, chunk_index)
                )
            """)
            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_file_chunks_file
                ON file_chunks(file_checksum)
            """)
            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_file_chunks_chunk
                ON file_chunks(chunk_hash)
            """)
            conn.commit()
        self._tables_ensured = True

    def exists(self, checksum: str) -> bool:
        """Return True if *checksum* is already stored in ``file_hashes``."""
        self._ensure_tables()
        conn = self._get_connection()
        with conn.cursor() as cursor:
            cursor.execute('SELECT 1 FROM file_hashes WHERE checksum = %s LIMIT 1',
                           (checksum,))
            return cursor.fetchone() is not None

    def get_canonical(self, checksum: str) -> Optional[str]:
        """Return the canonical path recorded for *checksum*, or None."""
        self._ensure_tables()
        conn = self._get_connection()
        with conn.cursor() as cursor:
            cursor.execute('SELECT canonical_path FROM file_hashes WHERE checksum = %s',
                           (checksum,))
            result = cursor.fetchone()
        return result[0] if result else None

    def store_canonical(self, checksum: str, path: Path, size: int,
                        chunk_hashes: Optional[list[str]] = None) -> None:
        """Record *path* as the canonical copy for *checksum*.

        Re-storing an existing checksum only bumps its ``ref_count``.
        When *chunk_hashes* is given, chunk rows and the ordered
        file-to-chunk mapping are upserted in the same transaction.

        Raises whatever psycopg2 raises on failure, after rolling back.
        """
        self._ensure_tables()
        conn = self._get_connection()
        with conn.cursor() as cursor:
            try:
                cursor.execute("""
                    INSERT INTO file_hashes (checksum, canonical_path, size)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (checksum) DO UPDATE SET
                        ref_count = file_hashes.ref_count + 1
                """, (checksum, str(path), size))
                if chunk_hashes:
                    # NOTE(review): chunk sizes are stored as 0 placeholders —
                    # the per-chunk size is not available here; confirm whether
                    # callers rely on chunk_hashes.size.
                    chunk_data = [(chunk_hash, 0) for chunk_hash in chunk_hashes]
                    execute_batch(cursor, """
                        INSERT INTO chunk_hashes (chunk_hash, size)
                        VALUES (%s, %s)
                        ON CONFLICT (chunk_hash) DO UPDATE SET
                            ref_count = chunk_hashes.ref_count + 1
                    """, chunk_data, page_size=1000)
                    mapping_data = [(checksum, chunk_hash, idx)
                                    for idx, chunk_hash in enumerate(chunk_hashes)]
                    execute_batch(cursor, """
                        INSERT INTO file_chunks (file_checksum, chunk_hash, chunk_index)
                        VALUES (%s, %s, %s)
                        ON CONFLICT (file_checksum, chunk_index) DO NOTHING
                    """, mapping_data, page_size=1000)
                conn.commit()
            except Exception:
                conn.rollback()
                raise

    def get_chunk_hashes(self, checksum: str) -> list[str]:
        """Return the ordered chunk hashes of the file with *checksum*."""
        self._ensure_tables()
        conn = self._get_connection()
        with conn.cursor() as cursor:
            cursor.execute("""
                SELECT chunk_hash
                FROM file_chunks
                WHERE file_checksum = %s
                ORDER BY chunk_index
            """, (checksum,))
            return [row[0] for row in cursor.fetchall()]

    def get_duplicates(self) -> Dict[str, list[str]]:
        """Group paths sharing a checksum: first-seen path -> other paths.

        NOTE(review): this reads a ``files`` table that _ensure_tables()
        does not create — presumably it is maintained elsewhere in the
        application; confirm the (path, checksum) schema against its owner.
        """
        self._ensure_tables()
        conn = self._get_connection()
        with conn.cursor() as cursor:
            cursor.execute("""
                SELECT f.path, f.checksum
                FROM files f
                WHERE f.checksum IS NOT NULL
            """)
            hash_to_paths: Dict[str, list[str]] = {}
            for path, checksum in cursor.fetchall():
                hash_to_paths.setdefault(checksum, []).append(path)
        # Keep only checksums seen more than once; the first path is the key.
        return {paths[0]: paths[1:]
                for paths in hash_to_paths.values() if len(paths) > 1}

    def get_stats(self) -> Dict[str, float]:
        """Return dedup counters.

        Keys: unique_files, unique_chunks, total_file_refs,
        total_chunk_refs (ints) and dedup_ratio (float in (0, 1];
        1.0 means no deduplication observed).
        """
        self._ensure_tables()
        conn = self._get_connection()
        stats: Dict[str, float] = {}
        with conn.cursor() as cursor:
            cursor.execute('SELECT COUNT(*) FROM file_hashes')
            stats['unique_files'] = cursor.fetchone()[0]
            cursor.execute('SELECT COUNT(*) FROM chunk_hashes')
            stats['unique_chunks'] = cursor.fetchone()[0]
            cursor.execute('SELECT COALESCE(SUM(ref_count), 0) FROM file_hashes')
            stats['total_file_refs'] = cursor.fetchone()[0]
            cursor.execute('SELECT COALESCE(SUM(ref_count), 0) FROM chunk_hashes')
            stats['total_chunk_refs'] = cursor.fetchone()[0]
        if stats['total_file_refs'] > 0:
            stats['dedup_ratio'] = stats['unique_files'] / stats['total_file_refs']
        else:
            stats['dedup_ratio'] = 1.0
        return stats

    def find_similar_files(self, checksum: str,
                           threshold: float = 0.8) -> list[tuple[str, float]]:
        """Find files whose chunk sets overlap the target's.

        Similarity is Jaccard (|intersection| / |union|) over chunk-hash
        sets. Returns (checksum, similarity) pairs with similarity >=
        *threshold*, sorted by similarity, highest first.
        """
        self._ensure_tables()
        target_chunks = set(self.get_chunk_hashes(checksum))
        if not target_chunks:
            return []
        conn = self._get_connection()
        with conn.cursor() as cursor:
            # One query to find candidates sharing at least one chunk.
            cursor.execute("""
                SELECT DISTINCT fc.file_checksum
                FROM file_chunks fc
                WHERE fc.chunk_hash = ANY(%s)
                  AND fc.file_checksum != %s
            """, (list(target_chunks), checksum))
            candidates = [row[0] for row in cursor.fetchall()]
        similar_files: list[tuple[str, float]] = []
        # One chunk-list query per candidate (N+1); acceptable while the
        # candidate set stays small.
        for other_checksum in candidates:
            other_chunks = set(self.get_chunk_hashes(other_checksum))
            union = len(target_chunks | other_chunks)
            if union > 0:
                similarity = len(target_chunks & other_chunks) / union
                if similarity >= threshold:
                    similar_files.append((other_checksum, similarity))
        similar_files.sort(key=lambda x: x[1], reverse=True)
        return similar_files

    def close(self):
        """Close the database connection, if open."""
        if self._connection and not self._connection.closed:
            self._connection.close()
        # Drop the stale handle so _get_connection() reconnects cleanly.
        self._connection = None

    def __enter__(self):
        self._ensure_tables()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
class MemoryHashStore:
    """Dictionary-backed hash store with the same surface as ``HashStore``.

    Keeps everything in process memory; nothing is persisted. Intended as
    a lightweight stand-in (e.g. for tests) where no database is available.
    """

    def __init__(self):
        # checksum -> (canonical path string, file size in bytes)
        self.hashes: Dict[str, tuple[str, int]] = {}
        # chunk hash -> number of times it has been stored
        self.chunks: Dict[str, int] = {}
        # file checksum -> ordered list of its chunk hashes
        self.file_chunks: Dict[str, list[str]] = {}

    def exists(self, checksum: str) -> bool:
        """Return True when *checksum* has already been stored."""
        return checksum in self.hashes

    def get_canonical(self, checksum: str) -> Optional[str]:
        """Return the canonical path recorded for *checksum*, or None."""
        entry = self.hashes.get(checksum)
        if entry is None:
            return None
        return entry[0]

    def store_canonical(self, checksum: str, path: Path, size: int,
                        chunk_hashes: Optional[list[str]] = None) -> None:
        """Record *path*/*size* under *checksum*; optionally its chunks.

        Storing an existing checksum overwrites the previous entry; each
        chunk hash supplied bumps that chunk's reference count.
        """
        self.hashes[checksum] = (str(path), size)
        if chunk_hashes:
            self.file_chunks[checksum] = chunk_hashes
            for ch in chunk_hashes:
                self.chunks[ch] = self.chunks.get(ch, 0) + 1

    def get_chunk_hashes(self, checksum: str) -> list[str]:
        """Return the ordered chunk hashes for *checksum* ([] if unknown)."""
        return self.file_chunks.get(checksum, [])

    def get_stats(self) -> Dict[str, int]:
        """Return the same counter keys as ``HashStore.get_stats``.

        File references are not counted here, so ``total_file_refs``
        mirrors ``unique_files`` and ``dedup_ratio`` is fixed at 1.0.
        """
        total_chunk_refs = sum(self.chunks.values())
        return {
            'unique_files': len(self.hashes),
            'unique_chunks': len(self.chunks),
            'total_file_refs': len(self.hashes),
            'total_chunk_refs': total_chunk_refs,
            'dedup_ratio': 1.0,
        }

    def close(self):
        """No-op: nothing to release for the in-memory store."""
        pass

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass