# defrag/app/deduplication/store.py

"""Checksum persistence for the deduplication pipeline: a PostgreSQL-backed
HashStore plus a lightweight in-memory stand-in."""

from typing import Optional, Dict
from pathlib import Path

import psycopg2
from psycopg2.extras import execute_batch

from ..shared.config import DatabaseConfig

class HashStore:
    """PostgreSQL-backed store of file checksums, chunk hashes, and their mapping."""

    def __init__(self, db_config: DatabaseConfig):
        self.db_config = db_config
        self._connection = None

    def _get_connection(self):
        # Lazily open a connection, reopening if a previous one was closed.
        if self._connection is None or self._connection.closed:
            self._connection = psycopg2.connect(
                host=self.db_config.host,
                port=self.db_config.port,
                database=self.db_config.database,
                user=self.db_config.user,
                password=self.db_config.password,
            )
        return self._connection

    def _ensure_tables(self):
        conn = self._get_connection()
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS file_hashes (
                checksum TEXT PRIMARY KEY,
                canonical_path TEXT NOT NULL,
                size BIGINT NOT NULL,
                first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                ref_count INTEGER DEFAULT 1
            )
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS chunk_hashes (
                chunk_hash TEXT PRIMARY KEY,
                size INTEGER NOT NULL,
                first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                ref_count INTEGER DEFAULT 1
            )
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS file_chunks (
                id SERIAL PRIMARY KEY,
                file_checksum TEXT NOT NULL,
                chunk_hash TEXT NOT NULL,
                chunk_index INTEGER NOT NULL,
                FOREIGN KEY (file_checksum) REFERENCES file_hashes(checksum),
                FOREIGN KEY (chunk_hash) REFERENCES chunk_hashes(chunk_hash),
                UNIQUE (file_checksum, chunk_index)
            )
        ''')
        cursor.execute('''
            CREATE INDEX IF NOT EXISTS idx_file_chunks_file
            ON file_chunks(file_checksum)
        ''')
        cursor.execute('''
            CREATE INDEX IF NOT EXISTS idx_file_chunks_chunk
            ON file_chunks(chunk_hash)
        ''')
        conn.commit()
        cursor.close()

    def exists(self, checksum: str) -> bool:
        """Return True if a file with this checksum has been stored."""
        self._ensure_tables()
        conn = self._get_connection()
        cursor = conn.cursor()
        cursor.execute('SELECT 1 FROM file_hashes WHERE checksum = %s LIMIT 1', (checksum,))
        exists = cursor.fetchone() is not None
        cursor.close()
        return exists

    def get_canonical(self, checksum: str) -> Optional[str]:
        """Return the canonical path recorded for this checksum, or None."""
        self._ensure_tables()
        conn = self._get_connection()
        cursor = conn.cursor()
        cursor.execute('SELECT canonical_path FROM file_hashes WHERE checksum = %s', (checksum,))
        result = cursor.fetchone()
        cursor.close()
        return result[0] if result else None

    def store_canonical(self, checksum: str, path: Path, size: int,
                        chunk_hashes: Optional[list[str]] = None) -> None:
        """Record a file, and optionally its chunks, in a single transaction.

        Re-storing a known checksum increments its ref_count instead of
        inserting a new row.
        """
        self._ensure_tables()
        conn = self._get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute('''
                INSERT INTO file_hashes (checksum, canonical_path, size)
                VALUES (%s, %s, %s)
                ON CONFLICT (checksum) DO UPDATE SET
                    ref_count = file_hashes.ref_count + 1
            ''', (checksum, str(path), size))
            if chunk_hashes:
                # Chunk sizes are not passed in, so 0 is stored as a placeholder.
                chunk_data = [(chunk_hash, 0) for chunk_hash in chunk_hashes]
                execute_batch(cursor, '''
                    INSERT INTO chunk_hashes (chunk_hash, size)
                    VALUES (%s, %s)
                    ON CONFLICT (chunk_hash) DO UPDATE SET
                        ref_count = chunk_hashes.ref_count + 1
                ''', chunk_data, page_size=1000)
                mapping_data = [(checksum, chunk_hash, idx)
                                for idx, chunk_hash in enumerate(chunk_hashes)]
                execute_batch(cursor, '''
                    INSERT INTO file_chunks (file_checksum, chunk_hash, chunk_index)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (file_checksum, chunk_index) DO NOTHING
                ''', mapping_data, page_size=1000)
            conn.commit()
        except Exception:
            conn.rollback()
            raise
        finally:
            cursor.close()

    def get_chunk_hashes(self, checksum: str) -> list[str]:
        self._ensure_tables()
        conn = self._get_connection()
        cursor = conn.cursor()
        cursor.execute('''
            SELECT chunk_hash
            FROM file_chunks
            WHERE file_checksum = %s
            ORDER BY chunk_index
        ''', (checksum,))
        chunk_hashes = [row[0] for row in cursor.fetchall()]
        cursor.close()
        return chunk_hashes

    def get_duplicates(self) -> Dict[str, list[str]]:
        """Map each canonical path to the paths of its byte-identical duplicates.

        Note: this reads the files table, which is populated by another
        component; it is not part of the schema created in _ensure_tables.
        """
        self._ensure_tables()
        conn = self._get_connection()
        cursor = conn.cursor()
        cursor.execute('''
            SELECT f.path, f.checksum
            FROM files f
            WHERE f.checksum IS NOT NULL
        ''')
        hash_to_paths: Dict[str, list[str]] = {}
        for path, checksum in cursor.fetchall():
            hash_to_paths.setdefault(checksum, []).append(path)
        cursor.close()
        # The first path seen for a checksum is treated as the canonical copy.
        return {paths[0]: paths[1:]
                for paths in hash_to_paths.values()
                if len(paths) > 1}

    def get_stats(self) -> Dict[str, float]:
        self._ensure_tables()
        conn = self._get_connection()
        cursor = conn.cursor()
        stats = {}
        cursor.execute('SELECT COUNT(*) FROM file_hashes')
        stats['unique_files'] = cursor.fetchone()[0]
        cursor.execute('SELECT COUNT(*) FROM chunk_hashes')
        stats['unique_chunks'] = cursor.fetchone()[0]
        cursor.execute('SELECT COALESCE(SUM(ref_count), 0) FROM file_hashes')
        stats['total_file_refs'] = cursor.fetchone()[0]
        cursor.execute('SELECT COALESCE(SUM(ref_count), 0) FROM chunk_hashes')
        stats['total_chunk_refs'] = cursor.fetchone()[0]
        # unique / referenced: 0.5 means each file was seen twice on average.
        if stats['total_file_refs'] > 0:
            stats['dedup_ratio'] = stats['unique_files'] / stats['total_file_refs']
        else:
            stats['dedup_ratio'] = 1.0
        cursor.close()
        return stats

    def find_similar_files(self, checksum: str, threshold: float = 0.8) -> list[tuple[str, float]]:
        """Return (checksum, similarity) pairs for files whose chunk sets have
        Jaccard similarity >= threshold, most similar first."""
        self._ensure_tables()
        conn = self._get_connection()
        cursor = conn.cursor()
        target_chunks = set(self.get_chunk_hashes(checksum))
        if not target_chunks:
            cursor.close()
            return []
        # Candidates are files sharing at least one chunk with the target.
        cursor.execute('''
            SELECT DISTINCT fc.file_checksum
            FROM file_chunks fc
            WHERE fc.chunk_hash = ANY(%s)
              AND fc.file_checksum != %s
        ''', (list(target_chunks), checksum))
        similar_files = []
        for (other_checksum,) in cursor.fetchall():
            other_chunks = set(self.get_chunk_hashes(other_checksum))
            intersection = len(target_chunks & other_chunks)
            union = len(target_chunks | other_chunks)
            if union > 0:
                similarity = intersection / union
                if similarity >= threshold:
                    similar_files.append((other_checksum, similarity))
        cursor.close()
        similar_files.sort(key=lambda pair: pair[1], reverse=True)
        return similar_files
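
    # Worked example (illustrative hashes, not real data): chunk sets
    # {a, b, c} and {b, c, d} share 2 of 4 distinct chunks, giving a Jaccard
    # similarity of 2 / 4 = 0.5, which is below the default threshold of 0.8.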
    def close(self):
        if self._connection and not self._connection.closed:
            self._connection.close()

    def __enter__(self):
        self._ensure_tables()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
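
# Example usage, as a sketch: it assumes DatabaseConfig exposes the
# host/port/database/user/password fields read in _get_connection, and the
# values below are placeholders.
#
#     config = DatabaseConfig(host='localhost', port=5432, database='defrag',
#                             user='defrag', password='...')
#     with HashStore(config) as store:
#         if not store.exists(checksum):
#             store.store_canonical(checksum, Path('/data/report.pdf'), 4096)
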
class MemoryHashStore:
    """In-memory stand-in for HashStore covering the core operations
    (no duplicate or similarity queries)."""

    def __init__(self):
        self.hashes: Dict[str, tuple[str, int]] = {}    # checksum -> (path, size)
        self.chunks: Dict[str, int] = {}                # chunk_hash -> ref count
        self.file_chunks: Dict[str, list[str]] = {}     # checksum -> ordered chunks

    def exists(self, checksum: str) -> bool:
        return checksum in self.hashes

    def get_canonical(self, checksum: str) -> Optional[str]:
        return self.hashes.get(checksum, (None, 0))[0]

    def store_canonical(self, checksum: str, path: Path, size: int,
                        chunk_hashes: Optional[list[str]] = None) -> None:
        self.hashes[checksum] = (str(path), size)
        if chunk_hashes:
            self.file_chunks[checksum] = chunk_hashes
            for chunk_hash in chunk_hashes:
                self.chunks[chunk_hash] = self.chunks.get(chunk_hash, 0) + 1

    def get_chunk_hashes(self, checksum: str) -> list[str]:
        return self.file_chunks.get(checksum, [])

    def get_stats(self) -> Dict[str, float]:
        return {
            'unique_files': len(self.hashes),
            'unique_chunks': len(self.chunks),
            'total_file_refs': len(self.hashes),
            'total_chunk_refs': sum(self.chunks.values()),
            'dedup_ratio': 1.0,
        }

    def close(self):
        pass

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass
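
if __name__ == '__main__':
    # Minimal smoke test using the in-memory store; no database required.
    # The checksum, path, and chunk hashes are made-up placeholders. Run as
    # `python -m defrag.app.deduplication.store` so the relative import resolves.
    with MemoryHashStore() as store:
        store.store_canonical('abc123', Path('/data/report.pdf'), 4096,
                              chunk_hashes=['c1', 'c2', 'c3'])
        assert store.exists('abc123')
        assert store.get_canonical('abc123') == '/data/report.pdf'
        assert store.get_chunk_hashes('abc123') == ['c1', 'c2', 'c3']
        print(store.get_stats())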