clean up code
This commit is contained in:
@@ -1,412 +1,174 @@
|
||||
"""Hash store for deduplication, backed by PostgreSQL with an in-memory alternative"""
|
||||
from typing import Optional, Dict, Set
|
||||
from pathlib import Path
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
|
||||
from ..shared.config import DatabaseConfig
|
||||
|
||||
|
||||
class HashStore:
    """PostgreSQL-based hash store for deduplication.

    Three tables are used: ``file_hashes`` (one row per distinct file
    checksum), ``chunk_hashes`` (one row per distinct chunk hash) and
    ``file_chunks`` (the ordered chunk list of each file).  A single
    psycopg2 connection is opened lazily and reused between calls.

    Every database operation runs inside ``with conn, conn.cursor()``:
    psycopg2 commits the transaction when the block succeeds, rolls it back
    when it raises, and always closes the cursor.  The connection itself
    stays open until ``close()`` is called.
    """

    # Idempotent DDL, executed once per instance by ``_ensure_tables``.
    _SCHEMA_STATEMENTS = (
        """
        CREATE TABLE IF NOT EXISTS file_hashes (
            checksum TEXT PRIMARY KEY,
            canonical_path TEXT NOT NULL,
            size BIGINT NOT NULL,
            first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            ref_count INTEGER DEFAULT 1
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS chunk_hashes (
            chunk_hash TEXT PRIMARY KEY,
            size INTEGER NOT NULL,
            first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            ref_count INTEGER DEFAULT 1
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS file_chunks (
            id SERIAL PRIMARY KEY,
            file_checksum TEXT NOT NULL,
            chunk_hash TEXT NOT NULL,
            chunk_index INTEGER NOT NULL,
            FOREIGN KEY (file_checksum) REFERENCES file_hashes(checksum),
            FOREIGN KEY (chunk_hash) REFERENCES chunk_hashes(chunk_hash),
            UNIQUE (file_checksum, chunk_index)
        )
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_file_chunks_file
        ON file_chunks(file_checksum)
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_file_chunks_chunk
        ON file_chunks(chunk_hash)
        """,
    )

    def __init__(self, db_config: "DatabaseConfig"):
        """Initialize hash store

        No connection is opened here; it is created on first use.

        Args:
            db_config: Database configuration (host, port, database, user
                and password are read from it when connecting)
        """
        self.db_config = db_config
        self._connection = None
        # Set once the schema has been created through this instance, so
        # the DDL is not replayed on every public call.
        self._tables_ready = False

    def _get_connection(self):
        """Get or create database connection

        Returns:
            The cached psycopg2 connection, reconnecting if it is missing
            or has been closed.
        """
        if self._connection is None or self._connection.closed:
            self._connection = psycopg2.connect(
                host=self.db_config.host,
                port=self.db_config.port,
                database=self.db_config.database,
                user=self.db_config.user,
                password=self.db_config.password,
            )
        return self._connection

    def _ensure_tables(self):
        """Ensure hash store tables exist

        The DDL runs at most once per instance; later calls return
        immediately.
        """
        if self._tables_ready:
            return
        conn = self._get_connection()
        with conn, conn.cursor() as cursor:
            for statement in self._SCHEMA_STATEMENTS:
                cursor.execute(statement)
        self._tables_ready = True

    def _fetch_all(self, query: str, params=None) -> list:
        """Run one read query in its own transaction and return all rows."""
        self._ensure_tables()
        conn = self._get_connection()
        with conn, conn.cursor() as cursor:
            cursor.execute(query, params)
            return cursor.fetchall()

    def exists(self, checksum: str) -> bool:
        """Check if hash exists in store

        Args:
            checksum: File hash to check

        Returns:
            True if hash exists
        """
        rows = self._fetch_all(
            "SELECT 1 FROM file_hashes WHERE checksum = %s LIMIT 1",
            (checksum,),
        )
        return bool(rows)

    def get_canonical(self, checksum: str) -> Optional[str]:
        """Get canonical path for a hash

        Args:
            checksum: File hash

        Returns:
            Canonical file path or None if not found
        """
        rows = self._fetch_all(
            "SELECT canonical_path FROM file_hashes WHERE checksum = %s",
            (checksum,),
        )
        return rows[0][0] if rows else None

    def store_canonical(
        self,
        checksum: str,
        path: Path,
        size: int,
        chunk_hashes: Optional[list[str]] = None
    ) -> None:
        """Store canonical reference for a hash

        If the checksum is already present, its stored path and size are
        kept and only ``ref_count`` is incremented.  All writes happen in
        one transaction, which is rolled back if any statement fails; the
        exception is then re-raised to the caller.

        Args:
            checksum: File hash
            path: Canonical file path
            size: File size in bytes
            chunk_hashes: Optional list of chunk hashes
        """
        self._ensure_tables()
        conn = self._get_connection()
        with conn, conn.cursor() as cursor:
            cursor.execute(
                """
                INSERT INTO file_hashes (checksum, canonical_path, size)
                VALUES (%s, %s, %s)
                ON CONFLICT (checksum) DO UPDATE SET
                    ref_count = file_hashes.ref_count + 1
                """,
                (checksum, str(path), size),
            )

            if chunk_hashes:
                # Per-chunk sizes are not part of this method's interface,
                # so 0 is stored as a placeholder.
                chunk_data = [(chunk_hash, 0) for chunk_hash in chunk_hashes]
                execute_batch(
                    cursor,
                    """
                    INSERT INTO chunk_hashes (chunk_hash, size)
                    VALUES (%s, %s)
                    ON CONFLICT (chunk_hash) DO UPDATE SET
                        ref_count = chunk_hashes.ref_count + 1
                    """,
                    chunk_data,
                    page_size=1000,
                )

                # Chunks must exist before the mappings because of the
                # foreign key on file_chunks.chunk_hash.
                mapping_data = [
                    (checksum, chunk_hash, idx)
                    for idx, chunk_hash in enumerate(chunk_hashes)
                ]
                execute_batch(
                    cursor,
                    """
                    INSERT INTO file_chunks (file_checksum, chunk_hash, chunk_index)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (file_checksum, chunk_index) DO NOTHING
                    """,
                    mapping_data,
                    page_size=1000,
                )

    def get_chunk_hashes(self, checksum: str) -> list[str]:
        """Get chunk hashes for a file

        Args:
            checksum: File hash

        Returns:
            List of chunk hashes in order
        """
        rows = self._fetch_all(
            """
            SELECT chunk_hash
            FROM file_chunks
            WHERE file_checksum = %s
            ORDER BY chunk_index
            """,
            (checksum,),
        )
        return [row[0] for row in rows]

    def get_duplicates(self) -> Dict[str, list[str]]:
        """Get all duplicate file groups

        Reads from a ``files`` table that this class does not create
        (presumably maintained elsewhere in the project; verify).  The
        query has no ORDER BY, so which path of a group becomes the key is
        simply whichever row is returned first.

        Returns:
            Dictionary mapping canonical path to list of duplicate paths
        """
        rows = self._fetch_all(
            """
            SELECT f.path, f.checksum
            FROM files f
            WHERE f.checksum IS NOT NULL
            """
        )

        hash_to_paths: Dict[str, list[str]] = {}
        for path, checksum in rows:
            hash_to_paths.setdefault(checksum, []).append(path)

        # Only checksums shared by more than one file are duplicates.
        return {
            paths[0]: paths[1:]
            for paths in hash_to_paths.values()
            if len(paths) > 1
        }

    def get_stats(self) -> Dict[str, float]:
        """Get hash store statistics

        Returns:
            Dictionary with the integer counters ``unique_files``,
            ``unique_chunks``, ``total_file_refs`` and ``total_chunk_refs``,
            plus the float ``dedup_ratio`` (unique files divided by total
            file references, or 1.0 when there are no references).
        """
        self._ensure_tables()
        conn = self._get_connection()

        counters = (
            ('unique_files', 'SELECT COUNT(*) FROM file_hashes'),
            ('unique_chunks', 'SELECT COUNT(*) FROM chunk_hashes'),
            ('total_file_refs',
             'SELECT COALESCE(SUM(ref_count), 0) FROM file_hashes'),
            ('total_chunk_refs',
             'SELECT COALESCE(SUM(ref_count), 0) FROM chunk_hashes'),
        )

        stats: Dict[str, float] = {}
        with conn, conn.cursor() as cursor:
            for key, query in counters:
                cursor.execute(query)
                stats[key] = cursor.fetchone()[0]

        if stats['total_file_refs'] > 0:
            stats['dedup_ratio'] = stats['unique_files'] / stats['total_file_refs']
        else:
            stats['dedup_ratio'] = 1.0

        return stats

    def find_similar_files(self, checksum: str, threshold: float = 0.8) -> list[tuple[str, float]]:
        """Find files similar to given hash based on chunk overlap

        Similarity is the Jaccard index of the two files' chunk-hash sets.
        Only files sharing at least one chunk with the target are compared.

        Args:
            checksum: File hash to compare
            threshold: Similarity threshold (0.0 to 1.0)

        Returns:
            List of tuples (other_checksum, similarity_score), sorted by
            similarity in descending order
        """
        target_chunks = set(self.get_chunk_hashes(checksum))
        if not target_chunks:
            return []

        candidates = self._fetch_all(
            """
            SELECT DISTINCT fc.file_checksum
            FROM file_chunks fc
            WHERE fc.chunk_hash = ANY(%s)
              AND fc.file_checksum != %s
            """,
            (list(target_chunks), checksum),
        )

        similar_files = []
        for (other_checksum,) in candidates:
            other_chunks = set(self.get_chunk_hashes(other_checksum))
            # The union cannot be empty because target_chunks is non-empty.
            similarity = (
                len(target_chunks & other_chunks)
                / len(target_chunks | other_chunks)
            )
            if similarity >= threshold:
                similar_files.append((other_checksum, similarity))

        similar_files.sort(key=lambda item: item[1], reverse=True)
        return similar_files

    def close(self):
        """Close database connection"""
        if self._connection is not None and not self._connection.closed:
            self._connection.close()
        # Drop the reference; _get_connection() reconnects on demand.
        self._connection = None

    def __enter__(self):
        """Context manager entry"""
        self._ensure_tables()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit"""
        self.close()
|
||||
|
||||
|
||||
class MemoryHashStore:
    """In-memory hash store for testing and small datasets

    Follows the same conventions as the PostgreSQL-backed store: storing a
    checksum that is already present keeps the first path and size and
    counts one more reference.
    """

    def __init__(self):
        """Initialize in-memory hash store"""
        # checksum -> (canonical path, size in bytes)
        self.hashes: Dict[str, tuple[str, int]] = {}
        # checksum -> number of times the checksum was stored
        self.ref_counts: Dict[str, int] = {}
        # chunk hash -> number of times the chunk was referenced
        self.chunks: Dict[str, int] = {}
        # checksum -> ordered chunk hashes of that file
        self.file_chunks: Dict[str, list[str]] = {}

    def exists(self, checksum: str) -> bool:
        """Check if hash exists"""
        return checksum in self.hashes

    def get_canonical(self, checksum: str) -> Optional[str]:
        """Get canonical path, or None if the checksum is unknown"""
        entry = self.hashes.get(checksum)
        return entry[0] if entry is not None else None

    def store_canonical(
        self,
        checksum: str,
        path: Path,
        size: int,
        chunk_hashes: Optional[list[str]] = None
    ) -> None:
        """Store canonical reference

        Args:
            checksum: File hash
            path: Canonical file path (ignored if the checksum is already
                stored, so the first path stays canonical)
            size: File size in bytes
            chunk_hashes: Optional list of chunk hashes
        """
        # First writer wins; later stores only add a reference.
        self.hashes.setdefault(checksum, (str(path), size))
        self.ref_counts[checksum] = self.ref_counts.get(checksum, 0) + 1

        if chunk_hashes:
            # Keep chunk indexes that are already mapped and append only
            # the new ones.  Slicing copies, so the caller's list is never
            # aliased.
            mapped = self.file_chunks.setdefault(checksum, [])
            mapped.extend(chunk_hashes[len(mapped):])
            for chunk_hash in chunk_hashes:
                self.chunks[chunk_hash] = self.chunks.get(chunk_hash, 0) + 1

    def get_chunk_hashes(self, checksum: str) -> list[str]:
        """Get chunk hashes in order (a copy; empty if none are stored)"""
        return list(self.file_chunks.get(checksum, []))

    def get_stats(self) -> Dict[str, float]:
        """Get statistics

        Returns:
            Dictionary with the integer counters ``unique_files``,
            ``unique_chunks``, ``total_file_refs`` and ``total_chunk_refs``,
            plus the float ``dedup_ratio`` (unique files divided by total
            file references, or 1.0 when nothing has been stored).
        """
        unique_files = len(self.hashes)
        total_file_refs = sum(self.ref_counts.values())
        return {
            'unique_files': unique_files,
            'unique_chunks': len(self.chunks),
            'total_file_refs': total_file_refs,
            'total_chunk_refs': sum(self.chunks.values()),
            'dedup_ratio': (
                unique_files / total_file_refs if total_file_refs > 0 else 1.0
            ),
        }

    def find_similar_files(self, checksum: str, threshold: float = 0.8) -> list[tuple[str, float]]:
        """Find files similar to given hash based on chunk overlap

        Similarity is the Jaccard index of the two files' chunk-hash sets.

        Args:
            checksum: File hash to compare
            threshold: Similarity threshold (0.0 to 1.0)

        Returns:
            List of tuples (other_checksum, similarity_score), sorted by
            similarity in descending order
        """
        target_chunks = set(self.file_chunks.get(checksum, []))
        if not target_chunks:
            return []

        similar_files = []
        for other_checksum, chunk_list in self.file_chunks.items():
            if other_checksum == checksum:
                continue
            other_chunks = set(chunk_list)
            shared = len(target_chunks & other_chunks)
            if shared == 0:
                # Files with no chunk in common are never candidates.
                continue
            similarity = shared / len(target_chunks | other_chunks)
            if similarity >= threshold:
                similar_files.append((other_checksum, similarity))

        similar_files.sort(key=lambda item: item[1], reverse=True)
        return similar_files

    def close(self):
        """No-op for compatibility"""

    def __enter__(self):
        """Context manager entry"""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit"""
|
||||
|
||||
Reference in New Issue
Block a user