This commit is contained in:
mike
2025-12-12 23:04:51 +01:00
parent 56b2db82fc
commit 87550e426a
14 changed files with 132 additions and 122 deletions

View File

@@ -70,17 +70,17 @@ class DeduplicationEngine:
if disk:
cursor.execute("""
SELECT path, size
FROM files_bak
WHERE disk = %s AND checksum IS NULL
FROM files
WHERE disk_label = %s AND checksum IS NULL
ORDER BY size DESC
""", (disk,))
""", (disk,))
else:
cursor.execute("""
SELECT path, size
FROM files_bak
FROM files
WHERE checksum IS NULL
ORDER BY size DESC
""")
""")
files_to_process = cursor.fetchall()
total_files = len(files_to_process)
@@ -108,10 +108,10 @@ class DeduplicationEngine:
if checksum:
# Update database
cursor.execute("""
UPDATE files_bak
UPDATE files
SET checksum = %s, duplicate_of = %s
WHERE path = %s
""", (checksum, duplicate_of, str(path)))
""", (checksum, duplicate_of, str(path)))
stats.files_succeeded += 1
stats.bytes_processed += size
@@ -225,19 +225,19 @@ class DeduplicationEngine:
if disk:
cursor.execute("""
SELECT checksum, array_agg(path ORDER BY path) as paths
FROM files_bak
WHERE disk = %s AND checksum IS NOT NULL
FROM files
WHERE disk_label = %s AND checksum IS NOT NULL
GROUP BY checksum
HAVING COUNT(*) > 1
""", (disk,))
""", (disk,))
else:
cursor.execute("""
SELECT checksum, array_agg(path ORDER BY path) as paths
FROM files_bak
FROM files
WHERE checksum IS NOT NULL
GROUP BY checksum
HAVING COUNT(*) > 1
""")
""")
duplicates = {}
for checksum, paths in cursor.fetchall():
@@ -262,18 +262,18 @@ class DeduplicationEngine:
stats = {}
# Total files
cursor.execute("SELECT COUNT(*) FROM files_bak WHERE checksum IS NOT NULL")
cursor.execute("SELECT COUNT(*) FROM files WHERE checksum IS NOT NULL")
stats['total_files'] = cursor.fetchone()[0]
# Unique files
cursor.execute("SELECT COUNT(DISTINCT checksum) FROM files_bak WHERE checksum IS NOT NULL")
cursor.execute("SELECT COUNT(DISTINCT checksum) FROM files WHERE checksum IS NOT NULL")
stats['unique_files'] = cursor.fetchone()[0]
# Duplicate files
stats['duplicate_files'] = stats['total_files'] - stats['unique_files']
# Total size
cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files_bak WHERE checksum IS NOT NULL")
cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE checksum IS NOT NULL")
stats['total_size'] = cursor.fetchone()[0]
# Unique size
@@ -281,10 +281,10 @@ class DeduplicationEngine:
SELECT COALESCE(SUM(size), 0)
FROM (
SELECT DISTINCT ON (checksum) size
FROM files_bak
FROM files
WHERE checksum IS NOT NULL
) AS unique_files
""")
""")
stats['unique_size'] = cursor.fetchone()[0]
# Wasted space
@@ -321,14 +321,14 @@ class DeduplicationEngine:
cursor.execute("""
WITH canonical AS (
SELECT DISTINCT ON (checksum) path, checksum
FROM files_bak
FROM files
WHERE checksum IS NOT NULL
ORDER BY checksum, path
)
UPDATE files_bak
UPDATE files
SET duplicate_of = NULL
WHERE path IN (SELECT path FROM canonical)
""")
""")
count = cursor.rowcount
conn.commit()