mike
2025-12-13 00:29:09 +01:00
parent 6449765890
commit 942e87d439
2 changed files with 270 additions and 17 deletions


@@ -1,33 +1,23 @@
#!/usr/bin/env python3
"""
Disk Reorganizer - Safely restructure files across disks to free up one entire disk.
Modes: index, plan, execute, dedupe, merge, report
"""
import os
import sys
import psycopg2
from psycopg2 import sql
from psycopg2.extras import RealDictCursor
import shutil
import hashlib
import argparse
import json
from pathlib import Path
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional
from datetime import datetime
import logging
import time
# Setup logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[logging.FileHandler('disk_reorganizer.log'), logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)
@@ -430,12 +420,186 @@ class DiskReorganizer:
logger.info(f"✓ Disk {plan['target_disk']} is ready for Linux installation!")
logger.info(f" Remember to safely delete original files from {plan['target_disk']}")
def run_deduplication(self, disk: Optional[str] = None, use_chunks: bool = True):
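"""Compute SHA-256 checksums for files missing one and mark duplicates.
Note: use_chunks is accepted here but chunk-level dedup is not acted on in this version."""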
logger.info(f"Starting deduplication{' for disk ' + disk if disk else ''}")
disk_mount_map = {
'SMT': '/media/mike/SMT',
'DISK1': '/media/mike/DISK1',
'LLM': '/media/mike/LLM'
}
conn = self.get_connection()
cursor = conn.cursor()
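# Local helper: stream files in 64 KiB chunks so hashing large files stays memory-bounded.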
def hash_file_local(file_path: Path) -> str:
hasher = hashlib.sha256()
with open(file_path, 'rb') as f:
while chunk := f.read(65536):
hasher.update(chunk)
return hasher.hexdigest()
try:
if disk:
cursor.execute("SELECT path, size, disk_label FROM files WHERE disk_label = %s AND checksum IS NULL ORDER BY size DESC", (disk,))
else:
cursor.execute("SELECT path, size, disk_label FROM files WHERE checksum IS NULL ORDER BY size DESC")
files_to_process = cursor.fetchall()
total = len(files_to_process)
logger.info(f"Found {total} files to hash")
processed = 0
skipped = 0
for path_str, size, disk_label in files_to_process:
try:
mount_point = disk_mount_map.get(disk_label, disk_label)
full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)
if not full_path.exists():
skipped += 1
continue
checksum = hash_file_local(full_path)
cursor.execute("SELECT path FROM files WHERE checksum = %s AND path != %s LIMIT 1", (checksum, path_str))
dup_row = cursor.fetchone()
duplicate_of = dup_row[0] if dup_row else None
cursor.execute("UPDATE files SET checksum = %s, duplicate_of = %s WHERE path = %s", (checksum, duplicate_of, path_str))
processed += 1
if processed % 100 == 0:
conn.commit()
print(f"\rProcessed: {processed}/{total} ({skipped} skipped)", end='', flush=True)
except Exception as e:
skipped += 1
logger.warning(f"Failed to hash {path_str}: {e}")
conn.rollback()  # note: also discards any updates made since the last batch commit
conn.commit()
print()
logger.info(f"Deduplication complete: {processed}/{total} files processed, {skipped} skipped")
finally:
cursor.close()
conn.close()
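# Typical flow (illustrative): hash first, then plan the merge against the same database.
#   tool = DiskReorganizer()
#   tool.run_deduplication(disk='SMT')
#   tool.plan_merge(sources=['SMT', 'DISK1'], target='LLM', output_file='merge_plan.json')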
def plan_merge(self, sources: List[str], target: str, output_file: str,
filter_system: bool = False, network_target: Optional[str] = None):
"""Plan merge of multiple source disks to target with deduplication"""
logger.info(f"Planning merge: {', '.join(sources)}{target or network_target}")
if filter_system:
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from filters import GitignoreFilter
file_filter = GitignoreFilter()
logger.info("System/build file filtering enabled")
conn = self.get_connection()
cursor = conn.cursor()
try:
placeholders = ','.join(['%s'] * len(sources))
cursor.execute(f"""
SELECT path, size, checksum, disk_label, duplicate_of
FROM files
WHERE disk_label IN ({placeholders})
ORDER BY size DESC
""", tuple(sources))
files = cursor.fetchall()
total_files = len(files)
total_size = sum(int(f[1]) for f in files)
unique_files = {}
duplicate_count = 0
duplicate_size = 0
filtered_count = 0
filtered_size = 0
for path, size, checksum, disk_label, duplicate_of in files:
if filter_system and file_filter.should_exclude(path):
filtered_count += 1
filtered_size += int(size)
continue
if not checksum:
continue  # unhashed files are skipped entirely; run dedupe first so they are counted
if checksum in unique_files:
duplicate_count += 1
duplicate_size += int(size)
else:
unique_files[checksum] = (path, int(size), disk_label)
unique_count = len(unique_files)
unique_size = sum(f[1] for f in unique_files.values())
plan = {
'sources': sources,
'target': target or network_target,
'network': network_target is not None,
'total_files': total_files,
'total_size': total_size,
'unique_files': unique_count,
'unique_size': unique_size,
'duplicate_files': duplicate_count,
'duplicate_size': duplicate_size,
'filtered_files': filtered_count if filter_system else 0,
'filtered_size': filtered_size if filter_system else 0,
'space_saved': duplicate_size + (filtered_size if filter_system else 0),
'operations': []
}
for checksum, (path, size, disk_label) in unique_files.items():
plan['operations'].append({
'source_disk': disk_label,
'source_path': path,
'target_disk': target or network_target,
'target_path': path,
'size': size,
'checksum': checksum
})
with open(output_file, 'w') as f:
json.dump(plan, f, indent=2)
logger.info(f"Merge plan saved to {output_file}")
print(f"\n=== MERGE PLAN SUMMARY ===")
print(f"Sources: {', '.join(sources)}")
print(f"Target: {target or network_target}")
print(f"Total files: {total_files:,} ({self.format_size(total_size)})")
if filter_system:
print(f"Filtered (system/build): {filtered_count:,} ({self.format_size(filtered_size)})")
print(f"Unique files: {unique_count:,} ({self.format_size(unique_size)})")
print(f"Duplicates: {duplicate_count:,} ({self.format_size(duplicate_size)})")
print(f"Total space saved: {self.format_size(plan['space_saved'])}")
print(f"Space needed on target: {self.format_size(unique_size)}")
finally:
cursor.close()
conn.close()
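# The resulting merge_plan.json has roughly this shape (values illustrative):
# {
#   "sources": ["SMT", "DISK1"],
#   "target": "LLM",
#   "unique_files": 12345,
#   "space_saved": 6789,
#   "operations": [{"source_disk": "SMT", "source_path": "docs/a.txt",
#                   "target_disk": "LLM", "target_path": "docs/a.txt",
#                   "size": 1024, "checksum": "..."}]
# }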
def generate_report(self, format='text', show_duplicates=False, preview_merge=None):
"""Generate status report"""
conn = self.get_connection()
cursor = conn.cursor()
try:
if preview_merge:
# Load and display merge plan
with open(preview_merge, 'r') as f:
plan = json.load(f)
print("\n=== MERGE PLAN PREVIEW ===")
print(f"Sources: {', '.join(plan['sources'])}")
print(f"Target: {plan['target']}")
print(f"Total files: {plan['total_files']:,} ({self.format_size(plan['total_size'])})")
print(f"Unique files: {plan['unique_files']:,} ({self.format_size(plan['unique_size'])})")
print(f"Duplicates: {plan['duplicate_files']:,} ({self.format_size(plan['duplicate_size'])})")
print(f"Space saved: {self.format_size(plan['space_saved'])}")
print(f"Space needed on target: {self.format_size(plan['unique_size'])}")
return
cursor.execute("""
SELECT status, COUNT(*), SUM(size) FROM files GROUP BY status
""")
@@ -443,7 +607,43 @@ class DiskReorganizer:
print("\n=== FILE MIGRATION REPORT ===")
for row in cursor.fetchall():
status, count, size = row
print(f"{status:15}: {count:6} files, {self.format_size(size or 0)}")
print(f"{status:15}: {count:6} files, {self.format_size(int(size or 0))}")
# Disk usage summary
cursor.execute("""
SELECT disk_label, COUNT(*), SUM(size) FROM files GROUP BY disk_label
""")
print("\n=== DISK USAGE ===")
for row in cursor.fetchall():
disk, count, size = row
print(f"{disk:20}: {count:6} files, {self.format_size(int(size or 0))}")
# Deduplication stats
cursor.execute("""
SELECT COUNT(*), SUM(size) FROM files WHERE checksum IS NOT NULL
""")
hashed_count, hashed_size = cursor.fetchone()
cursor.execute("""
SELECT COUNT(*), SUM(size) FROM files WHERE duplicate_of IS NOT NULL
""")
dup_count, dup_size = cursor.fetchone()
print("\n=== DEDUPLICATION STATS ===")
print(f"Files with checksums: {hashed_count or 0:6}")
print(f"Duplicate files: {dup_count or 0:6} ({self.format_size(int(dup_size or 0))})")
if show_duplicates and dup_count:
print("\n=== DUPLICATE FILES ===")
cursor.execute("""
SELECT path, size, duplicate_of FROM files
WHERE duplicate_of IS NOT NULL
ORDER BY size DESC
LIMIT 20
""")
for path, size, dup_of in cursor.fetchall():
print(f" {path} ({self.format_size(int(size))}) → {dup_of}")
cursor.execute("""
SELECT operation_type, executed, verified, COUNT(*) FROM operations GROUP BY operation_type, executed, verified
@@ -489,8 +689,24 @@ def main():
exec_parser.add_argument('plan_file', help='Path to plan JSON file')
exec_parser.add_argument('--dry-run', action='store_true', help='Simulate without actual file operations')
# Dedupe command
dedupe_parser = subparsers.add_parser('dedupe', help='Deduplicate files and compute checksums')
dedupe_parser.add_argument('--disk', help='Optional: Only dedupe specific disk')
dedupe_parser.add_argument('--no-chunks', action='store_true', help='Disable chunk-level deduplication')
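# Note: --no-chunks only toggles use_chunks, which run_deduplication does not currently act on.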
# Merge command
merge_parser = subparsers.add_parser('merge', help='Plan multi-disk merge with deduplication')
merge_parser.add_argument('--sources', nargs='+', required=True, help='Source disks to merge')
merge_parser.add_argument('--target', required=True, help='Target disk')
merge_parser.add_argument('--output', default='merge_plan.json', help='Output plan file')
merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')
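# Since --target is required, --network currently only marks the plan as network-bound
# rather than replacing the target.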
# Report command
report_parser = subparsers.add_parser('report', help='Show current status')
report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
report_parser.add_argument('--show-duplicates', action='store_true', help='Show duplicate files')
report_parser.add_argument('--preview-merge', help='Preview merge plan from file')
args = parser.parse_args()
tool = DiskReorganizer()
@@ -498,6 +714,13 @@ def main():
if args.command == 'index':
tool.index_disk(args.disk_root, args.disk_name)
elif args.command == 'dedupe':
tool.run_deduplication(disk=args.disk, use_chunks=not args.no_chunks)
elif args.command == 'merge':
tool.plan_merge(sources=args.sources, target=args.target, output_file=args.output,
filter_system=args.filter_system, network_target=args.network)
elif args.command == 'plan':
plan = tool.plan_migration(args.target_disk, args.dest_disks)
if plan:
@@ -508,7 +731,7 @@ def main():
tool.execute_migration(args.plan_file, dry_run=args.dry_run)
elif args.command == 'report':
tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)
if __name__ == '__main__':
main()