mike
2025-12-13 00:29:09 +01:00
parent 6449765890
commit 942e87d439
2 changed files with 270 additions and 17 deletions


@@ -1,33 +1,23 @@
#!/usr/bin/env python3
"""
Disk Reorganizer - Safely restructure files across disks to free up one entire disk.
Modes: index, plan, execute, dedupe, merge, report
"""
import os
import sys
import psycopg2
from psycopg2 import sql
from psycopg2.extras import RealDictCursor
import shutil
import hashlib
import argparse
import json
from pathlib import Path
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional
from datetime import datetime
import logging
import time
# Setup logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[logging.FileHandler('disk_reorganizer.log'), logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)
@@ -430,12 +420,186 @@ class DiskReorganizer:
logger.info(f"✓ Disk {plan['target_disk']} is ready for Linux installation!")
logger.info(f" Remember to safely delete original files from {plan['target_disk']}")
def run_deduplication(self, disk: Optional[str] = None, use_chunks: bool = True):
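"""Compute SHA-256 checksums for files missing one and mark duplicates.
Note: use_chunks is accepted here but chunk-level dedup is not acted on in this version."""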
logger.info(f"Starting deduplication{' for disk ' + disk if disk else ''}")
disk_mount_map = {
'SMT': '/media/mike/SMT',
'DISK1': '/media/mike/DISK1',
'LLM': '/media/mike/LLM'
}
conn = self.get_connection()
cursor = conn.cursor()
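# Local helper: stream files in 64 KiB chunks so hashing large files stays memory-bounded.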
def hash_file_local(file_path: Path) -> str:
hasher = hashlib.sha256()
with open(file_path, 'rb') as f:
while chunk := f.read(65536):
hasher.update(chunk)
return hasher.hexdigest()
try:
if disk:
cursor.execute("SELECT path, size, disk_label FROM files WHERE disk_label = %s AND checksum IS NULL ORDER BY size DESC", (disk,))
else:
cursor.execute("SELECT path, size, disk_label FROM files WHERE checksum IS NULL ORDER BY size DESC")
files_to_process = cursor.fetchall()
total = len(files_to_process)
logger.info(f"Found {total} files to hash")
processed = 0
skipped = 0
for path_str, size, disk_label in files_to_process:
try:
mount_point = disk_mount_map.get(disk_label, disk_label)
full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)
if not full_path.exists():
skipped += 1
continue
checksum = hash_file_local(full_path)
cursor.execute("SELECT path FROM files WHERE checksum = %s AND path != %s LIMIT 1", (checksum, path_str))
dup_row = cursor.fetchone()
duplicate_of = dup_row[0] if dup_row else None
cursor.execute("UPDATE files SET checksum = %s, duplicate_of = %s WHERE path = %s", (checksum, duplicate_of, path_str))
processed += 1
if processed % 100 == 0:
conn.commit()
print(f"\rProcessed: {processed}/{total} ({skipped} skipped)", end='', flush=True)
except Exception as e:
skipped += 1
logger.warning(f"Failed to hash {path_str}: {e}")
conn.rollback()  # note: also discards any updates made since the last batch commit
conn.commit()
print()
logger.info(f"Deduplication complete: {processed}/{total} files processed, {skipped} skipped")
finally:
cursor.close()
conn.close()
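# Typical flow (illustrative): hash first, then plan the merge against the same database.
#   tool = DiskReorganizer()
#   tool.run_deduplication(disk='SMT')
#   tool.plan_merge(sources=['SMT', 'DISK1'], target='LLM', output_file='merge_plan.json')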
def plan_merge(self, sources: List[str], target: str, output_file: str,
filter_system: bool = False, network_target: Optional[str] = None):
"""Plan merge of multiple source disks to target with deduplication"""
logger.info(f"Planning merge: {', '.join(sources)}{target or network_target}")
if filter_system:
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from filters import GitignoreFilter
file_filter = GitignoreFilter()
logger.info("System/build file filtering enabled")
conn = self.get_connection()
cursor = conn.cursor()
try:
placeholders = ','.join(['%s'] * len(sources))
cursor.execute(f"""
SELECT path, size, checksum, disk_label, duplicate_of
FROM files
WHERE disk_label IN ({placeholders})
ORDER BY size DESC
""", tuple(sources))
files = cursor.fetchall()
total_files = len(files)
total_size = sum(int(f[1]) for f in files)
unique_files = {}
duplicate_count = 0
duplicate_size = 0
filtered_count = 0
filtered_size = 0
for path, size, checksum, disk_label, duplicate_of in files:
if filter_system and file_filter.should_exclude(path):
filtered_count += 1
filtered_size += int(size)
continue
if not checksum:
continue  # unhashed files are skipped entirely; run dedupe first so they are counted
if checksum in unique_files:
duplicate_count += 1
duplicate_size += int(size)
else:
unique_files[checksum] = (path, int(size), disk_label)
unique_count = len(unique_files)
unique_size = sum(f[1] for f in unique_files.values())
plan = {
'sources': sources,
'target': target or network_target,
'network': network_target is not None,
'total_files': total_files,
'total_size': total_size,
'unique_files': unique_count,
'unique_size': unique_size,
'duplicate_files': duplicate_count,
'duplicate_size': duplicate_size,
'filtered_files': filtered_count if filter_system else 0,
'filtered_size': filtered_size if filter_system else 0,
'space_saved': duplicate_size + (filtered_size if filter_system else 0),
'operations': []
}
for checksum, (path, size, disk_label) in unique_files.items():
plan['operations'].append({
'source_disk': disk_label,
'source_path': path,
'target_disk': target or network_target,
'target_path': path,
'size': size,
'checksum': checksum
})
with open(output_file, 'w') as f:
json.dump(plan, f, indent=2)
logger.info(f"Merge plan saved to {output_file}")
print(f"\n=== MERGE PLAN SUMMARY ===")
print(f"Sources: {', '.join(sources)}")
print(f"Target: {target or network_target}")
print(f"Total files: {total_files:,} ({self.format_size(total_size)})")
if filter_system:
print(f"Filtered (system/build): {filtered_count:,} ({self.format_size(filtered_size)})")
print(f"Unique files: {unique_count:,} ({self.format_size(unique_size)})")
print(f"Duplicates: {duplicate_count:,} ({self.format_size(duplicate_size)})")
print(f"Total space saved: {self.format_size(plan['space_saved'])}")
print(f"Space needed on target: {self.format_size(unique_size)}")
finally:
cursor.close()
conn.close()
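# The resulting merge_plan.json has roughly this shape (values illustrative):
# {
#   "sources": ["SMT", "DISK1"],
#   "target": "LLM",
#   "unique_files": 12345,
#   "space_saved": 6789,
#   "operations": [{"source_disk": "SMT", "source_path": "docs/a.txt",
#                   "target_disk": "LLM", "target_path": "docs/a.txt",
#                   "size": 1024, "checksum": "..."}]
# }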
def generate_report(self, format='text', show_duplicates=False, preview_merge=None):
"""Generate status report"""
conn = self.get_connection()
cursor = conn.cursor()
try:
if preview_merge:
# Load and display merge plan
with open(preview_merge, 'r') as f:
plan = json.load(f)
print("\n=== MERGE PLAN PREVIEW ===")
print(f"Sources: {', '.join(plan['sources'])}")
print(f"Target: {plan['target']}")
print(f"Total files: {plan['total_files']:,} ({self.format_size(plan['total_size'])})")
print(f"Unique files: {plan['unique_files']:,} ({self.format_size(plan['unique_size'])})")
print(f"Duplicates: {plan['duplicate_files']:,} ({self.format_size(plan['duplicate_size'])})")
print(f"Space saved: {self.format_size(plan['space_saved'])}")
print(f"Space needed on target: {self.format_size(plan['unique_size'])}")
return
cursor.execute("""
SELECT status, COUNT(*), SUM(size) FROM files GROUP BY status
""")
@@ -443,7 +607,43 @@ class DiskReorganizer:
print("\n=== FILE MIGRATION REPORT ===")
for row in cursor.fetchall():
status, count, size = row
print(f"{status:15}: {count:6} files, {self.format_size(size or 0)}")
print(f"{status:15}: {count:6} files, {self.format_size(int(size or 0))}")
# Disk usage summary
cursor.execute("""
SELECT disk_label, COUNT(*), SUM(size) FROM files GROUP BY disk_label
""")
print("\n=== DISK USAGE ===")
for row in cursor.fetchall():
disk, count, size = row
print(f"{disk:20}: {count:6} files, {self.format_size(int(size or 0))}")
# Deduplication stats
cursor.execute("""
SELECT COUNT(*), SUM(size) FROM files WHERE checksum IS NOT NULL
""")
hashed_count, hashed_size = cursor.fetchone()
cursor.execute("""
SELECT COUNT(*), SUM(size) FROM files WHERE duplicate_of IS NOT NULL
""")
dup_count, dup_size = cursor.fetchone()
print("\n=== DEDUPLICATION STATS ===")
print(f"Files with checksums: {hashed_count or 0:6}")
print(f"Duplicate files: {dup_count or 0:6} ({self.format_size(int(dup_size or 0))})")
if show_duplicates and dup_count:
print("\n=== DUPLICATE FILES ===")
cursor.execute("""
SELECT path, size, duplicate_of FROM files
WHERE duplicate_of IS NOT NULL
ORDER BY size DESC
LIMIT 20
""")
for path, size, dup_of in cursor.fetchall():
print(f" {path} ({self.format_size(int(size))}) → {dup_of}")
cursor.execute("""
SELECT operation_type, executed, verified, COUNT(*) FROM operations GROUP BY operation_type, executed, verified
@@ -489,8 +689,24 @@ def main():
exec_parser.add_argument('plan_file', help='Path to plan JSON file')
exec_parser.add_argument('--dry-run', action='store_true', help='Simulate without actual file operations')
# Dedupe command
dedupe_parser = subparsers.add_parser('dedupe', help='Deduplicate files and compute checksums')
dedupe_parser.add_argument('--disk', help='Optional: Only dedupe specific disk')
dedupe_parser.add_argument('--no-chunks', action='store_true', help='Disable chunk-level deduplication')
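# Note: --no-chunks only toggles use_chunks, which run_deduplication does not currently act on.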
# Merge command
merge_parser = subparsers.add_parser('merge', help='Plan multi-disk merge with deduplication')
merge_parser.add_argument('--sources', nargs='+', required=True, help='Source disks to merge')
merge_parser.add_argument('--target', required=True, help='Target disk')
merge_parser.add_argument('--output', default='merge_plan.json', help='Output plan file')
merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')
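# Since --target is required, --network currently only marks the plan as network-bound
# rather than replacing the target.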
# Report command
report_parser = subparsers.add_parser('report', help='Show current status')
report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
report_parser.add_argument('--show-duplicates', action='store_true', help='Show duplicate files')
report_parser.add_argument('--preview-merge', help='Preview merge plan from file')
args = parser.parse_args()
tool = DiskReorganizer()
@@ -498,6 +714,13 @@ def main():
if args.command == 'index':
tool.index_disk(args.disk_root, args.disk_name)
elif args.command == 'dedupe':
tool.run_deduplication(disk=args.disk, use_chunks=not args.no_chunks)
elif args.command == 'merge':
tool.plan_merge(sources=args.sources, target=args.target, output_file=args.output,
filter_system=args.filter_system, network_target=args.network)
elif args.command == 'plan':
plan = tool.plan_migration(args.target_disk, args.dest_disks)
if plan:
@@ -508,7 +731,7 @@ def main():
tool.execute_migration(args.plan_file, dry_run=args.dry_run)
elif args.command == 'report':
tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)
if __name__ == '__main__':
main()