app/filters/gitignore.py (new file, 30 lines)
@@ -0,0 +1,30 @@
from pathlib import Path
from typing import Optional, Set
import fnmatch

# Directory trees and file types that are safe to skip when merging disks:
# package caches, build output, VCS metadata, and editor/OS artifacts.
DEFAULT_PATTERNS = {
    'node_modules/**', '__pycache__/**', '.git/**', 'build/**', 'dist/**',
    '.cache/**', 'target/**', 'vendor/**', '.venv/**', 'venv/**',
    '*.pyc', '*.pyo', '*.so', '*.dll', '*.dylib', '*.o', '*.a',
    '.DS_Store', 'Thumbs.db', '.pytest_cache/**', '.tox/**',
    '*.egg-info/**', '.mypy_cache/**', '.coverage', 'htmlcov/**',
    '.gradle/**', 'bin/**', 'obj/**', '.vs/**', '.idea/**'
}


class GitignoreFilter:
    def __init__(self, patterns: Optional[Set[str]] = None):
        self.patterns = patterns or DEFAULT_PATTERNS

    def should_exclude(self, path: str) -> bool:
        path_obj = Path(path)
        for pattern in self.patterns:
            if '**' in pattern:
                # Directory pattern: match the '**'-stripped stem against each
                # path component (fnmatch, so globs like '*.egg-info' work too).
                clean_pattern = pattern.replace('/**', '').replace('**/', '')
                if any(fnmatch.fnmatch(part, clean_pattern) for part in path_obj.parts):
                    return True
            elif fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(path_obj.name, pattern):
                return True
        return False

    def filter_files(self, files: list) -> list:
        return [f for f in files if not self.should_exclude(f)]
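For context, a minimal usage sketch (the file paths and the exact import path are illustrative; plan_merge below imports it as `from filters import GitignoreFilter`):

    from filters.gitignore import GitignoreFilter  # assumed import path, given app/filters/gitignore.py

    f = GitignoreFilter()
    candidates = [
        'projects/api/src/server.py',            # kept
        'projects/api/node_modules/x/index.js',  # excluded: node_modules/**
        'projects/api/src/util.pyc',             # excluded: *.pyc
    ]
    print(f.filter_files(candidates))  # -> ['projects/api/src/server.py']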
app/main.py (257 lines changed)
@@ -1,33 +1,23 @@
#!/usr/bin/env python3
"""
Disk Reorganizer - Safely restructure files across disks to free up one entire disk.
Three modes: index, plan, execute
"""

import os
import sys
import psycopg2
import shutil
import hashlib
import argparse
import json
from pathlib import Path
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional, Tuple
from datetime import datetime
import logging
import time

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('disk_reorganizer.log'), logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)

(This hunk also drops the `from psycopg2 import sql` and `from psycopg2.extras import RealDictCursor` imports, widens the `dataclasses` and `typing` imports to pull in `asdict` and `Tuple`, and collapses the multi-line `handlers` list to one line.)
@@ -430,12 +420,186 @@ class DiskReorganizer:
            logger.info(f"✓ Disk {plan['target_disk']} is ready for Linux installation!")
            logger.info(f"  Remember to safely delete original files from {plan['target_disk']}")

    def run_deduplication(self, disk: Optional[str] = None, use_chunks: bool = True):
        """Compute SHA-256 checksums for unhashed files and record duplicates."""
        logger.info(f"Starting deduplication{' for disk ' + disk if disk else ''}")

        disk_mount_map = {
            'SMT': '/media/mike/SMT',
            'DISK1': '/media/mike/DISK1',
            'LLM': '/media/mike/LLM'
        }

        conn = self.get_connection()
        cursor = conn.cursor()

        def hash_file_local(file_path: Path) -> str:
            # Stream the file in 64 KiB chunks so large files don't exhaust memory.
            hasher = hashlib.sha256()
            with open(file_path, 'rb') as f:
                while chunk := f.read(65536):
                    hasher.update(chunk)
            return hasher.hexdigest()

        try:
            if disk:
                cursor.execute("SELECT path, size, disk_label FROM files WHERE disk_label = %s AND checksum IS NULL ORDER BY size DESC", (disk,))
            else:
                cursor.execute("SELECT path, size, disk_label FROM files WHERE checksum IS NULL ORDER BY size DESC")

            files_to_process = cursor.fetchall()
            total = len(files_to_process)
            logger.info(f"Found {total} files to hash")

            processed = 0
            skipped = 0
            for path_str, size, disk_label in files_to_process:
                try:
                    mount_point = disk_mount_map.get(disk_label, disk_label)
                    full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)

                    if not full_path.exists():
                        skipped += 1
                        continue

                    checksum = hash_file_local(full_path)

                    # The first file seen with this checksum stays canonical;
                    # later paths point back to it via duplicate_of.
                    cursor.execute("SELECT path FROM files WHERE checksum = %s AND path != %s LIMIT 1", (checksum, path_str))
                    dup_row = cursor.fetchone()
                    duplicate_of = dup_row[0] if dup_row else None

                    cursor.execute("UPDATE files SET checksum = %s, duplicate_of = %s WHERE path = %s", (checksum, duplicate_of, path_str))

                    processed += 1
                    if processed % 100 == 0:
                        conn.commit()
                        print(f"\rProcessed: {processed}/{total} ({skipped} skipped)", end='', flush=True)

                except Exception as e:
                    logger.debug(f"Skipping {path_str}: {e}")  # don't swallow errors silently
                    skipped += 1
                    conn.rollback()

            conn.commit()
            print()
            logger.info(f"Deduplication complete: {processed}/{total} files processed, {skipped} skipped")

        finally:
            cursor.close()
            conn.close()
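Once checksums are populated, the biggest duplicate groups can be pulled straight from the database. A minimal sketch, assuming the same `files` schema used above (the DSN is hypothetical; the real settings live in `get_connection()`, and this query is not part of the commit itself):

    import psycopg2

    conn = psycopg2.connect('dbname=reorganizer')  # hypothetical DSN
    cur = conn.cursor()
    # Same checksum on more than one path = one duplicate group; largest first.
    cur.execute("""
        SELECT checksum, COUNT(*) AS copies, SUM(size) AS bytes
        FROM files
        WHERE checksum IS NOT NULL
        GROUP BY checksum
        HAVING COUNT(*) > 1
        ORDER BY SUM(size) DESC
        LIMIT 10
    """)
    for checksum, copies, total in cur.fetchall():
        print(f'{checksum[:12]}…  ×{copies}  {total} bytes')
    cur.close()
    conn.close()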
    def plan_merge(self, sources: List[str], target: str, output_file: str,
                   filter_system: bool = False, network_target: Optional[str] = None):
        """Plan merge of multiple source disks to target with deduplication"""
        logger.info(f"Planning merge: {', '.join(sources)} → {target or network_target}")

        if filter_system:
            sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            from filters import GitignoreFilter
            file_filter = GitignoreFilter()
            logger.info("System/build file filtering enabled")

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            placeholders = ','.join(['%s'] * len(sources))
            cursor.execute(f"""
                SELECT path, size, checksum, disk_label, duplicate_of
                FROM files
                WHERE disk_label IN ({placeholders})
                ORDER BY size DESC
            """, tuple(sources))

            files = cursor.fetchall()
            total_files = len(files)
            total_size = sum(int(f[1]) for f in files)

            unique_files = {}
            duplicate_count = 0
            duplicate_size = 0
            filtered_count = 0
            filtered_size = 0

            for path, size, checksum, disk_label, duplicate_of in files:
                if filter_system and file_filter.should_exclude(path):
                    filtered_count += 1
                    filtered_size += int(size)
                    continue

                if checksum and checksum in unique_files:
                    duplicate_count += 1
                    duplicate_size += int(size)
                else:
                    # Note: files without a checksum are neither counted as
                    # duplicates nor added to the plan; run dedupe first so
                    # every file is hashed before planning a merge.
                    if checksum:
                        unique_files[checksum] = (path, int(size), disk_label)

            unique_count = len(unique_files)
            unique_size = sum(f[1] for f in unique_files.values())

            plan = {
                'sources': sources,
                'target': target or network_target,
                'network': network_target is not None,
                'total_files': total_files,
                'total_size': total_size,
                'unique_files': unique_count,
                'unique_size': unique_size,
                'duplicate_files': duplicate_count,
                'duplicate_size': duplicate_size,
                'filtered_files': filtered_count if filter_system else 0,
                'filtered_size': filtered_size if filter_system else 0,
                'space_saved': duplicate_size + (filtered_size if filter_system else 0),
                'operations': []
            }

            for checksum, (path, size, disk_label) in unique_files.items():
                plan['operations'].append({
                    'source_disk': disk_label,
                    'source_path': path,
                    'target_disk': target or network_target,
                    'target_path': path,
                    'size': size,
                    'checksum': checksum
                })

            with open(output_file, 'w') as f:
                json.dump(plan, f, indent=2)

            logger.info(f"Merge plan saved to {output_file}")
            print("\n=== MERGE PLAN SUMMARY ===")
            print(f"Sources: {', '.join(sources)}")
            print(f"Target: {target or network_target}")
            print(f"Total files: {total_files:,} ({self.format_size(total_size)})")
            if filter_system:
                print(f"Filtered (system/build): {filtered_count:,} ({self.format_size(filtered_size)})")
            print(f"Unique files: {unique_count:,} ({self.format_size(unique_size)})")
            print(f"Duplicates: {duplicate_count:,} ({self.format_size(duplicate_size)})")
            print(f"Total space saved: {self.format_size(plan['space_saved'])}")
            print(f"Space needed on target: {self.format_size(unique_size)}")

        finally:
            cursor.close()
            conn.close()
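A direct call equivalent to the CLI wiring added further down, using the disk labels from disk_mount_map (the output file name is illustrative):

    tool = DiskReorganizer()
    # Fold SMT and DISK1 into LLM, dropping build artifacts via GitignoreFilter.
    tool.plan_merge(
        sources=['SMT', 'DISK1'],
        target='LLM',
        output_file='merge_plan.json',
        filter_system=True,
    )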
    def generate_report(self, format='text', show_duplicates=False, preview_merge=None):
        """Generate status report"""
        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            if preview_merge:
                # Load and display a saved merge plan instead of querying the DB
                with open(preview_merge, 'r') as f:
                    plan = json.load(f)

                print("\n=== MERGE PLAN PREVIEW ===")
                print(f"Sources: {', '.join(plan['sources'])}")
                print(f"Target: {plan['target']}")
                print(f"Total files: {plan['total_files']:,} ({self.format_size(plan['total_size'])})")
                print(f"Unique files: {plan['unique_files']:,} ({self.format_size(plan['unique_size'])})")
                print(f"Duplicates: {plan['duplicate_files']:,} ({self.format_size(plan['duplicate_size'])})")
                print(f"Space saved: {self.format_size(plan['space_saved'])}")
                print(f"Space needed on target: {self.format_size(plan['unique_size'])}")
                return

            cursor.execute("""
                SELECT status, COUNT(*), SUM(size) FROM files GROUP BY status
            """)
@@ -443,7 +607,43 @@ class DiskReorganizer:
            print("\n=== FILE MIGRATION REPORT ===")
            for row in cursor.fetchall():
                status, count, size = row
                print(f"{status:15}: {count:6} files, {self.format_size(int(size or 0))}")

            # Disk usage summary
            cursor.execute("""
                SELECT disk_label, COUNT(*), SUM(size) FROM files GROUP BY disk_label
            """)

            print("\n=== DISK USAGE ===")
            for row in cursor.fetchall():
                disk, count, size = row
                print(f"{disk:20}: {count:6} files, {self.format_size(int(size or 0))}")

            # Deduplication stats
            cursor.execute("""
                SELECT COUNT(*), SUM(size) FROM files WHERE checksum IS NOT NULL
            """)
            hashed_count, hashed_size = cursor.fetchone()

            cursor.execute("""
                SELECT COUNT(*), SUM(size) FROM files WHERE duplicate_of IS NOT NULL
            """)
            dup_count, dup_size = cursor.fetchone()

            print("\n=== DEDUPLICATION STATS ===")
            print(f"Files with checksums: {hashed_count or 0:6}")
            print(f"Duplicate files: {dup_count or 0:6} ({self.format_size(int(dup_size or 0))})")

            if show_duplicates and dup_count:
                print("\n=== DUPLICATE FILES ===")
                cursor.execute("""
                    SELECT path, size, duplicate_of FROM files
                    WHERE duplicate_of IS NOT NULL
                    ORDER BY size DESC
                    LIMIT 20
                """)
                for path, size, dup_of in cursor.fetchall():
                    print(f"  {path} ({self.format_size(int(size))}) → {dup_of}")

            cursor.execute("""
                SELECT operation_type, executed, verified, COUNT(*) FROM operations GROUP BY operation_type, executed, verified
            """)
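Two illustrative calls into the report paths above (wired to the CLI in main() below):

    tool = DiskReorganizer()
    tool.generate_report(show_duplicates=True)             # full status plus the 20 largest duplicates
    tool.generate_report(preview_merge='merge_plan.json')  # summarize a saved plan and return early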
@@ -489,8 +689,24 @@ def main():
    exec_parser.add_argument('plan_file', help='Path to plan JSON file')
    exec_parser.add_argument('--dry-run', action='store_true', help='Simulate without actual file operations')

    # Dedupe command
    dedupe_parser = subparsers.add_parser('dedupe', help='Deduplicate files and compute checksums')
    dedupe_parser.add_argument('--disk', help='Only dedupe the specified disk')
    dedupe_parser.add_argument('--no-chunks', action='store_true', help='Disable chunk-level deduplication')

    # Merge command
    merge_parser = subparsers.add_parser('merge', help='Plan multi-disk merge with deduplication')
    merge_parser.add_argument('--sources', nargs='+', required=True, help='Source disks to merge')
    merge_parser.add_argument('--target', required=True, help='Target disk')
    merge_parser.add_argument('--output', default='merge_plan.json', help='Output plan file')
    merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
    merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')

    # Report command
    report_parser = subparsers.add_parser('report', help='Show current status')
    report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
    report_parser.add_argument('--show-duplicates', action='store_true', help='Show duplicate files')
    report_parser.add_argument('--preview-merge', help='Preview merge plan from file')

    args = parser.parse_args()
    tool = DiskReorganizer()
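With these parsers in place, the new subcommands would be invoked roughly as follows (disk labels illustrative). Note that --target is required even when --network is given; plan_merge only falls back to network_target when target is empty:

    python app/main.py dedupe --disk SMT --no-chunks
    python app/main.py merge --sources SMT DISK1 --target LLM --filter-system
    python app/main.py report --show-duplicates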
@@ -498,6 +714,13 @@ def main():
    if args.command == 'index':
        tool.index_disk(args.disk_root, args.disk_name)

    elif args.command == 'dedupe':
        tool.run_deduplication(disk=args.disk, use_chunks=not args.no_chunks)

    elif args.command == 'merge':
        tool.plan_merge(sources=args.sources, target=args.target, output_file=args.output,
                        filter_system=args.filter_system, network_target=args.network)

    elif args.command == 'plan':
        plan = tool.plan_migration(args.target_disk, args.dest_disks)
        if plan:
@@ -508,7 +731,7 @@ def main():
        tool.execute_migration(args.plan_file, dry_run=args.dry_run)

    elif args.command == 'report':
        tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)


if __name__ == '__main__':
    main()
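Taken together with the pre-existing index/plan/execute modes, a full pass might look like this (mount points from disk_mount_map; the positional order of the index arguments is assumed from tool.index_disk(args.disk_root, args.disk_name), since that parser is not shown in this diff):

    python app/main.py index /media/mike/SMT SMT
    python app/main.py index /media/mike/DISK1 DISK1
    python app/main.py dedupe
    python app/main.py merge --sources SMT DISK1 --target LLM --filter-system --output merge_plan.json
    python app/main.py report --preview-merge merge_plan.json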