diff --git a/app/filters/gitignore.py b/app/filters/gitignore.py
new file mode 100644
index 0000000..5952bfc
--- /dev/null
+++ b/app/filters/gitignore.py
@@ -0,0 +1,30 @@
+from pathlib import Path
+from typing import Optional, Set
+import fnmatch
+
+DEFAULT_PATTERNS = {
+    'node_modules/**', '__pycache__/**', '.git/**', 'build/**', 'dist/**',
+    '.cache/**', 'target/**', 'vendor/**', '.venv/**', 'venv/**',
+    '*.pyc', '*.pyo', '*.so', '*.dll', '*.dylib', '*.o', '*.a',
+    '.DS_Store', 'Thumbs.db', '.pytest_cache/**', '.tox/**',
+    '*.egg-info/**', '.mypy_cache/**', '.coverage', 'htmlcov/**',
+    '.gradle/**', 'bin/**', 'obj/**', '.vs/**', '.idea/**'
+}
+
+class GitignoreFilter:
+    def __init__(self, patterns: Optional[Set[str]] = None):
+        self.patterns = patterns or DEFAULT_PATTERNS
+
+    def should_exclude(self, path: str) -> bool:
+        path_obj = Path(path)
+        for pattern in self.patterns:
+            if '**' in pattern:
+                clean_pattern = pattern.replace('/**', '').replace('**/', '')
+                if any(fnmatch.fnmatch(part, clean_pattern) for part in path_obj.parts):
+                    return True
+            elif fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(path_obj.name, pattern):
+                return True
+        return False
+
+    def filter_files(self, files: list) -> list:
+        return [f for f in files if not self.should_exclude(f)]
diff --git a/app/main.py b/app/main.py
index 3181775..81be092 100644
--- a/app/main.py
+++ b/app/main.py
@@ -1,33 +1,23 @@
 #!/usr/bin/env python3
-"""
-Disk Reorganizer - Safely restructure files across disks to free up one entire disk.
-Three modes: index, plan, execute
-"""
-
 import os
 import sys
+from dataclasses import dataclass
+
 import psycopg2
-from psycopg2 import sql
-from psycopg2.extras import RealDictCursor
 import shutil
 import hashlib
 import argparse
 import json
 from pathlib import Path
-from dataclasses import dataclass, asdict
-from typing import List, Dict, Optional, Tuple
+from typing import List, Dict, Optional
 from datetime import datetime
 import logging
 import time
 
-# Setup logging
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.FileHandler('disk_reorganizer.log'),
-        logging.StreamHandler(sys.stdout)
-    ]
+    handlers=[logging.FileHandler('disk_reorganizer.log'), logging.StreamHandler(sys.stdout)]
 )
 logger = logging.getLogger(__name__)
@@ -430,12 +420,186 @@ class DiskReorganizer:
         logger.info(f"✓ Disk {plan['target_disk']} is ready for Linux installation!")
         logger.info(f" Remember to safely delete original files from {plan['target_disk']}")
 
-    def generate_report(self):
+    def run_deduplication(self, disk: Optional[str] = None, use_chunks: bool = True):
+        logger.info(f"Starting deduplication{' for disk ' + disk if disk else ''}")
+
+        disk_mount_map = {
+            'SMT': '/media/mike/SMT',
+            'DISK1': '/media/mike/DISK1',
+            'LLM': '/media/mike/LLM'
+        }
+
+        conn = self.get_connection()
+        cursor = conn.cursor()
+
+        def hash_file_local(file_path: Path) -> str:
+            hasher = hashlib.sha256()
+            with open(file_path, 'rb') as f:
+                while chunk := f.read(65536):
+                    hasher.update(chunk)
+            return hasher.hexdigest()
+
+        try:
+            if disk:
+                cursor.execute("SELECT path, size, disk_label FROM files WHERE disk_label = %s AND checksum IS NULL ORDER BY size DESC", (disk,))
+            else:
+                cursor.execute("SELECT path, size, disk_label FROM files WHERE checksum IS NULL ORDER BY size DESC")
+
+            files_to_process = cursor.fetchall()
+            total = len(files_to_process)
+            logger.info(f"Found {total} files to hash")
+
+            processed = 0
+            skipped = 0
+            for path_str, size, disk_label in files_to_process:
+                try:
+                    mount_point = disk_mount_map.get(disk_label, disk_label)
+                    full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)
+
+                    if not full_path.exists():
+                        skipped += 1
+                        continue
+
+                    checksum = hash_file_local(full_path)
+
+                    cursor.execute("SELECT path FROM files WHERE checksum = %s AND path != %s LIMIT 1", (checksum, path_str))
+                    dup_row = cursor.fetchone()
+                    duplicate_of = dup_row[0] if dup_row else None
+
+                    cursor.execute("UPDATE files SET checksum = %s, duplicate_of = %s WHERE path = %s", (checksum, duplicate_of, path_str))
+
+                    processed += 1
+                    if processed % 100 == 0:
+                        conn.commit()
+                        print(f"\rProcessed: {processed}/{total} ({skipped} skipped)", end='', flush=True)
+
+                except Exception as e:
+                    skipped += 1
+                    conn.rollback()
+
+            conn.commit()
+            print()
+            logger.info(f"Deduplication complete: {processed}/{total} files processed, {skipped} skipped")
+
+        finally:
+            cursor.close()
+            conn.close()
+
+    def plan_merge(self, sources: List[str], target: str, output_file: str,
+                   filter_system: bool = False, network_target: Optional[str] = None):
+        """Plan merge of multiple source disks to target with deduplication"""
+        logger.info(f"Planning merge: {', '.join(sources)} → {target or network_target}")
+
+        if filter_system:
+            sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+            from filters.gitignore import GitignoreFilter
+            file_filter = GitignoreFilter()
+            logger.info("System/build file filtering enabled")
+
+        conn = self.get_connection()
+        cursor = conn.cursor()
+
+        try:
+            placeholders = ','.join(['%s'] * len(sources))
+            cursor.execute(f"""
+                SELECT path, size, checksum, disk_label, duplicate_of
+                FROM files
+                WHERE disk_label IN ({placeholders})
+                ORDER BY size DESC
+            """, tuple(sources))
+
+            files = cursor.fetchall()
+            total_files = len(files)
+            total_size = sum(int(f[1]) for f in files)
+
+            unique_files = {}
+            duplicate_count = 0
+            duplicate_size = 0
+            filtered_count = 0
+            filtered_size = 0
+
+            for path, size, checksum, disk_label, duplicate_of in files:
+                if filter_system and file_filter.should_exclude(path):
+                    filtered_count += 1
+                    filtered_size += int(size)
+                    continue
+
+                if checksum and checksum in unique_files:
+                    duplicate_count += 1
+                    duplicate_size += int(size)
+                else:
+                    if checksum:
+                        unique_files[checksum] = (path, int(size), disk_label)
+
+            unique_count = len(unique_files)
+            unique_size = sum(f[1] for f in unique_files.values())
+
+            plan = {
+                'sources': sources,
+                'target': target or network_target,
+                'network': network_target is not None,
+                'total_files': total_files,
+                'total_size': total_size,
+                'unique_files': unique_count,
+                'unique_size': unique_size,
+                'duplicate_files': duplicate_count,
+                'duplicate_size': duplicate_size,
+                'filtered_files': filtered_count if filter_system else 0,
+                'filtered_size': filtered_size if filter_system else 0,
+                'space_saved': duplicate_size + (filtered_size if filter_system else 0),
+                'operations': []
+            }
+
+            for checksum, (path, size, disk_label) in unique_files.items():
+                plan['operations'].append({
+                    'source_disk': disk_label,
+                    'source_path': path,
+                    'target_disk': target or network_target,
+                    'target_path': path,
+                    'size': size,
+                    'checksum': checksum
+                })
+
+            with open(output_file, 'w') as f:
+                json.dump(plan, f, indent=2)
+
+            logger.info(f"Merge plan saved to {output_file}")
+            print(f"\n=== MERGE PLAN SUMMARY ===")
+            print(f"Sources: {', '.join(sources)}")
+            print(f"Target: {target or network_target}")
+            print(f"Total files: {total_files:,} ({self.format_size(total_size)})")
+            if filter_system:
+                print(f"Filtered (system/build): {filtered_count:,} ({self.format_size(filtered_size)})")
+            print(f"Unique files: {unique_count:,} ({self.format_size(unique_size)})")
+            print(f"Duplicates: {duplicate_count:,} ({self.format_size(duplicate_size)})")
+            print(f"Total space saved: {self.format_size(plan['space_saved'])}")
+            print(f"Space needed on target: {self.format_size(unique_size)}")
+
+        finally:
+            cursor.close()
+            conn.close()
+
+    def generate_report(self, format='text', show_duplicates=False, preview_merge=None):
         """Generate status report"""
         conn = self.get_connection()
         cursor = conn.cursor()
 
         try:
+            if preview_merge:
+                # Load and display merge plan
+                with open(preview_merge, 'r') as f:
+                    plan = json.load(f)
+
+                print("\n=== MERGE PLAN PREVIEW ===")
+                print(f"Sources: {', '.join(plan['sources'])}")
+                print(f"Target: {plan['target']}")
+                print(f"Total files: {plan['total_files']:,} ({self.format_size(plan['total_size'])})")
+                print(f"Unique files: {plan['unique_files']:,} ({self.format_size(plan['unique_size'])})")
+                print(f"Duplicates: {plan['duplicate_files']:,} ({self.format_size(plan['duplicate_size'])})")
+                print(f"Space saved: {self.format_size(plan['space_saved'])}")
+                print(f"Space needed on target: {self.format_size(plan['unique_size'])}")
+                return
+
             cursor.execute("""
                 SELECT status, COUNT(*), SUM(size) FROM files GROUP BY status
             """)
@@ -443,7 +607,43 @@ class DiskReorganizer:
             print("\n=== FILE MIGRATION REPORT ===")
             for row in cursor.fetchall():
                 status, count, size = row
-                print(f"{status:15}: {count:6} files, {self.format_size(size or 0)}")
+                print(f"{status:15}: {count:6} files, {self.format_size(int(size or 0))}")
+
+            # Disk usage summary
+            cursor.execute("""
+                SELECT disk_label, COUNT(*), SUM(size) FROM files GROUP BY disk_label
+            """)
+
+            print("\n=== DISK USAGE ===")
+            for row in cursor.fetchall():
+                disk, count, size = row
+                print(f"{disk:20}: {count:6} files, {self.format_size(int(size or 0))}")
+
+            # Deduplication stats
+            cursor.execute("""
+                SELECT COUNT(*), SUM(size) FROM files WHERE checksum IS NOT NULL
+            """)
+            hashed_count, hashed_size = cursor.fetchone()
+
+            cursor.execute("""
+                SELECT COUNT(*), SUM(size) FROM files WHERE duplicate_of IS NOT NULL
+            """)
+            dup_count, dup_size = cursor.fetchone()
+
+            print("\n=== DEDUPLICATION STATS ===")
+            print(f"Files with checksums: {hashed_count or 0:6}")
+            print(f"Duplicate files: {dup_count or 0:6} ({self.format_size(int(dup_size or 0))})")
+
+            if show_duplicates and dup_count:
+                print("\n=== DUPLICATE FILES ===")
+                cursor.execute("""
+                    SELECT path, size, duplicate_of FROM files
+                    WHERE duplicate_of IS NOT NULL
+                    ORDER BY size DESC
+                    LIMIT 20
+                """)
+                for path, size, dup_of in cursor.fetchall():
+                    print(f" {path} ({self.format_size(int(size))}) → {dup_of}")
 
             cursor.execute("""
                 SELECT operation_type, executed, verified, COUNT(*) FROM operations GROUP BY operation_type, executed, verified
@@ -489,8 +689,24 @@ def main():
     exec_parser.add_argument('plan_file', help='Path to plan JSON file')
     exec_parser.add_argument('--dry-run', action='store_true', help='Simulate without actual file operations')
 
+    # Dedupe command
+    dedupe_parser = subparsers.add_parser('dedupe', help='Deduplicate files and compute checksums')
+    dedupe_parser.add_argument('--disk', help='Optional: Only dedupe specific disk')
+    dedupe_parser.add_argument('--no-chunks', action='store_true', help='Disable chunk-level deduplication')
+
+    # Merge command
+    merge_parser = subparsers.add_parser('merge', help='Plan multi-disk merge with deduplication')
+    merge_parser.add_argument('--sources', nargs='+', required=True, help='Source disks to merge')
+    merge_parser.add_argument('--target', required=True, help='Target disk')
+    merge_parser.add_argument('--output', default='merge_plan.json', help='Output plan file')
+    merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
+    merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')
+
     # Report command
     report_parser = subparsers.add_parser('report', help='Show current status')
+    report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
+    report_parser.add_argument('--show-duplicates', action='store_true', help='Show duplicate files')
+    report_parser.add_argument('--preview-merge', help='Preview merge plan from file')
 
     args = parser.parse_args()
     tool = DiskReorganizer()
@@ -498,6 +714,13 @@
     if args.command == 'index':
         tool.index_disk(args.disk_root, args.disk_name)
 
+    elif args.command == 'dedupe':
+        tool.run_deduplication(disk=args.disk, use_chunks=not args.no_chunks)
+
+    elif args.command == 'merge':
+        tool.plan_merge(sources=args.sources, target=args.target, output_file=args.output,
+                        filter_system=args.filter_system, network_target=args.network)
+
     elif args.command == 'plan':
         plan = tool.plan_migration(args.target_disk, args.dest_disks)
         if plan:
@@ -508,7 +731,7 @@
         tool.execute_migration(args.plan_file, dry_run=args.dry_run)
 
     elif args.command == 'report':
-        tool.generate_report()
+        tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)
 
 if __name__ == '__main__':
     main()
\ No newline at end of file
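
A minimal usage sketch for the new GitignoreFilter, not part of the diff itself; it assumes app/ is on sys.path (as plan_merge arranges) and the sample paths are purely illustrative:

    from filters.gitignore import GitignoreFilter

    f = GitignoreFilter()
    # Directory patterns such as 'node_modules/**' exclude any path containing that component
    print(f.should_exclude('projects/web/node_modules/lodash/index.js'))  # expected: True
    # File patterns such as '*.pyc' match on the file name
    print(f.should_exclude('projects/tool/cache/module.pyc'))             # expected: True
    # Ordinary source files are kept
    print(f.should_exclude('projects/tool/src/main.py'))                  # expected: False

With the new subcommands wired up above, the intended workflow would be: run dedupe (optionally per disk with --disk), then merge --sources ... --target ... --filter-system --output merge_plan.json, then inspect the result with report --preview-merge merge_plan.json before executing anything.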