app/main.py
@@ -1,33 +1,23 @@
 #!/usr/bin/env python3
 """
 Disk Reorganizer - Safely restructure files across disks to free up one entire disk.
 Three modes: index, plan, execute
 """
 
 import os
 import sys
-from dataclasses import dataclass
 
 import psycopg2
 from psycopg2 import sql
 from psycopg2.extras import RealDictCursor
 import shutil
 import hashlib
 import argparse
 import json
 from pathlib import Path
-from typing import List, Dict, Optional, Tuple
+from dataclasses import dataclass, asdict
+from typing import List, Dict, Optional
 from datetime import datetime
 import logging
 import time
 
 # Setup logging
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.FileHandler('disk_reorganizer.log'),
-        logging.StreamHandler(sys.stdout)
-    ]
+    handlers=[logging.FileHandler('disk_reorganizer.log'), logging.StreamHandler(sys.stdout)]
 )
 logger = logging.getLogger(__name__)
 
@@ -430,12 +420,186 @@ class DiskReorganizer:
         logger.info(f"✓ Disk {plan['target_disk']} is ready for Linux installation!")
         logger.info(f" Remember to safely delete original files from {plan['target_disk']}")
 
-    def generate_report(self):
+    def run_deduplication(self, disk: Optional[str] = None, use_chunks: bool = True):
+        logger.info(f"Starting deduplication{' for disk ' + disk if disk else ''}")
+
+        disk_mount_map = {
+            'SMT': '/media/mike/SMT',
+            'DISK1': '/media/mike/DISK1',
+            'LLM': '/media/mike/LLM'
+        }
+
+        conn = self.get_connection()
+        cursor = conn.cursor()
+
+        def hash_file_local(file_path: Path) -> str:
+            hasher = hashlib.sha256()
+            with open(file_path, 'rb') as f:
+                while chunk := f.read(65536):
+                    hasher.update(chunk)
+            return hasher.hexdigest()
+
+        try:
+            if disk:
+                cursor.execute("SELECT path, size, disk_label FROM files WHERE disk_label = %s AND checksum IS NULL ORDER BY size DESC", (disk,))
+            else:
+                cursor.execute("SELECT path, size, disk_label FROM files WHERE checksum IS NULL ORDER BY size DESC")
+
+            files_to_process = cursor.fetchall()
+            total = len(files_to_process)
+            logger.info(f"Found {total} files to hash")
+
+            processed = 0
+            skipped = 0
+            for path_str, size, disk_label in files_to_process:
+                try:
+                    mount_point = disk_mount_map.get(disk_label, disk_label)
+                    full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)
+
+                    if not full_path.exists():
+                        skipped += 1
+                        continue
+
+                    checksum = hash_file_local(full_path)
+
+                    cursor.execute("SELECT path FROM files WHERE checksum = %s AND path != %s LIMIT 1", (checksum, path_str))
+                    dup_row = cursor.fetchone()
+                    duplicate_of = dup_row[0] if dup_row else None
+
+                    cursor.execute("UPDATE files SET checksum = %s, duplicate_of = %s WHERE path = %s", (checksum, duplicate_of, path_str))
+
+                    processed += 1
+                    if processed % 100 == 0:
+                        conn.commit()
+                        print(f"\rProcessed: {processed}/{total} ({skipped} skipped)", end='', flush=True)
+
+                except Exception as e:
+                    skipped += 1
+                    conn.rollback()
+
+            conn.commit()
+            print()
+            logger.info(f"Deduplication complete: {processed}/{total} files processed, {skipped} skipped")
+
+        finally:
+            cursor.close()
+            conn.close()
+
+    def plan_merge(self, sources: List[str], target: str, output_file: str,
+                   filter_system: bool = False, network_target: str = None):
+        """Plan merge of multiple source disks to target with deduplication"""
+        logger.info(f"Planning merge: {', '.join(sources)} → {target or network_target}")
+
+        if filter_system:
+            sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+            from filters import GitignoreFilter
+            file_filter = GitignoreFilter()
+            logger.info("System/build file filtering enabled")
+
+        conn = self.get_connection()
+        cursor = conn.cursor()
+
+        try:
+            placeholders = ','.join(['%s'] * len(sources))
+            cursor.execute(f"""
+                SELECT path, size, checksum, disk_label, duplicate_of
+                FROM files
+                WHERE disk_label IN ({placeholders})
+                ORDER BY size DESC
+            """, tuple(sources))
+
+            files = cursor.fetchall()
+            total_files = len(files)
+            total_size = sum(int(f[1]) for f in files)
+
+            unique_files = {}
+            duplicate_count = 0
+            duplicate_size = 0
+            filtered_count = 0
+            filtered_size = 0
+
+            for path, size, checksum, disk_label, duplicate_of in files:
+                if filter_system and file_filter.should_exclude(path):
+                    filtered_count += 1
+                    filtered_size += int(size)
+                    continue
+
+                if checksum and checksum in unique_files:
+                    duplicate_count += 1
+                    duplicate_size += int(size)
+                else:
+                    if checksum:
+                        unique_files[checksum] = (path, int(size), disk_label)
+
+            unique_count = len(unique_files)
+            unique_size = sum(f[1] for f in unique_files.values())
+
+            plan = {
+                'sources': sources,
+                'target': target or network_target,
+                'network': network_target is not None,
+                'total_files': total_files,
+                'total_size': total_size,
+                'unique_files': unique_count,
+                'unique_size': unique_size,
+                'duplicate_files': duplicate_count,
+                'duplicate_size': duplicate_size,
+                'filtered_files': filtered_count if filter_system else 0,
+                'filtered_size': filtered_size if filter_system else 0,
+                'space_saved': duplicate_size + (filtered_size if filter_system else 0),
+                'operations': []
+            }
+
+            for checksum, (path, size, disk_label) in unique_files.items():
+                plan['operations'].append({
+                    'source_disk': disk_label,
+                    'source_path': path,
+                    'target_disk': target or network_target,
+                    'target_path': path,
+                    'size': size,
+                    'checksum': checksum
+                })
+
+            with open(output_file, 'w') as f:
+                json.dump(plan, f, indent=2)
+
+            logger.info(f"Merge plan saved to {output_file}")
+            print(f"\n=== MERGE PLAN SUMMARY ===")
+            print(f"Sources: {', '.join(sources)}")
+            print(f"Target: {target or network_target}")
+            print(f"Total files: {total_files:,} ({self.format_size(total_size)})")
+            if filter_system:
+                print(f"Filtered (system/build): {filtered_count:,} ({self.format_size(filtered_size)})")
+            print(f"Unique files: {unique_count:,} ({self.format_size(unique_size)})")
+            print(f"Duplicates: {duplicate_count:,} ({self.format_size(duplicate_size)})")
+            print(f"Total space saved: {self.format_size(plan['space_saved'])}")
+            print(f"Space needed on target: {self.format_size(unique_size)}")
+
+        finally:
+            cursor.close()
+            conn.close()
+
+    def generate_report(self, format='text', show_duplicates=False, preview_merge=None):
         """Generate status report"""
         conn = self.get_connection()
         cursor = conn.cursor()
 
         try:
+            if preview_merge:
+                # Load and display merge plan
+                with open(preview_merge, 'r') as f:
+                    plan = json.load(f)
+
+                print("\n=== MERGE PLAN PREVIEW ===")
+                print(f"Sources: {', '.join(plan['sources'])}")
+                print(f"Target: {plan['target']}")
+                print(f"Total files: {plan['total_files']:,} ({self.format_size(plan['total_size'])})")
+                print(f"Unique files: {plan['unique_files']:,} ({self.format_size(plan['unique_size'])})")
+                print(f"Duplicates: {plan['duplicate_files']:,} ({self.format_size(plan['duplicate_size'])})")
+                print(f"Space saved: {self.format_size(plan['space_saved'])}")
+                print(f"Space needed on target: {self.format_size(plan['unique_size'])}")
+                return
+
             cursor.execute("""
                 SELECT status, COUNT(*), SUM(size) FROM files GROUP BY status
             """)
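Note: plan_merge() above imports GitignoreFilter from a filters module that is not part of this commit. The following is a minimal sketch of the interface the call sites rely on; only the class name and the should_exclude() method are taken from the diff, while the patterns and matching logic are assumptions:

    # Hypothetical stand-in for filters.GitignoreFilter (real module not in this diff).
    from fnmatch import fnmatch

    class GitignoreFilter:
        # Illustrative system/build patterns; the actual list is unknown.
        DEFAULT_PATTERNS = ['*/__pycache__/*', '*.pyc', '*/node_modules/*', '*/.git/*', '*/build/*']

        def __init__(self, patterns=None):
            self.patterns = patterns or self.DEFAULT_PATTERNS

        def should_exclude(self, path: str) -> bool:
            # plan_merge() passes the stored file path and skips the file when this returns True.
            return any(fnmatch(path, pattern) for pattern in self.patterns)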
@@ -443,7 +607,43 @@ class DiskReorganizer:
             print("\n=== FILE MIGRATION REPORT ===")
             for row in cursor.fetchall():
                 status, count, size = row
-                print(f"{status:15}: {count:6} files, {self.format_size(size or 0)}")
+                print(f"{status:15}: {count:6} files, {self.format_size(int(size or 0))}")
+
+            # Disk usage summary
+            cursor.execute("""
+                SELECT disk_label, COUNT(*), SUM(size) FROM files GROUP BY disk_label
+            """)
+
+            print("\n=== DISK USAGE ===")
+            for row in cursor.fetchall():
+                disk, count, size = row
+                print(f"{disk:20}: {count:6} files, {self.format_size(int(size or 0))}")
+
+            # Deduplication stats
+            cursor.execute("""
+                SELECT COUNT(*), SUM(size) FROM files WHERE checksum IS NOT NULL
+            """)
+            hashed_count, hashed_size = cursor.fetchone()
+
+            cursor.execute("""
+                SELECT COUNT(*), SUM(size) FROM files WHERE duplicate_of IS NOT NULL
+            """)
+            dup_count, dup_size = cursor.fetchone()
+
+            print("\n=== DEDUPLICATION STATS ===")
+            print(f"Files with checksums: {hashed_count or 0:6}")
+            print(f"Duplicate files: {dup_count or 0:6} ({self.format_size(int(dup_size or 0))})")
+
+            if show_duplicates and dup_count:
+                print("\n=== DUPLICATE FILES ===")
+                cursor.execute("""
+                    SELECT path, size, duplicate_of FROM files
+                    WHERE duplicate_of IS NOT NULL
+                    ORDER BY size DESC
+                    LIMIT 20
+                """)
+                for path, size, dup_of in cursor.fetchall():
+                    print(f"  {path} ({self.format_size(int(size))}) → {dup_of}")
+
             cursor.execute("""
                 SELECT operation_type, executed, verified, COUNT(*) FROM operations GROUP BY operation_type, executed, verified
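Note: the queries in this commit imply a files table with at least the columns path, size, disk_label, checksum, duplicate_of, and status. A plausible reconstruction of the schema, with guessed types (the actual migration is not shown here):

    # Assumed schema, reconstructed from the queries above; column types are guesses.
    FILES_DDL = """
    CREATE TABLE IF NOT EXISTS files (
        path         TEXT PRIMARY KEY,
        size         BIGINT NOT NULL,
        disk_label   TEXT NOT NULL,
        checksum     TEXT,           -- SHA-256 hex digest; NULL until dedupe runs
        duplicate_of TEXT,           -- first path seen with the same checksum
        status       TEXT            -- migration status shown in the report
    );
    """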
@@ -489,8 +689,24 @@ def main():
     exec_parser.add_argument('plan_file', help='Path to plan JSON file')
     exec_parser.add_argument('--dry-run', action='store_true', help='Simulate without actual file operations')
 
+    # Dedupe command
+    dedupe_parser = subparsers.add_parser('dedupe', help='Deduplicate files and compute checksums')
+    dedupe_parser.add_argument('--disk', help='Optional: Only dedupe specific disk')
+    dedupe_parser.add_argument('--no-chunks', action='store_true', help='Disable chunk-level deduplication')
+
+    # Merge command
+    merge_parser = subparsers.add_parser('merge', help='Plan multi-disk merge with deduplication')
+    merge_parser.add_argument('--sources', nargs='+', required=True, help='Source disks to merge')
+    merge_parser.add_argument('--target', required=True, help='Target disk')
+    merge_parser.add_argument('--output', default='merge_plan.json', help='Output plan file')
+    merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
+    merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')
+
     # Report command
     report_parser = subparsers.add_parser('report', help='Show current status')
+    report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
+    report_parser.add_argument('--show-duplicates', action='store_true', help='Show duplicate files')
+    report_parser.add_argument('--preview-merge', help='Preview merge plan from file')
 
     args = parser.parse_args()
     tool = DiskReorganizer()
@@ -498,6 +714,13 @@ def main():
     if args.command == 'index':
         tool.index_disk(args.disk_root, args.disk_name)
 
+    elif args.command == 'dedupe':
+        tool.run_deduplication(disk=args.disk, use_chunks=not args.no_chunks)
+
+    elif args.command == 'merge':
+        tool.plan_merge(sources=args.sources, target=args.target, output_file=args.output,
+                        filter_system=args.filter_system, network_target=args.network)
+
     elif args.command == 'plan':
         plan = tool.plan_migration(args.target_disk, args.dest_disks)
         if plan:
@@ -508,7 +731,7 @@ def main():
         tool.execute_migration(args.plan_file, dry_run=args.dry_run)
 
     elif args.command == 'report':
-        tool.generate_report()
+        tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)
 
 if __name__ == '__main__':
     main()
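For reference, the new subcommands map directly onto the methods added above. A rough sketch of equivalent programmatic use (disk labels and file names are illustrative, and assume the mounts listed in disk_mount_map):

    # Equivalent of: python3 app/main.py dedupe --disk SMT
    tool = DiskReorganizer()
    tool.run_deduplication(disk='SMT', use_chunks=True)

    # Equivalent of: python3 app/main.py merge --sources SMT DISK1 --target LLM --filter-system
    tool.plan_merge(sources=['SMT', 'DISK1'], target='LLM',
                    output_file='merge_plan.json', filter_system=True)

    # Equivalent of: python3 app/main.py report --preview-merge merge_plan.json
    tool.generate_report(format='text', show_duplicates=False, preview_merge='merge_plan.json')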