# defrag/app/main.py
import os
import sys
from dataclasses import dataclass
import psycopg2
import shutil
import hashlib
import argparse
import json
from pathlib import Path
from typing import List, Dict, Optional
from datetime import datetime
import logging
import time
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.FileHandler('disk_reorganizer.log'), logging.StreamHandler(sys.stdout)])
logger = logging.getLogger(__name__)
@dataclass
class FileRecord:
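    """A single indexed file as stored in the `files` table."""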
path: str
size: int
modified_time: float
disk_label: str
checksum: Optional[str] = None
status: str = 'indexed'
class DiskReorganizer:
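    """Indexes disks into PostgreSQL and plans/executes file migrations.

    All state lives in the `files` and `operations` tables created by
    setup_database.sh; this class only reads and updates them.
    """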
def __init__(self, db_config: Dict=None):
if db_config is None:
db_config = {'host': os.getenv('DB_HOST', '192.168.1.159'), 'port': int(os.getenv('DB_PORT', 5432)), 'database': os.getenv('DB_NAME', 'disk_reorganizer_db'), 'user': os.getenv('DB_USER', 'disk_reorg_user'), 'password': os.getenv('DB_PASSWORD', 'heel-goed-wachtwoord')}
self.db_config = db_config
self.init_database()
def get_connection(self):
return psycopg2.connect(**self.db_config)
def init_database(self):
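        """Verify that the `files` and `operations` tables exist.

        Tables are created by setup_database.sh, not by this method.
        """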
try:
conn = self.get_connection()
cursor = conn.cursor()
            cursor.execute("""
                SELECT table_name FROM information_schema.tables
                WHERE table_schema = 'public' AND table_name IN ('files', 'operations')
            """)
tables = cursor.fetchall()
if len(tables) < 2:
logger.error('Database tables not found! Please run setup_database.sh first.')
raise Exception('Database not properly initialized. Run setup_database.sh')
cursor.close()
conn.close()
logger.info('Database connection verified successfully')
except psycopg2.Error as e:
logger.error(f'Database connection failed: {e}')
raise
def index_disk(self, disk_root: str, disk_name: str):
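        """Walk disk_root and upsert one row per file into `files`.

        Progress is printed every 100 files; the transaction is committed every
        1000 files, and unreadable files are skipped with a warning.
        """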
logger.info(f'Indexing disk: {disk_name} at {disk_root}')
disk_path = Path(disk_root)
if not disk_path.exists():
logger.error(f'Disk path {disk_root} does not exist!')
return
files_count = 0
total_size = 0
start_time = time.time()
conn = self.get_connection()
cursor = conn.cursor()
try:
for root, dirs, files in os.walk(disk_path):
dirs[:] = [d for d in dirs if not d.startswith(('$', 'System Volume Information', 'Recovery'))]
for file in files:
try:
file_path = Path(root) / file
if not file_path.is_file():
continue
stat = file_path.stat()
size = stat.st_size
mtime = datetime.fromtimestamp(stat.st_mtime)
rel_path = str(file_path.relative_to(disk_path))
                    cursor.execute("""
                        INSERT INTO files (path, size, modified_time, disk_label, checksum, status)
                        VALUES (%s, %s, %s, %s, %s, %s)
                        ON CONFLICT (path) DO UPDATE SET
                            size = EXCLUDED.size,
                            modified_time = EXCLUDED.modified_time,
                            disk_label = EXCLUDED.disk_label,
                            status = EXCLUDED.status
                    """, (rel_path, size, mtime, disk_name, None, 'indexed'))
files_count += 1
total_size += size
if files_count % 100 == 0:
elapsed = time.time() - start_time
rate = files_count / elapsed if elapsed > 0 else 0
display_path = str(file_path)
if len(display_path) > 60:
display_path = '...' + display_path[-57:]
print(f'\rIndexing: {files_count:,} files | {self.format_size(total_size)} | {rate:.0f} files/s | {display_path}', end='', flush=True)
if files_count % 1000 == 0:
conn.commit()
except Exception as e:
conn.rollback()
logger.warning(f'\nSkipping {file_path}: {e}')
continue
conn.commit()
print()
logger.info(f'Completed indexing {disk_name}: {files_count} files, {self.format_size(total_size)}')
finally:
cursor.close()
conn.close()
def calculate_disk_usage(self) -> Dict[str, Dict]:
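        """Return per-disk totals: {disk_label: {'size', 'count', 'formatted_size'}}."""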
conn = self.get_connection()
cursor = conn.cursor()
try:
            cursor.execute("""
                SELECT disk_label, SUM(size) as total_size, COUNT(*) as file_count
                FROM files
                GROUP BY disk_label
            """)
usage = {}
for row in cursor.fetchall():
disk = row[0]
size = int(row[1] or 0)
count = int(row[2])
usage[disk] = {'size': size, 'count': count, 'formatted_size': self.format_size(size)}
return usage
finally:
cursor.close()
conn.close()
def plan_migration(self, target_disk: str, destination_disks: List[str]) -> Dict:
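        """Plan moving every indexed file off target_disk onto destination_disks.

        Files are assigned to destinations round-robin, pending rows are written
        to `operations`, and the plan is saved to a timestamped JSON file.
        """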
logger.info(f'Planning migration to free up {target_disk}')
usage = self.calculate_disk_usage()
if target_disk not in usage:
logger.error(f'Target disk {target_disk} not found in index!')
return {}
conn = self.get_connection()
cursor = conn.cursor()
cursor.execute('SELECT path, size, modified_time FROM files WHERE disk_label = %s ORDER BY size DESC', (target_disk,))
files_to_move = cursor.fetchall()
cursor.close()
conn.close()
target_disk_usage = usage[target_disk]['size']
logger.info(f'Need to move {len(files_to_move)} files, {self.format_size(target_disk_usage)}')
        # NOTE: free-space checks are not implemented; every destination disk is
        # treated as having unlimited capacity, and files are simply assigned
        # round-robin across destination_disks below.
        dest_availability = []
        for disk in destination_disks:
            dest_availability.append({'disk': disk, 'available': float('inf'), 'planned_usage': 0})
plan = {'target_disk': target_disk, 'total_size': target_disk_usage, 'file_count': len(files_to_move), 'operations': [], 'destination_disks': destination_disks}
conn = self.get_connection()
cursor = conn.cursor()
try:
for file_info in files_to_move:
rel_path, size, mtime = file_info
dest_disk = destination_disks[len(plan['operations']) % len(destination_disks)]
op = {'source_disk': target_disk, 'source_path': rel_path, 'dest_disk': dest_disk, 'target_path': rel_path, 'size': int(size)}
plan['operations'].append(op)
cursor.execute('INSERT INTO operations (source_path, target_path, operation_type, status) VALUES (%s, %s, %s, %s)', (f'{target_disk}:{rel_path}', f'{dest_disk}:{rel_path}', 'move', 'pending'))
conn.commit()
finally:
cursor.close()
conn.close()
plan_file = f"migration_plan_{target_disk}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(plan_file, 'w') as f:
json.dump(plan, f, indent=2)
logger.info(f"Plan created with {len(plan['operations'])} operations")
logger.info(f'Plan saved to {plan_file}')
return plan
def verify_operation(self, source: Path, dest: Path) -> bool:
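        """Cheap post-copy check: destination exists and sizes match (no checksum comparison)."""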
if not dest.exists():
return False
try:
source_stat = source.stat()
dest_stat = dest.stat()
if source_stat.st_size != dest_stat.st_size:
return False
return True
except Exception as e:
logger.error(f'Verification error: {e}')
return False
@staticmethod
def file_checksum(path: Path) -> str:
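        """Return the MD5 hex digest of a file, read in 4 KiB chunks (a fingerprint, not a security hash)."""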
hash_md5 = hashlib.md5()
with open(path, 'rb') as f:
for chunk in iter(lambda: f.read(4096), b''):
hash_md5.update(chunk)
return hash_md5.hexdigest()
def execute_migration(self, plan_file: str, dry_run: bool=True):
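        """Copy files listed in plan_file and update the database.

        With dry_run=True only source existence is checked. Otherwise each file
        is copied with shutil.copy2, verified by size, and marked as moved;
        originals are not deleted here.
        """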
logger.info(f"{('DRY RUN' if dry_run else 'EXECUTING')} migration from {plan_file}")
with open(plan_file, 'r') as f:
plan = json.load(f)
operations = plan['operations']
logger.info(f'Processing {len(operations)} operations...')
success_count = 0
error_count = 0
start_time = time.time()
conn = self.get_connection()
cursor = conn.cursor()
try:
for i, op in enumerate(operations, 1):
source_disk = op['source_disk']
source_path = op['source_path']
dest_disk = op['dest_disk']
target_path = op['target_path']
source_full = Path(source_disk) / source_path
dest_full = Path(dest_disk) / target_path
elapsed = time.time() - start_time
rate = i / elapsed if elapsed > 0 else 0
eta = (len(operations) - i) / rate if rate > 0 else 0
display_path = str(source_path)
if len(display_path) > 50:
display_path = '...' + display_path[-47:]
print(f'\r[{i}/{len(operations)}] {success_count} OK, {error_count} ERR | {rate:.1f} files/s | ETA: {int(eta)}s | {display_path}', end='', flush=True)
if dry_run:
if source_full.exists():
success_count += 1
else:
logger.warning(f'\n Source does not exist: {source_full}')
error_count += 1
continue
try:
dest_full.parent.mkdir(parents=True, exist_ok=True)
if source_full.exists():
shutil.copy2(source_full, dest_full)
if self.verify_operation(source_full, dest_full):
cursor.execute("UPDATE files SET disk_label = %s, status = 'moved' WHERE path = %s AND disk_label = %s", (dest_disk, source_path, source_disk))
cursor.execute('UPDATE operations SET executed = 1, executed_at = CURRENT_TIMESTAMP WHERE source_path = %s', (f'{source_disk}:{source_path}',))
success_count += 1
else:
raise Exception('Verification failed')
else:
logger.warning(f'\n Source missing: {source_full}')
error_count += 1
except Exception as e:
logger.error(f'\n Error processing {source_path}: {e}')
cursor.execute('UPDATE operations SET error = %s WHERE source_path = %s', (str(e), f'{source_disk}:{source_path}'))
error_count += 1
if i % 10 == 0:
conn.commit()
conn.commit()
print()
finally:
cursor.close()
conn.close()
logger.info(f'Migration complete: {success_count} success, {error_count} errors')
if not dry_run and error_count == 0:
logger.info(f"✓ Disk {plan['target_disk']} is ready for Linux installation!")
logger.info(f" Remember to safely delete original files from {plan['target_disk']}")
def run_deduplication(self, disk: Optional[str]=None, use_chunks: bool=True):
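        """Two-phase deduplication: hash files, then mark duplicates.

        Phase 1 fills in missing SHA-256 checksums in batches of 1000; phase 2
        sets duplicate_of to the lexicographically smallest path that shares a
        checksum. The use_chunks flag is accepted but not used in this method.
        """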
logger.info(f"Starting deduplication{(' for disk ' + disk if disk else '')}")
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
conn = self.get_connection()
cursor = conn.cursor()
def hash_file_local(file_path: Path) -> str:
hasher = hashlib.sha256()
with open(file_path, 'rb') as f:
while (chunk := f.read(65536)):
hasher.update(chunk)
return hasher.hexdigest()
try:
if disk:
cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s AND checksum IS NULL ORDER BY size DESC', (disk,))
else:
cursor.execute('SELECT path, size, disk_label FROM files WHERE checksum IS NULL ORDER BY size DESC')
files_to_process = cursor.fetchall()
total = len(files_to_process)
logger.info(f'Found {total} files to hash')
processed = 0
skipped = 0
start_time = time.time()
batch = []
print(f'Phase 1: Computing checksums...')
for idx, (path_str, size, disk_label) in enumerate(files_to_process, 1):
try:
mount_point = disk_mount_map.get(disk_label, disk_label)
full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)
if not full_path.exists():
skipped += 1
if idx % 100 == 0:
elapsed = time.time() - start_time
rate = (processed + skipped) / elapsed if elapsed > 0 else 0
remaining = (total - idx) / rate if rate > 0 else 0
pct = 100 * idx / total
print(f'\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining / 60)}m{int(remaining % 60):02d}s | Skip: {skipped:,}', end='', flush=True)
continue
checksum = hash_file_local(full_path)
batch.append((checksum, path_str))
processed += 1
if len(batch) >= 1000:
try:
cursor.executemany('UPDATE files SET checksum = %s WHERE path = %s', batch)
conn.commit()
batch.clear()
except Exception as e:
conn.rollback()
batch.clear()
print(f'\nBatch update failed: {e}')
if idx % 100 == 0:
elapsed = time.time() - start_time
rate = (processed + skipped) / elapsed if elapsed > 0 else 0
remaining = (total - idx) / rate if rate > 0 else 0
pct = 100 * idx / total
print(f'\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining / 60)}m{int(remaining % 60):02d}s | Skip: {skipped:,}', end='', flush=True)
except Exception as e:
skipped += 1
if idx <= 5:
print(f'\nDebug: {full_path} - {e}')
if batch:
try:
cursor.executemany('UPDATE files SET checksum = %s WHERE path = %s', batch)
conn.commit()
except Exception as e:
conn.rollback()
print(f'\nFinal batch failed: {e}')
print()
elapsed = time.time() - start_time
logger.info(f'Phase 1 done: {processed:,} files in {int(elapsed / 60)}m{int(elapsed % 60):02d}s ({skipped:,} skipped)')
print('Phase 2: Finding duplicates...')
            cursor.execute("""
                UPDATE files f1 SET duplicate_of = (
                    SELECT MIN(path) FROM files f2
                    WHERE f2.checksum = f1.checksum AND f2.path < f1.path
                )
                WHERE checksum IS NOT NULL
            """)
conn.commit()
cursor.execute('SELECT COUNT(*) FROM files WHERE duplicate_of IS NOT NULL')
dup_count = cursor.fetchone()[0]
logger.info(f'Phase 2 done: Found {dup_count:,} duplicates')
finally:
cursor.close()
conn.close()
def plan_merge(self, sources: List[str], target: str, output_file: str, filter_system: bool=False, network_target: str=None):
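        """Plan merging the source disks into one target, keeping one copy per checksum.

        Optionally filters system/build files via GitignoreFilter and writes the
        resulting operations and space estimates to output_file as JSON.
        """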
        logger.info(f"Planning merge: {', '.join(sources)} → {target or network_target}")
if filter_system:
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from filters import GitignoreFilter
file_filter = GitignoreFilter()
logger.info('System/build file filtering enabled')
conn = self.get_connection()
cursor = conn.cursor()
try:
placeholders = ','.join(['%s'] * len(sources))
            cursor.execute(f"""
                SELECT path, size, checksum, disk_label, duplicate_of
                FROM files
                WHERE disk_label IN ({placeholders})
                ORDER BY size DESC
            """, tuple(sources))
files = cursor.fetchall()
total_files = len(files)
total_size = sum((int(f[1]) for f in files))
unique_files = {}
duplicate_count = 0
duplicate_size = 0
filtered_count = 0
filtered_size = 0
for path, size, checksum, disk_label, duplicate_of in files:
if filter_system and file_filter.should_exclude(path):
filtered_count += 1
filtered_size += int(size)
continue
if checksum and checksum in unique_files:
duplicate_count += 1
duplicate_size += int(size)
elif checksum:
unique_files[checksum] = (path, int(size), disk_label)
unique_count = len(unique_files)
unique_size = sum((f[1] for f in unique_files.values()))
plan = {'sources': sources, 'target': target or network_target, 'network': network_target is not None, 'total_files': total_files, 'total_size': total_size, 'unique_files': unique_count, 'unique_size': unique_size, 'duplicate_files': duplicate_count, 'duplicate_size': duplicate_size, 'filtered_files': filtered_count if filter_system else 0, 'filtered_size': filtered_size if filter_system else 0, 'space_saved': duplicate_size + (filtered_size if filter_system else 0), 'operations': []}
for checksum, (path, size, disk_label) in unique_files.items():
plan['operations'].append({'source_disk': disk_label, 'source_path': path, 'target_disk': target or network_target, 'target_path': path, 'size': size, 'checksum': checksum})
with open(output_file, 'w') as f:
json.dump(plan, f, indent=2)
logger.info(f'Merge plan saved to {output_file}')
print(f'\n=== MERGE PLAN SUMMARY ===')
print(f"Sources: {', '.join(sources)}")
print(f'Target: {target or network_target}')
print(f'Total files: {total_files:,} ({self.format_size(total_size)})')
if filter_system:
print(f'Filtered (system/build): {filtered_count:,} ({self.format_size(filtered_size)})')
print(f'Unique files: {unique_count:,} ({self.format_size(unique_size)})')
print(f'Duplicates: {duplicate_count:,} ({self.format_size(duplicate_size)})')
print(f"Total space saved: {self.format_size(plan['space_saved'])}")
print(f'Space needed on target: {self.format_size(unique_size)}')
finally:
cursor.close()
conn.close()
def generate_report(self, format='text', show_duplicates=False, preview_merge=None):
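        """Print status, per-disk usage, deduplication and operations summaries.

        With preview_merge, only the saved merge plan is summarised. The 'json'
        format option is accepted but only text output is produced here.
        """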
conn = self.get_connection()
cursor = conn.cursor()
try:
if preview_merge:
with open(preview_merge, 'r') as f:
plan = json.load(f)
print('\n=== MERGE PLAN PREVIEW ===')
print(f"Sources: {', '.join(plan['sources'])}")
print(f"Target: {plan['target']}")
print(f"Total files: {plan['total_files']:,} ({self.format_size(plan['total_size'])})")
print(f"Unique files: {plan['unique_files']:,} ({self.format_size(plan['unique_size'])})")
print(f"Duplicates: {plan['duplicate_files']:,} ({self.format_size(plan['duplicate_size'])})")
print(f"Space saved: {self.format_size(plan['space_saved'])}")
print(f"Space needed on target: {self.format_size(plan['unique_size'])}")
return
            cursor.execute('SELECT status, COUNT(*), SUM(size) FROM files GROUP BY status')
print('\n=== FILE MIGRATION REPORT ===')
for row in cursor.fetchall():
status, count, size = row
print(f'{status:15}: {count:6} files, {self.format_size(int(size or 0))}')
            cursor.execute('SELECT disk_label, COUNT(*), SUM(size) FROM files GROUP BY disk_label')
print('\n=== DISK USAGE ===')
for row in cursor.fetchall():
disk, count, size = row
print(f'{disk:20}: {count:6} files, {self.format_size(int(size or 0))}')
            cursor.execute('SELECT COUNT(*), SUM(size) FROM files WHERE checksum IS NOT NULL')
hashed_count, hashed_size = cursor.fetchone()
            cursor.execute('SELECT COUNT(*), SUM(size) FROM files WHERE duplicate_of IS NOT NULL')
dup_count, dup_size = cursor.fetchone()
print('\n=== DEDUPLICATION STATS ===')
print(f'Files with checksums: {hashed_count or 0:6}')
print(f'Duplicate files: {dup_count or 0:6} ({self.format_size(int(dup_size or 0))})')
if show_duplicates and dup_count:
print('\n=== DUPLICATE FILES ===')
                cursor.execute("""
                    SELECT path, size, duplicate_of FROM files
                    WHERE duplicate_of IS NOT NULL
                    ORDER BY size DESC
                    LIMIT 20
                """)
for path, size, dup_of in cursor.fetchall():
print(f' {path} ({self.format_size(int(size))}) → {dup_of}')
            cursor.execute('SELECT operation_type, executed, verified, COUNT(*) FROM operations GROUP BY operation_type, executed, verified')
print('\n=== OPERATIONS REPORT ===')
for row in cursor.fetchall():
op_type, executed, verified, count = row
status = 'EXECUTED' if executed else 'PENDING'
if verified:
status += '+VERIFIED'
print(f'{op_type:10} {status:15}: {count} operations')
finally:
cursor.close()
conn.close()
def profile_content(self, disk: Optional[str]=None, update_db: bool=False, limit: Optional[int]=None):
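        """Profile file contents with ContentProfiler and summarise by kind.

        With update_db=True each profile is stored under metadata->'profile'
        in batches of 500.
        """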
from content.profiler import ContentProfiler
profiler = ContentProfiler()
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
conn = self.get_connection()
cursor = conn.cursor()
try:
query = 'SELECT path, size, disk_label FROM files WHERE 1=1'
params = []
if disk:
query += ' AND disk_label = %s'
params.append(disk)
if limit:
query += f' LIMIT {limit}'
cursor.execute(query, params)
files = cursor.fetchall()
total = len(files)
logger.info(f'Profiling {total:,} files...')
kind_stats = {}
processable = 0
batch = []
for idx, (path, size, disk_label) in enumerate(files, 1):
mount_point = disk_mount_map.get(disk_label, disk_label)
full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
if not full_path.exists():
continue
profile = profiler.profile_file(full_path)
if 'error' not in profile:
kind = profile['kind']
if kind not in kind_stats:
kind_stats[kind] = {'count': 0, 'processable': 0}
kind_stats[kind]['count'] += 1
if profile['processable']:
kind_stats[kind]['processable'] += 1
processable += 1
if update_db:
profile_json = json.dumps(profile)
batch.append((kind, profile_json, path))
if len(batch) >= 500:
cursor.executemany("UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s", [(pj, p) for k, pj, p in batch])
conn.commit()
batch.clear()
if idx % 100 == 0:
print(f'\rProfiled: {idx:,}/{total:,}', end='', flush=True)
if update_db and batch:
cursor.executemany("UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s", [(pj, p) for k, pj, p in batch])
conn.commit()
print()
print(f'\n=== CONTENT PROFILE SUMMARY ===')
print(f'Total files: {total:,}')
print(f'Processable: {processable:,}\n')
print(f"{'Kind':<15} {'Total':<10} {'Processable':<12} {'Extractor'}")
print('-' * 60)
for kind in sorted(kind_stats.keys()):
stats = kind_stats[kind]
extractor = profiler._suggest_extractor(kind, '')
print(f"{kind:<15} {stats['count']:<10,} {stats['processable']:<12,} {extractor or 'none'}")
finally:
cursor.close()
conn.close()
def extract_content(self, kind: Optional[str]=None, limit: int=10):
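        """Extract content from up to `limit` files previously marked processable by the profile step."""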
from content.profiler import ContentProfiler
from content.extractors import ContentExtractor
profiler = ContentProfiler()
extractor = ContentExtractor()
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
conn = self.get_connection()
cursor = conn.cursor()
try:
query = "SELECT path, size, disk_label, metadata FROM files WHERE metadata->'profile'->>'processable' = 'true'"
params = []
if kind:
query += " AND metadata->'profile'->>'kind' = %s"
params.append(kind)
query += f' LIMIT {limit}'
cursor.execute(query, params)
files = cursor.fetchall()
print(f'\n=== EXTRACTING CONTENT ===')
print(f'Processing {len(files)} files\n')
for path, size, disk_label, metadata in files:
mount_point = disk_mount_map.get(disk_label, disk_label)
full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
if not full_path.exists():
continue
profile = metadata.get('profile', {}) if metadata else {}
extractor_type = profile.get('extractor')
if not extractor_type:
continue
print(f'Extracting: {path}')
print(f" Type: {profile.get('kind')} | Extractor: {extractor_type}")
result = extractor.extract(full_path, extractor_type)
if 'text' in result:
preview = result['text'][:200]
print(f' Preview: {preview}...')
elif 'pipeline' in result:
                    print(f" Pipeline: {' → '.join(result['pipeline'])}")
print(f" Status: {result.get('status', 'pending')}")
print()
finally:
cursor.close()
conn.close()
def classify_files(self, disk: Optional[str]=None, update_db: bool=False):
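        """Classify files by path with FileClassifier and summarise per category.

        With update_db=True the category column is updated in batches of 1000;
        labels are computed but not persisted here.
        """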
from classification.classifier import FileClassifier
classifier = FileClassifier()
conn = self.get_connection()
cursor = conn.cursor()
try:
if disk:
cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s', (disk,))
else:
cursor.execute('SELECT path, size, disk_label FROM files')
files = cursor.fetchall()
total = len(files)
logger.info(f'Classifying {total:,} files...')
categories = {}
build_artifacts = 0
batch = []
for idx, (path, size, disk_label) in enumerate(files, 1):
labels, category, is_build = classifier.classify_path(path, int(size))
if is_build:
build_artifacts += 1
if category not in categories:
categories[category] = {'count': 0, 'size': 0}
categories[category]['count'] += 1
categories[category]['size'] += int(size)
if update_db:
labels_str = ','.join(labels)
batch.append((category, labels_str, path))
if len(batch) >= 1000:
cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch])
conn.commit()
batch.clear()
if idx % 1000 == 0:
print(f'\rClassified: {idx:,}/{total:,}', end='', flush=True)
if update_db and batch:
cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch])
conn.commit()
print()
print(f'\n=== CLASSIFICATION SUMMARY ===')
print(f'Total files: {total:,}')
print(f'Build artifacts: {build_artifacts:,}')
print(f'\nCategories:')
for category in sorted(categories.keys()):
info = categories[category]
print(f" {category:30}: {info['count']:8,} files, {self.format_size(info['size'])}")
finally:
cursor.close()
conn.close()
def review_migration(self, category: Optional[str]=None, show_build: bool=False):
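        """Preview up to 100 proposed target paths, grouped by category.

        Build artifacts are hidden unless show_build=True.
        """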
from classification.classifier import FileClassifier
classifier = FileClassifier()
conn = self.get_connection()
cursor = conn.cursor()
try:
query = 'SELECT path, size, category FROM files WHERE 1=1'
params = []
if category:
query += ' AND category = %s'
params.append(category)
if not show_build:
                query += " AND (metadata->>'labels' IS NULL OR metadata->>'labels' NOT LIKE '%%build-artifact%%')"
query += ' ORDER BY category, size DESC LIMIT 100'
cursor.execute(query, params)
files = cursor.fetchall()
if not files:
print('No files found matching criteria')
return
print(f'\n=== MIGRATION PREVIEW ===')
print(f'Showing {len(files)} files\n')
current_category = None
for path, size, cat in files:
if cat != current_category:
current_category = cat
print(f'\n{cat}:')
labels, suggested_cat, is_build = classifier.classify_path(path, int(size))
target = classifier.suggest_target_path(path, suggested_cat, labels)
print(f' {path}')
                print(f'    → {target} ({self.format_size(int(size))})')
finally:
cursor.close()
conn.close()
@staticmethod
def format_size(size: int) -> str:
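        """Render a byte count as a human-readable string (B through PB)."""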
for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
if size < 1024:
return f'{size:.1f}{unit}'
size /= 1024
return f'{size:.1f}PB'
def main():
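    """CLI entry point: index, plan, execute, dedupe, merge, profile, extract, classify, review and report subcommands."""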
parser = argparse.ArgumentParser(description='Disk Reorganizer - Free up a disk for Linux dual-boot')
subparsers = parser.add_subparsers(dest='command', required=True)
index_parser = subparsers.add_parser('index', help='Index files on a disk')
index_parser.add_argument('disk_root', help='Root path of disk (e.g., D:\\\\)')
index_parser.add_argument('disk_name', help='Logical name for the disk')
plan_parser = subparsers.add_parser('plan', help='Create migration plan')
plan_parser.add_argument('target_disk', help='Disk to free up')
plan_parser.add_argument('dest_disks', nargs='+', help='Destination disks')
exec_parser = subparsers.add_parser('execute', help='Execute migration plan')
exec_parser.add_argument('plan_file', help='Path to plan JSON file')
exec_parser.add_argument('--dry-run', action='store_true', help='Simulate without actual file operations')
dedupe_parser = subparsers.add_parser('dedupe', help='Deduplicate files and compute checksums')
dedupe_parser.add_argument('--disk', help='Optional: Only dedupe specific disk')
dedupe_parser.add_argument('--no-chunks', action='store_true', help='Disable chunk-level deduplication')
merge_parser = subparsers.add_parser('merge', help='Plan multi-disk merge with deduplication')
merge_parser.add_argument('--sources', nargs='+', required=True, help='Source disks to merge')
merge_parser.add_argument('--target', required=True, help='Target disk')
merge_parser.add_argument('--output', default='merge_plan.json', help='Output plan file')
merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')
profile_parser = subparsers.add_parser('profile', help='Create content profiles (inventory + triage)')
profile_parser.add_argument('--disk', help='Profile specific disk')
profile_parser.add_argument('--update', action='store_true', help='Update database with profiles')
profile_parser.add_argument('--limit', type=int, help='Limit number of files')
extract_parser = subparsers.add_parser('extract', help='Extract content from files')
extract_parser.add_argument('--kind', help='Extract specific kind (pdf, image, audio, video)')
extract_parser.add_argument('--limit', type=int, default=10, help='Limit extraction batch')
classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization')
classify_parser.add_argument('--disk', help='Classify specific disk')
classify_parser.add_argument('--update', action='store_true', help='Update database with classifications')
review_parser = subparsers.add_parser('review', help='Review proposed migration structure')
review_parser.add_argument('--category', help='Review specific category')
review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts')
report_parser = subparsers.add_parser('report', help='Show current status')
report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
report_parser.add_argument('--show-duplicates', action='store_true', help='Show duplicate files')
report_parser.add_argument('--preview-merge', help='Preview merge plan from file')
args = parser.parse_args()
tool = DiskReorganizer()
if args.command == 'index':
tool.index_disk(args.disk_root, args.disk_name)
elif args.command == 'dedupe':
tool.run_deduplication(disk=args.disk, use_chunks=not args.no_chunks)
elif args.command == 'merge':
tool.plan_merge(sources=args.sources, target=args.target, output_file=args.output, filter_system=args.filter_system, network_target=args.network)
elif args.command == 'plan':
plan = tool.plan_migration(args.target_disk, args.dest_disks)
if plan:
print(f"\nPlan generated: {plan['file_count']} files, {tool.format_size(plan['total_size'])}")
print(f"Destination disks: {', '.join(plan['destination_disks'])}")
elif args.command == 'execute':
tool.execute_migration(args.plan_file, dry_run=args.dry_run)
elif args.command == 'profile':
tool.profile_content(disk=args.disk, update_db=args.update, limit=args.limit)
elif args.command == 'extract':
tool.extract_content(kind=args.kind, limit=args.limit)
elif args.command == 'classify':
tool.classify_files(disk=args.disk, update_db=args.update)
elif args.command == 'review':
tool.review_migration(category=args.category, show_build=args.show_build)
elif args.command == 'report':
tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)
if __name__ == '__main__':
main()