This commit is contained in:
mike
2025-12-13 11:56:06 +01:00
commit 2b2c575385
57 changed files with 6505 additions and 0 deletions

app/main.py Normal file

@@ -0,0 +1,918 @@
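"""Disk Reorganizer: index disks into PostgreSQL, then plan and execute migrations.

High-level pipeline, inferred from the commands wired up in main() below:
index -> dedupe -> merge/plan -> execute -> profile/parse/enrich/classify ->
analyze-folders -> review -> report.

The tool assumes a PostgreSQL schema created by setup_database.sh (tables such as
`files`, `operations`, `folders`, `processing_checkpoints`); the column names used in
the queries below (checksum, duplicate_of, metadata, extracted_text, category, ...)
are taken from those queries, not from a schema shipped in this file.
"""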
import os
import sys
from dataclasses import dataclass
import psycopg2
import shutil
import hashlib
import argparse
import json
from pathlib import Path
from typing import List, Dict, Optional
from datetime import datetime
import logging
import time
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.FileHandler('disk_reorganizer.log'), logging.StreamHandler(sys.stdout)])
logger = logging.getLogger(__name__)
@dataclass
class FileRecord:
path: str
size: int
modified_time: float
disk_label: str
checksum: Optional[str] = None
status: str = 'indexed'
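# FileRecord mirrors a row of the `files` table; it does not appear to be referenced
# elsewhere in this module and is presumably kept for future use.
# DiskReorganizer reads its connection settings from the DB_HOST/DB_PORT/DB_NAME/
# DB_USER/DB_PASSWORD environment variables, falling back to the hard-coded defaults
# in __init__ below.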
class DiskReorganizer:
def __init__(self, db_config: Optional[Dict]=None):
if db_config is None:
db_config = {'host': os.getenv('DB_HOST', '192.168.1.159'), 'port': int(os.getenv('DB_PORT', 5432)), 'database': os.getenv('DB_NAME', 'disk_reorganizer_db'), 'user': os.getenv('DB_USER', 'disk_reorg_user'), 'password': os.getenv('DB_PASSWORD', 'heel-goed-wachtwoord')}
self.db_config = db_config
self.init_database()
def get_connection(self):
return psycopg2.connect(**self.db_config)
def init_database(self):
try:
conn = self.get_connection()
cursor = conn.cursor()
cursor.execute("\n SELECT table_name FROM information_schema.tables\n WHERE table_schema = 'public' AND table_name IN ('files', 'operations')\n ")
tables = cursor.fetchall()
if len(tables) < 2:
logger.error('Database tables not found! Please run setup_database.sh first.')
raise Exception('Database not properly initialized. Run setup_database.sh')
cursor.close()
conn.close()
logger.info('Database connection verified successfully')
except psycopg2.Error as e:
logger.error(f'Database connection failed: {e}')
raise
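# index_disk walks a mounted disk, skips Windows system folders ($*, "System Volume
# Information", "Recovery"), and upserts one row per file into `files`, keyed by the
# path relative to the disk root. Progress is printed every 100 files and committed
# every 1000; per-file errors are rolled back, logged, and the walk continues.
# Note that the ON CONFLICT key is `path` alone, so identical relative paths on
# different disks overwrite each other's rows.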
def index_disk(self, disk_root: str, disk_name: str):
logger.info(f'Indexing disk: {disk_name} at {disk_root}')
disk_path = Path(disk_root)
if not disk_path.exists():
logger.error(f'Disk path {disk_root} does not exist!')
return
files_count = 0
total_size = 0
start_time = time.time()
conn = self.get_connection()
cursor = conn.cursor()
try:
for root, dirs, files in os.walk(disk_path):
dirs[:] = [d for d in dirs if not d.startswith(('$', 'System Volume Information', 'Recovery'))]
for file in files:
try:
file_path = Path(root) / file
if not file_path.is_file():
continue
stat = file_path.stat()
size = stat.st_size
mtime = datetime.fromtimestamp(stat.st_mtime)
rel_path = str(file_path.relative_to(disk_path))
cursor.execute('\n INSERT INTO files (path, size, modified_time, disk_label, checksum, status)\n VALUES (%s, %s, %s, %s, %s, %s)\n ON CONFLICT (path) DO UPDATE SET\n size = EXCLUDED.size,\n modified_time = EXCLUDED.modified_time,\n disk_label = EXCLUDED.disk_label,\n status = EXCLUDED.status\n ', (rel_path, size, mtime, disk_name, None, 'indexed'))
files_count += 1
total_size += size
if files_count % 100 == 0:
elapsed = time.time() - start_time
rate = files_count / elapsed if elapsed > 0 else 0
display_path = str(file_path)
if len(display_path) > 60:
display_path = '...' + display_path[-57:]
print(f'\rIndexing: {files_count:,} files | {self.format_size(total_size)} | {rate:.0f} files/s | {display_path}', end='', flush=True)
if files_count % 1000 == 0:
conn.commit()
except Exception as e:
conn.rollback()
logger.warning(f'\nSkipping {file_path}: {e}')
continue
conn.commit()
print()
logger.info(f'Completed indexing {disk_name}: {files_count} files, {self.format_size(total_size)}')
finally:
cursor.close()
conn.close()
def calculate_disk_usage(self) -> Dict[str, Dict]:
conn = self.get_connection()
cursor = conn.cursor()
try:
cursor.execute('\n SELECT disk_label, SUM(size) as total_size, COUNT(*) as file_count\n FROM files\n GROUP BY disk_label\n ')
usage = {}
for row in cursor.fetchall():
disk = row[0]
size = int(row[1] or 0)
count = int(row[2])
usage[disk] = {'size': size, 'count': count, 'formatted_size': self.format_size(size)}
return usage
finally:
cursor.close()
conn.close()
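# plan_migration lists every file on the target disk (largest first), assigns each one
# to a destination disk round-robin, records the intended move in `operations`, and
# writes the whole plan to a timestamped JSON file. Destination capacity is not checked
# yet (see the note inside the loop below).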
def plan_migration(self, target_disk: str, destination_disks: List[str]) -> Dict:
logger.info(f'Planning migration to free up {target_disk}')
usage = self.calculate_disk_usage()
if target_disk not in usage:
logger.error(f'Target disk {target_disk} not found in index!')
return {}
conn = self.get_connection()
cursor = conn.cursor()
cursor.execute('SELECT path, size, modified_time FROM files WHERE disk_label = %s ORDER BY size DESC', (target_disk,))
files_to_move = cursor.fetchall()
cursor.close()
conn.close()
target_disk_usage = usage[target_disk]['size']
logger.info(f'Need to move {len(files_to_move)} files, {self.format_size(target_disk_usage)}')
dest_availability = []
for disk in destination_disks:
# Free-space probing is not implemented yet: both branches of the original check
# assigned float('inf'), so every destination disk is treated as having unlimited space.
available = float('inf')
dest_availability.append({'disk': disk, 'available': available, 'planned_usage': 0})
plan = {'target_disk': target_disk, 'total_size': target_disk_usage, 'file_count': len(files_to_move), 'operations': [], 'destination_disks': destination_disks}
conn = self.get_connection()
cursor = conn.cursor()
try:
for file_info in files_to_move:
rel_path, size, mtime = file_info
dest_disk = destination_disks[len(plan['operations']) % len(destination_disks)]
op = {'source_disk': target_disk, 'source_path': rel_path, 'dest_disk': dest_disk, 'target_path': rel_path, 'size': int(size)}
plan['operations'].append(op)
cursor.execute('INSERT INTO operations (source_path, target_path, operation_type, status) VALUES (%s, %s, %s, %s)', (f'{target_disk}:{rel_path}', f'{dest_disk}:{rel_path}', 'move', 'pending'))
conn.commit()
finally:
cursor.close()
conn.close()
plan_file = f"migration_plan_{target_disk}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(plan_file, 'w') as f:
json.dump(plan, f, indent=2)
logger.info(f"Plan created with {len(plan['operations'])} operations")
logger.info(f'Plan saved to {plan_file}')
return plan
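# verify_operation only confirms that the destination exists and matches the source
# size; it does not compare checksums. file_checksum() below (MD5) is available for a
# stricter check but is not called from execute_migration in this version.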
def verify_operation(self, source: Path, dest: Path) -> bool:
if not dest.exists():
return False
try:
source_stat = source.stat()
dest_stat = dest.stat()
if source_stat.st_size != dest_stat.st_size:
return False
return True
except Exception as e:
logger.error(f'Verification error: {e}')
return False
@staticmethod
def file_checksum(path: Path) -> str:
hash_md5 = hashlib.md5()
with open(path, 'rb') as f:
for chunk in iter(lambda: f.read(4096), b''):
hash_md5.update(chunk)
return hash_md5.hexdigest()
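# execute_migration replays a plan JSON file. In --dry-run mode it only checks that
# each source file exists. Otherwise it copy2()s the file, verifies it (size check
# above), updates `files` and `operations`, and commits every 10 operations. Originals
# are NOT deleted; the closing log message reminds the operator to remove them manually.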
def execute_migration(self, plan_file: str, dry_run: bool=True):
logger.info(f"{('DRY RUN' if dry_run else 'EXECUTING')} migration from {plan_file}")
with open(plan_file, 'r') as f:
plan = json.load(f)
operations = plan['operations']
logger.info(f'Processing {len(operations)} operations...')
success_count = 0
error_count = 0
start_time = time.time()
conn = self.get_connection()
cursor = conn.cursor()
try:
for i, op in enumerate(operations, 1):
source_disk = op['source_disk']
source_path = op['source_path']
dest_disk = op['dest_disk']
target_path = op['target_path']
source_full = Path(source_disk) / source_path
dest_full = Path(dest_disk) / target_path
elapsed = time.time() - start_time
rate = i / elapsed if elapsed > 0 else 0
eta = (len(operations) - i) / rate if rate > 0 else 0
display_path = str(source_path)
if len(display_path) > 50:
display_path = '...' + display_path[-47:]
print(f'\r[{i}/{len(operations)}] {success_count} OK, {error_count} ERR | {rate:.1f} files/s | ETA: {int(eta)}s | {display_path}', end='', flush=True)
if dry_run:
if source_full.exists():
success_count += 1
else:
logger.warning(f'\n Source does not exist: {source_full}')
error_count += 1
continue
try:
dest_full.parent.mkdir(parents=True, exist_ok=True)
if source_full.exists():
shutil.copy2(source_full, dest_full)
if self.verify_operation(source_full, dest_full):
cursor.execute("UPDATE files SET disk_label = %s, status = 'moved' WHERE path = %s AND disk_label = %s", (dest_disk, source_path, source_disk))
cursor.execute('UPDATE operations SET executed = 1, executed_at = CURRENT_TIMESTAMP WHERE source_path = %s', (f'{source_disk}:{source_path}',))
success_count += 1
else:
raise Exception('Verification failed')
else:
logger.warning(f'\n Source missing: {source_full}')
error_count += 1
except Exception as e:
logger.error(f'\n Error processing {source_path}: {e}')
cursor.execute('UPDATE operations SET error = %s WHERE source_path = %s', (str(e), f'{source_disk}:{source_path}'))
error_count += 1
if i % 10 == 0:
conn.commit()
conn.commit()
print()
finally:
cursor.close()
conn.close()
logger.info(f'Migration complete: {success_count} success, {error_count} errors')
if not dry_run and error_count == 0:
logger.info(f"✓ Disk {plan['target_disk']} is ready for Linux installation!")
logger.info(f" Remember to safely delete original files from {plan['target_disk']}")
def run_deduplication(self, disk: Optional[str]=None, use_chunks: bool=True):
logger.info(f"Starting deduplication{(' for disk ' + disk if disk else '')}")
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
conn = self.get_connection()
cursor = conn.cursor()
def hash_file_local(file_path: Path) -> str:
hasher = hashlib.sha256()
with open(file_path, 'rb') as f:
while (chunk := f.read(65536)):
hasher.update(chunk)
return hasher.hexdigest()
try:
if disk:
cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s AND checksum IS NULL ORDER BY size DESC', (disk,))
else:
cursor.execute('SELECT path, size, disk_label FROM files WHERE checksum IS NULL ORDER BY size DESC')
files_to_process = cursor.fetchall()
total = len(files_to_process)
logger.info(f'Found {total} files to hash')
processed = 0
skipped = 0
start_time = time.time()
batch = []
print(f'Phase 1: Computing checksums...')
for idx, (path_str, size, disk_label) in enumerate(files_to_process, 1):
try:
mount_point = disk_mount_map.get(disk_label, disk_label)
full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)
if not full_path.exists():
skipped += 1
if idx % 100 == 0:
elapsed = time.time() - start_time
rate = (processed + skipped) / elapsed if elapsed > 0 else 0
remaining = (total - idx) / rate if rate > 0 else 0
pct = 100 * idx / total
print(f'\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining / 60)}m{int(remaining % 60):02d}s | Skip: {skipped:,}', end='', flush=True)
continue
checksum = hash_file_local(full_path)
batch.append((checksum, path_str))
processed += 1
if len(batch) >= 1000:
try:
cursor.executemany('UPDATE files SET checksum = %s WHERE path = %s', batch)
conn.commit()
batch.clear()
except Exception as e:
conn.rollback()
batch.clear()
print(f'\nBatch update failed: {e}')
if idx % 100 == 0:
elapsed = time.time() - start_time
rate = (processed + skipped) / elapsed if elapsed > 0 else 0
remaining = (total - idx) / rate if rate > 0 else 0
pct = 100 * idx / total
print(f'\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining / 60)}m{int(remaining % 60):02d}s | Skip: {skipped:,}', end='', flush=True)
except Exception as e:
skipped += 1
if idx <= 5:
print(f'\nDebug: {full_path} - {e}')
if batch:
try:
cursor.executemany('UPDATE files SET checksum = %s WHERE path = %s', batch)
conn.commit()
except Exception as e:
conn.rollback()
print(f'\nFinal batch failed: {e}')
print()
elapsed = time.time() - start_time
logger.info(f'Phase 1 done: {processed:,} files in {int(elapsed / 60)}m{int(elapsed % 60):02d}s ({skipped:,} skipped)')
print('Phase 2: Finding duplicates...')
cursor.execute('\n UPDATE files f1 SET duplicate_of = (\n SELECT MIN(path) FROM files f2\n WHERE f2.checksum = f1.checksum AND f2.path < f1.path\n )\n WHERE checksum IS NOT NULL\n ')
conn.commit()
cursor.execute('SELECT COUNT(*) FROM files WHERE duplicate_of IS NOT NULL')
dup_count = cursor.fetchone()[0]
logger.info(f'Phase 2 done: Found {dup_count:,} duplicates')
finally:
cursor.close()
conn.close()
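# plan_merge builds a deduplicated merge plan: it pulls all rows for the source disks,
# optionally drops system/build files via GitignoreFilter (from the sibling `filters`
# module), keeps one copy per checksum, and writes the resulting copy operations plus a
# space-savings summary to a JSON plan file. Files without a checksum are neither
# counted as unique nor planned, so `dedupe` should be run first.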
def plan_merge(self, sources: List[str], target: str, output_file: str, filter_system: bool=False, network_target: Optional[str]=None):
logger.info(f"Planning merge: {', '.join(sources)} -> {target or network_target}")
if filter_system:
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from filters import GitignoreFilter
file_filter = GitignoreFilter()
logger.info('System/build file filtering enabled')
conn = self.get_connection()
cursor = conn.cursor()
try:
placeholders = ','.join(['%s'] * len(sources))
cursor.execute(f'\n SELECT path, size, checksum, disk_label, duplicate_of\n FROM files\n WHERE disk_label IN ({placeholders})\n ORDER BY size DESC\n ', tuple(sources))
files = cursor.fetchall()
total_files = len(files)
total_size = sum((int(f[1]) for f in files))
unique_files = {}
duplicate_count = 0
duplicate_size = 0
filtered_count = 0
filtered_size = 0
for path, size, checksum, disk_label, duplicate_of in files:
if filter_system and file_filter.should_exclude(path):
filtered_count += 1
filtered_size += int(size)
continue
if checksum and checksum in unique_files:
duplicate_count += 1
duplicate_size += int(size)
elif checksum:
unique_files[checksum] = (path, int(size), disk_label)
unique_count = len(unique_files)
unique_size = sum((f[1] for f in unique_files.values()))
plan = {'sources': sources, 'target': target or network_target, 'network': network_target is not None, 'total_files': total_files, 'total_size': total_size, 'unique_files': unique_count, 'unique_size': unique_size, 'duplicate_files': duplicate_count, 'duplicate_size': duplicate_size, 'filtered_files': filtered_count if filter_system else 0, 'filtered_size': filtered_size if filter_system else 0, 'space_saved': duplicate_size + (filtered_size if filter_system else 0), 'operations': []}
for checksum, (path, size, disk_label) in unique_files.items():
plan['operations'].append({'source_disk': disk_label, 'source_path': path, 'target_disk': target or network_target, 'target_path': path, 'size': size, 'checksum': checksum})
with open(output_file, 'w') as f:
json.dump(plan, f, indent=2)
logger.info(f'Merge plan saved to {output_file}')
print(f'\n=== MERGE PLAN SUMMARY ===')
print(f"Sources: {', '.join(sources)}")
print(f'Target: {target or network_target}')
print(f'Total files: {total_files:,} ({self.format_size(total_size)})')
if filter_system:
print(f'Filtered (system/build): {filtered_count:,} ({self.format_size(filtered_size)})')
print(f'Unique files: {unique_count:,} ({self.format_size(unique_size)})')
print(f'Duplicates: {duplicate_count:,} ({self.format_size(duplicate_size)})')
print(f"Total space saved: {self.format_size(plan['space_saved'])}")
print(f'Space needed on target: {self.format_size(unique_size)}')
finally:
cursor.close()
conn.close()
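# generate_report prints status/disk/deduplication/operations summaries straight from
# SQL aggregates; --preview-merge short-circuits and only summarizes a saved merge plan.
# The --format flag is accepted but only the text layout is implemented here.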
def generate_report(self, format='text', show_duplicates=False, preview_merge=None):
conn = self.get_connection()
cursor = conn.cursor()
try:
if preview_merge:
with open(preview_merge, 'r') as f:
plan = json.load(f)
print('\n=== MERGE PLAN PREVIEW ===')
print(f"Sources: {', '.join(plan['sources'])}")
print(f"Target: {plan['target']}")
print(f"Total files: {plan['total_files']:,} ({self.format_size(plan['total_size'])})")
print(f"Unique files: {plan['unique_files']:,} ({self.format_size(plan['unique_size'])})")
print(f"Duplicates: {plan['duplicate_files']:,} ({self.format_size(plan['duplicate_size'])})")
print(f"Space saved: {self.format_size(plan['space_saved'])}")
print(f"Space needed on target: {self.format_size(plan['unique_size'])}")
return
cursor.execute('\n SELECT status, COUNT(*), SUM(size) FROM files GROUP BY status\n ')
print('\n=== FILE MIGRATION REPORT ===')
for row in cursor.fetchall():
status, count, size = row
print(f'{status:15}: {count:6} files, {self.format_size(int(size or 0))}')
cursor.execute('\n SELECT disk_label, COUNT(*), SUM(size) FROM files GROUP BY disk_label\n ')
print('\n=== DISK USAGE ===')
for row in cursor.fetchall():
disk, count, size = row
print(f'{disk:20}: {count:6} files, {self.format_size(int(size or 0))}')
cursor.execute('\n SELECT COUNT(*), SUM(size) FROM files WHERE checksum IS NOT NULL\n ')
hashed_count, hashed_size = cursor.fetchone()
cursor.execute('\n SELECT COUNT(*), SUM(size) FROM files WHERE duplicate_of IS NOT NULL\n ')
dup_count, dup_size = cursor.fetchone()
print('\n=== DEDUPLICATION STATS ===')
print(f'Files with checksums: {hashed_count or 0:6}')
print(f'Duplicate files: {dup_count or 0:6} ({self.format_size(int(dup_size or 0))})')
if show_duplicates and dup_count:
print('\n=== DUPLICATE FILES ===')
cursor.execute('\n SELECT path, size, duplicate_of FROM files\n WHERE duplicate_of IS NOT NULL\n ORDER BY size DESC\n LIMIT 20\n ')
for path, size, dup_of in cursor.fetchall():
print(f' {path} ({self.format_size(int(size))}) → {dup_of}')
cursor.execute('\n SELECT operation_type, executed, verified, COUNT(*) FROM operations GROUP BY operation_type, executed, verified\n ')
print('\n=== OPERATIONS REPORT ===')
for row in cursor.fetchall():
op_type, executed, verified, count = row
status = 'EXECUTED' if executed else 'PENDING'
if verified:
status += '+VERIFIED'
print(f'{op_type:10} {status:15}: {count} operations')
finally:
cursor.close()
conn.close()
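# profile_content runs ContentProfiler over indexed files and, with --update, stores
# each profile under metadata->'profile' (jsonb) in batches of 500. The files.metadata
# jsonb column is assumed to exist; ContentProfiler lives in the sibling `content`
# package and is not defined in this file.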
def profile_content(self, disk: Optional[str]=None, update_db: bool=False, limit: Optional[int]=None):
from content.profiler import ContentProfiler
profiler = ContentProfiler()
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
conn = self.get_connection()
cursor = conn.cursor()
try:
query = 'SELECT path, size, disk_label FROM files WHERE 1=1'
params = []
if disk:
query += ' AND disk_label = %s'
params.append(disk)
if limit:
query += f' LIMIT {limit}'
cursor.execute(query, params)
files = cursor.fetchall()
total = len(files)
logger.info(f'Profiling {total:,} files...')
kind_stats = {}
processable = 0
batch = []
for idx, (path, size, disk_label) in enumerate(files, 1):
mount_point = disk_mount_map.get(disk_label, disk_label)
full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
if not full_path.exists():
continue
profile = profiler.profile_file(full_path)
if 'error' not in profile:
kind = profile['kind']
if kind not in kind_stats:
kind_stats[kind] = {'count': 0, 'processable': 0}
kind_stats[kind]['count'] += 1
if profile['processable']:
kind_stats[kind]['processable'] += 1
processable += 1
if update_db:
profile_json = json.dumps(profile)
batch.append((kind, profile_json, path))
if len(batch) >= 500:
cursor.executemany("UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s", [(pj, p) for k, pj, p in batch])
conn.commit()
batch.clear()
if idx % 100 == 0:
print(f'\rProfiled: {idx:,}/{total:,}', end='', flush=True)
if update_db and batch:
cursor.executemany("UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s", [(pj, p) for k, pj, p in batch])
conn.commit()
print()
print(f'\n=== CONTENT PROFILE SUMMARY ===')
print(f'Total files: {total:,}')
print(f'Processable: {processable:,}\n')
print(f"{'Kind':<15} {'Total':<10} {'Processable':<12} {'Extractor'}")
print('-' * 60)
for kind in sorted(kind_stats.keys()):
stats = kind_stats[kind]
extractor = profiler._suggest_extractor(kind, '')
print(f"{kind:<15} {stats['count']:<10,} {stats['processable']:<12,} {extractor or 'none'}")
finally:
cursor.close()
conn.close()
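# extract_content picks files whose stored profile says processable = true, routes each
# one to the extractor type suggested by the profile, and prints either a 200-character
# text preview or the proposed processing pipeline. Nothing is written back to the
# database here.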
def extract_content(self, kind: Optional[str]=None, limit: int=10):
from content.profiler import ContentProfiler
from content.extractors import ContentExtractor
profiler = ContentProfiler()
extractor = ContentExtractor()
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
conn = self.get_connection()
cursor = conn.cursor()
try:
query = "SELECT path, size, disk_label, metadata FROM files WHERE metadata->'profile'->>'processable' = 'true'"
params = []
if kind:
query += " AND metadata->'profile'->>'kind' = %s"
params.append(kind)
query += f' LIMIT {limit}'
cursor.execute(query, params)
files = cursor.fetchall()
print(f'\n=== EXTRACTING CONTENT ===')
print(f'Processing {len(files)} files\n')
for path, size, disk_label, metadata in files:
mount_point = disk_mount_map.get(disk_label, disk_label)
full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
if not full_path.exists():
continue
profile = metadata.get('profile', {}) if metadata else {}
extractor_type = profile.get('extractor')
if not extractor_type:
continue
print(f'Extracting: {path}')
print(f" Type: {profile.get('kind')} | Extractor: {extractor_type}")
result = extractor.extract(full_path, extractor_type)
if 'text' in result:
preview = result['text'][:200]
print(f' Preview: {preview}...')
elif 'pipeline' in result:
print(f" Pipeline: {''.join(result['pipeline'])}")
print(f" Status: {result.get('status', 'pending')}")
print()
finally:
cursor.close()
conn.close()
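# parse_files sends small files (<= 10 MB) to a text/code/pdf parser chosen from the
# file extension, prints a one-line summary per file, and with --update stores the
# first 50,000 characters in files.extracted_text together with a quality flag. The
# parser classes come from the sibling `parsers` package.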
def parse_files(self, kind: Optional[str] = None, limit: int = 100, update_db: bool = False):
from parsers.text_parser import TextParser
from parsers.code_parser import CodeParser
from parsers.pdf_parser import PDFParser
parsers = {'text': TextParser(), 'code': CodeParser(), 'pdf': PDFParser()}
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
conn = self.get_connection()
cursor = conn.cursor()
try:
query = "SELECT path, size, disk_label FROM files WHERE 1=1"
params = []
if kind:
suffix_map = {'text': "('.txt', '.md', '.log', '.json')", 'code': "('.py', '.js', '.java', '.go')", 'pdf': "('.pdf',)"}
if kind in suffix_map:
query += f" AND RIGHT(path, 4) IN {suffix_map[kind]} OR RIGHT(path, 3) IN {suffix_map[kind]}"
query += f" LIMIT {limit}"
cursor.execute(query, params)
files = cursor.fetchall()
print(f"\n=== PARSING FILES ===\nProcessing {len(files)} files\n")
parsed_count = 0
for path, size, disk_label in files:
mount_point = disk_mount_map.get(disk_label, disk_label)
full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
if not full_path.exists() or int(size) > 10 * 1024 * 1024:
continue
file_kind = 'pdf' if path.endswith('.pdf') else 'code' if any(path.endswith(e) for e in ['.py', '.js', '.java', '.go']) else 'text'
parser = parsers.get(file_kind)
if not parser:
continue
result = parser.parse(full_path)
if 'error' not in result:
text = result.get('text', '')
quality = result.get('quality', 'unknown')
print(f"{path[:60]} | {file_kind} | {len(text):,} chars")
if update_db and text:
cursor.execute("UPDATE files SET extracted_text = %s, text_quality = %s WHERE path = %s", (text[:50000], quality, path))
parsed_count += 1
if parsed_count % 10 == 0:
conn.commit()
if update_db:
conn.commit()
print(f"\nParsed {parsed_count} files")
finally:
cursor.close()
conn.close()
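# enrich_files runs ContentEnricher over previously extracted text (first 5,000 chars),
# prints quality/word-count/PII/topic highlights, and stores the result in the
# files.enrichment jsonb column. The --llm-endpoint and --local options are accepted
# but not forwarded yet: enrich() is called with use_llm=False.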
def enrich_files(self, limit: int = 10, llm_endpoint: Optional[str] = None, use_local: bool = False):
from enrichment.enricher import ContentEnricher
enricher = ContentEnricher()
conn = self.get_connection()
cursor = conn.cursor()
try:
cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL LIMIT {limit}")
files = cursor.fetchall()
print(f"\n=== ENRICHING CONTENT ===\nProcessing {len(files)} files\n")
for path, text in files:
enrichment = enricher.enrich(text[:5000], use_llm=False)
print(f"{path[:60]}")
print(f" Quality: {enrichment.get('quality')} | Words: {enrichment.get('word_count'):,}")
print(f" PII: {list(enrichment.get('has_pii', {}).keys())}")
print(f" Topics: {', '.join(enrichment.get('topics', [])[:5])}\n")
cursor.execute("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", (json.dumps(enrichment), path))
conn.commit()
print(f"Enriched {len(files)} files")
finally:
cursor.close()
conn.close()
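# classify_files labels every indexed file with FileClassifier and, with --update,
# writes categories back in batches of 1000 while recording a resume checkpoint in
# processing_checkpoints after each batch. With --no-resume the checkpoint is ignored
# and classification starts from the beginning. Only `category` is persisted; the
# label list is computed but not stored in this version.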
def classify_files(self, disk: Optional[str]=None, update_db: bool=False, resume: bool=True):
from classification.classifier import FileClassifier
classifier = FileClassifier()
conn = self.get_connection()
cursor = conn.cursor()
try:
task_name = f"classify_{disk or 'all'}"
skip_count = 0
if resume and update_db:
cursor.execute('SELECT last_processed_path, processed_count FROM processing_checkpoints WHERE task_name = %s', (task_name,))
checkpoint = cursor.fetchone()
if checkpoint:
last_path, skip_count = checkpoint
logger.info(f'Resuming from checkpoint: {skip_count:,} files already processed')
if disk:
cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s ORDER BY path', (disk,))
else:
cursor.execute('SELECT path, size, disk_label FROM files ORDER BY path')
files = cursor.fetchall()
total = len(files)
logger.info(f'Classifying {total:,} files...')
categories = {}
build_artifacts = 0
batch = []
processed = 0
for idx, (path, size, disk_label) in enumerate(files, 1):
if idx <= skip_count:
continue
labels, category, is_build = classifier.classify_path(path, int(size))
if is_build:
build_artifacts += 1
if category not in categories:
categories[category] = {'count': 0, 'size': 0}
categories[category]['count'] += 1
categories[category]['size'] += int(size)
if update_db:
labels_str = ','.join(labels)
batch.append((category, labels_str, path))
if len(batch) >= 1000:
cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch])
cursor.execute('''
INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at)
VALUES (%s, %s, %s, CURRENT_TIMESTAMP)
ON CONFLICT (task_name) DO UPDATE SET
last_processed_path = EXCLUDED.last_processed_path,
processed_count = EXCLUDED.processed_count,
updated_at = CURRENT_TIMESTAMP
''', (task_name, path, idx))
conn.commit()
batch.clear()
processed += 1
if idx % 1000 == 0:
print(f'\rClassified: {idx:,}/{total:,} ({100*idx/total:.1f}%)', end='', flush=True)
if update_db and batch:
cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch])
cursor.execute('''
INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at)
VALUES (%s, %s, %s, CURRENT_TIMESTAMP)
ON CONFLICT (task_name) DO UPDATE SET
last_processed_path = EXCLUDED.last_processed_path,
processed_count = EXCLUDED.processed_count,
updated_at = CURRENT_TIMESTAMP
''', (task_name, files[-1][0] if files else '', total))
conn.commit()
print()
print(f'\n=== CLASSIFICATION SUMMARY ===')
print(f'Total files: {total:,}')
print(f'Build artifacts: {build_artifacts:,}')
print(f'\nCategories:')
for category in sorted(categories.keys()):
info = categories[category]
print(f" {category:30}: {info['count']:8,} files, {self.format_size(info['size'])}")
finally:
cursor.close()
conn.close()
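# analyze_folders groups files by their top-level directory (the text before the first
# '/'), asks FolderAnalyzer for a project type/intent/summary, and upserts one row per
# folder into the `folders` table. Folders with fewer than --min-files files are
# skipped, and README detection is currently name-based only (no contents are read).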
def analyze_folders(self, disk: Optional[str]=None, min_files: int=3):
from analysis.folder_analyzer import FolderAnalyzer
analyzer = FolderAnalyzer()
conn = self.get_connection()
cursor = conn.cursor()
try:
query = '''
SELECT DISTINCT SUBSTRING(path FROM 1 FOR POSITION('/' IN path || '/') - 1) as folder, disk_label
FROM files
WHERE 1=1
'''
params = []
if disk:
query += ' AND disk_label = %s'
params.append(disk)
cursor.execute(query, params)
potential_folders = cursor.fetchall()
logger.info(f'Found {len(potential_folders)} potential folders to analyze')
processed = 0
for folder_name, disk_label in potential_folders:
cursor.execute('''
SELECT path, size FROM files
WHERE disk_label = %s AND path LIKE %s
''', (disk_label, f'{folder_name}%'))
files = cursor.fetchall()
if len(files) < min_files:
continue
files_list = [{'path': f[0], 'size': int(f[1])} for f in files]
folder_path = Path(folder_name)
analysis = analyzer.analyze_folder(folder_path, files_list)
readme_text = None
for file_dict in files_list:
if 'readme' in file_dict['path'].lower():
readme_text = f"Found README at {file_dict['path']}"
break
summary = analyzer.generate_summary(analysis, readme_text)
cursor.execute('''
INSERT INTO folders (path, disk_label, file_count, total_size, project_type, intent, summary,
has_readme, has_git, has_manifest, manifest_types, dominant_file_types, structure)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (path) DO UPDATE SET
file_count = EXCLUDED.file_count,
total_size = EXCLUDED.total_size,
project_type = EXCLUDED.project_type,
intent = EXCLUDED.intent,
summary = EXCLUDED.summary,
has_readme = EXCLUDED.has_readme,
has_git = EXCLUDED.has_git,
has_manifest = EXCLUDED.has_manifest,
manifest_types = EXCLUDED.manifest_types,
dominant_file_types = EXCLUDED.dominant_file_types,
structure = EXCLUDED.structure,
updated_at = CURRENT_TIMESTAMP
''', (
str(folder_path), disk_label, len(files_list), sum(f['size'] for f in files_list),
analysis.get('project_type'), analysis.get('intent'), summary,
analysis.get('has_readme'), analysis.get('has_git'), analysis.get('has_manifest'),
analysis.get('manifest_types'), json.dumps(analysis.get('dominant_file_types', {})),
json.dumps(analysis.get('structure', {}))
))
processed += 1
if processed % 100 == 0:
conn.commit()
print(f'\rAnalyzed: {processed} folders', end='', flush=True)
conn.commit()
print()
logger.info(f'Completed folder analysis: {processed} folders')
cursor.execute('''
SELECT project_type, COUNT(*), SUM(file_count), SUM(total_size)
FROM folders
GROUP BY project_type
''')
print(f'\n=== FOLDER ANALYSIS SUMMARY ===')
for row in cursor.fetchall():
proj_type, count, files, size = row
print(f'{proj_type:20}: {count:6,} folders, {files:8,} files, {self.format_size(int(size or 0))}')
finally:
cursor.close()
conn.close()
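# review_migration previews where the classifier would put files: it re-runs
# classify_path on up to 100 rows (optionally filtered by category, with build
# artifacts hidden by default) and prints the suggested target path next to each
# current path.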
def review_migration(self, category: Optional[str]=None, show_build: bool=False):
from classification.classifier import FileClassifier
classifier = FileClassifier()
conn = self.get_connection()
cursor = conn.cursor()
try:
query = 'SELECT path, size, category FROM files WHERE 1=1'
params = []
if category:
query += ' AND category = %s'
params.append(category)
if not show_build:
query += " AND (metadata->>'labels' IS NULL OR metadata->>'labels' NOT LIKE '%build-artifact%')"
query += ' ORDER BY category, size DESC LIMIT 100'
cursor.execute(query, params)
files = cursor.fetchall()
if not files:
print('No files found matching criteria')
return
print(f'\n=== MIGRATION PREVIEW ===')
print(f'Showing {len(files)} files\n')
current_category = None
for path, size, cat in files:
if cat != current_category:
current_category = cat
print(f'\n{cat}:')
labels, suggested_cat, is_build = classifier.classify_path(path, int(size))
target = classifier.suggest_target_path(path, suggested_cat, labels)
print(f' {path}')
print(f'{target} ({self.format_size(int(size))})')
finally:
cursor.close()
conn.close()
@staticmethod
def format_size(size: int) -> str:
for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
if size < 1024:
return f'{size:.1f}{unit}'
size /= 1024
return f'{size:.1f}PB'
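# Typical invocations (labels and paths are illustrative; adjust to your own mounts,
# and replace the plan filename placeholder with the one the tool actually writes):
#   python app/main.py index /media/mike/SMT SMT
#   python app/main.py dedupe --disk SMT
#   python app/main.py merge --sources SMT DISK1 --target LLM --filter-system
#   python app/main.py plan SMT DISK1 LLM
#   python app/main.py execute migration_plan_SMT_YYYYMMDD_HHMMSS.json --dry-run
#   python app/main.py classify --update
#   python app/main.py report --show-duplicates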
def main():
parser = argparse.ArgumentParser(description='Disk Reorganizer - Free up a disk for Linux dual-boot')
subparsers = parser.add_subparsers(dest='command', required=True)
index_parser = subparsers.add_parser('index', help='Index files on a disk')
index_parser.add_argument('disk_root', help='Root path of disk (e.g., D:\\\\)')
index_parser.add_argument('disk_name', help='Logical name for the disk')
plan_parser = subparsers.add_parser('plan', help='Create migration plan')
plan_parser.add_argument('target_disk', help='Disk to free up')
plan_parser.add_argument('dest_disks', nargs='+', help='Destination disks')
exec_parser = subparsers.add_parser('execute', help='Execute migration plan')
exec_parser.add_argument('plan_file', help='Path to plan JSON file')
exec_parser.add_argument('--dry-run', action='store_true', help='Simulate without actual file operations')
dedupe_parser = subparsers.add_parser('dedupe', help='Deduplicate files and compute checksums')
dedupe_parser.add_argument('--disk', help='Optional: Only dedupe specific disk')
dedupe_parser.add_argument('--no-chunks', action='store_true', help='Disable chunk-level deduplication')
merge_parser = subparsers.add_parser('merge', help='Plan multi-disk merge with deduplication')
merge_parser.add_argument('--sources', nargs='+', required=True, help='Source disks to merge')
merge_parser.add_argument('--target', required=True, help='Target disk')
merge_parser.add_argument('--output', default='merge_plan.json', help='Output plan file')
merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')
profile_parser = subparsers.add_parser('profile', help='Create content profiles (inventory + triage)')
profile_parser.add_argument('--disk', help='Profile specific disk')
profile_parser.add_argument('--update', action='store_true', help='Update database with profiles')
profile_parser.add_argument('--limit', type=int, help='Limit number of files')
extract_parser = subparsers.add_parser('extract', help='Extract content from files')
extract_parser.add_argument('--kind', help='Extract specific kind (pdf, image, audio, video)')
extract_parser.add_argument('--limit', type=int, default=10, help='Limit extraction batch')
parse_parser = subparsers.add_parser('parse', help='Parse files to extract text')
parse_parser.add_argument('--kind', help='Parse specific kind (text, code, pdf)')
parse_parser.add_argument('--limit', type=int, default=100, help='Limit parse batch')
parse_parser.add_argument('--update', action='store_true', help='Save extracted text to database')
enrich_parser = subparsers.add_parser('enrich', help='Enrich content with LLM analysis')
enrich_parser.add_argument('--limit', type=int, default=10, help='Limit enrichment batch')
enrich_parser.add_argument('--llm-endpoint', default='http://192.168.1.74:1234', help='LLM endpoint')
enrich_parser.add_argument('--local', action='store_true', help='Use local Ollama')
classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization')
classify_parser.add_argument('--disk', help='Classify specific disk')
classify_parser.add_argument('--update', action='store_true', help='Update database with classifications')
classify_parser.add_argument('--no-resume', action='store_true', help='Start from scratch instead of resuming')
folders_parser = subparsers.add_parser('analyze-folders', help='Analyze folder structure and infer project intent')
folders_parser.add_argument('--disk', help='Analyze specific disk')
folders_parser.add_argument('--min-files', type=int, default=3, help='Minimum files per folder')
review_parser = subparsers.add_parser('review', help='Review proposed migration structure')
review_parser.add_argument('--category', help='Review specific category')
review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts')
report_parser = subparsers.add_parser('report', help='Show current status')
report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
report_parser.add_argument('--show-duplicates', action='store_true', help='Show duplicate files')
report_parser.add_argument('--preview-merge', help='Preview merge plan from file')
args = parser.parse_args()
tool = DiskReorganizer()
if args.command == 'index':
tool.index_disk(args.disk_root, args.disk_name)
elif args.command == 'dedupe':
tool.run_deduplication(disk=args.disk, use_chunks=not args.no_chunks)
elif args.command == 'merge':
tool.plan_merge(sources=args.sources, target=args.target, output_file=args.output, filter_system=args.filter_system, network_target=args.network)
elif args.command == 'plan':
plan = tool.plan_migration(args.target_disk, args.dest_disks)
if plan:
print(f"\nPlan generated: {plan['file_count']} files, {tool.format_size(plan['total_size'])}")
print(f"Destination disks: {', '.join(plan['destination_disks'])}")
elif args.command == 'execute':
tool.execute_migration(args.plan_file, dry_run=args.dry_run)
elif args.command == 'profile':
tool.profile_content(disk=args.disk, update_db=args.update, limit=args.limit)
elif args.command == 'extract':
tool.extract_content(kind=args.kind, limit=args.limit)
elif args.command == 'parse':
tool.parse_files(kind=args.kind, limit=args.limit, update_db=args.update)
elif args.command == 'enrich':
tool.enrich_files(limit=args.limit, llm_endpoint=args.llm_endpoint, use_local=args.local)
elif args.command == 'classify':
tool.classify_files(disk=args.disk, update_db=args.update, resume=not args.no_resume)
elif args.command == 'analyze-folders':
tool.analyze_folders(disk=args.disk, min_files=args.min_files)
elif args.command == 'review':
tool.review_migration(category=args.category, show_build=args.show_build)
elif args.command == 'report':
tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)
if __name__ == '__main__':
main()