#!/usr/bin/env python3
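"""Disk Reorganizer - index, deduplicate, and migrate files across disks.

All state lives in a PostgreSQL database (tables `files` and `operations`,
created by setup_database.sh). Example invocations (disk names and paths are
illustrative):

    python disk_reorganizer.py index /media/mike/DISK1 DISK1
    python disk_reorganizer.py dedupe --disk DISK1
    python disk_reorganizer.py plan DISK1 SMT LLM
    python disk_reorganizer.py execute migration_plan_DISK1_20240101_120000.json --dry-run
    python disk_reorganizer.py report --show-duplicates
"""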
import argparse
import hashlib
import json
import logging
import os
import shutil
import sys
import time
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional

import psycopg2

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('disk_reorganizer.log'), logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)

@dataclass
class FileRecord:
    """Represents a file in the index"""
    path: str
    size: int
    modified_time: float
    disk_label: str
    checksum: Optional[str] = None
    status: str = 'indexed'  # indexed, planned, moved, verified

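# Illustrative only: FileRecord is not instantiated in this module; it documents
# the shape of one row of the `files` table, e.g.
#   FileRecord(path='docs/report.pdf', size=1048576,
#              modified_time=1700000000.0, disk_label='DISK1')
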
class DiskReorganizer:
    def __init__(self, db_config: Dict = None):
        """
        Initialize DiskReorganizer with PostgreSQL connection
        :param db_config: Database configuration dict with host, port, database, user, password
        """
        if db_config is None:
            db_config = {
                'host': os.getenv('DB_HOST', '192.168.1.159'),
                'port': int(os.getenv('DB_PORT', 5432)),
                'database': os.getenv('DB_NAME', 'disk_reorganizer_db'),
                'user': os.getenv('DB_USER', 'disk_reorg_user'),
                'password': os.getenv('DB_PASSWORD', 'heel-goed-wachtwoord')
            }
        self.db_config = db_config
        self.init_database()

    def get_connection(self):
        """Get PostgreSQL database connection"""
        return psycopg2.connect(**self.db_config)

    def init_database(self):
        """Verify PostgreSQL database connection and tables exist"""
        try:
            conn = self.get_connection()
            cursor = conn.cursor()

            # Test connection and verify tables exist
            cursor.execute("""
                SELECT table_name FROM information_schema.tables
                WHERE table_schema = 'public' AND table_name IN ('files', 'operations')
            """)
            tables = cursor.fetchall()

            if len(tables) < 2:
                logger.error("Database tables not found! Please run setup_database.sh first.")
                raise Exception("Database not properly initialized. Run setup_database.sh")

            cursor.close()
            conn.close()
            logger.info("Database connection verified successfully")
        except psycopg2.Error as e:
            logger.error(f"Database connection failed: {e}")
            raise

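    # The authoritative schema is created by setup_database.sh. For reference,
    # a minimal sketch of the tables this class assumes (column names inferred
    # from the queries in this module; types are illustrative):
    #
    #   CREATE TABLE files (
    #       path TEXT PRIMARY KEY,       -- relative path; ON CONFLICT target
    #       size BIGINT,
    #       modified_time TIMESTAMP,
    #       disk_label TEXT,
    #       checksum TEXT,
    #       status TEXT,
    #       duplicate_of TEXT,
    #       category TEXT,
    #       metadata JSONB
    #   );
    #   CREATE TABLE operations (
    #       source_path TEXT,
    #       target_path TEXT,
    #       operation_type TEXT,
    #       status TEXT,
    #       executed INTEGER DEFAULT 0,
    #       executed_at TIMESTAMP,
    #       verified BOOLEAN,
    #       error TEXT
    #   );
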
    def index_disk(self, disk_root: str, disk_name: str):
        """
        Index all files on a disk/partition with dynamic progress display
        :param disk_root: Root path of disk (e.g., 'D:\\')
        :param disk_name: Logical name for the disk
        """
        logger.info(f"Indexing disk: {disk_name} at {disk_root}")
        disk_path = Path(disk_root)

        if not disk_path.exists():
            logger.error(f"Disk path {disk_root} does not exist!")
            return

        files_count = 0
        total_size = 0
        start_time = time.time()

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            # Walk through all files
            for root, dirs, files in os.walk(disk_path):
                # Skip system directories
                dirs[:] = [d for d in dirs if not d.startswith(('$', 'System Volume Information', 'Recovery'))]

                for file in files:
                    try:
                        file_path = Path(root) / file
                        if not file_path.is_file():
                            continue

                        stat = file_path.stat()
                        size = stat.st_size
                        mtime = datetime.fromtimestamp(stat.st_mtime)

                        # Calculate relative path for portability
                        rel_path = str(file_path.relative_to(disk_path))

                        # PostgreSQL INSERT ... ON CONFLICT for upsert
                        cursor.execute("""
                            INSERT INTO files (path, size, modified_time, disk_label, checksum, status)
                            VALUES (%s, %s, %s, %s, %s, %s)
                            ON CONFLICT (path) DO UPDATE SET
                                size = EXCLUDED.size,
                                modified_time = EXCLUDED.modified_time,
                                disk_label = EXCLUDED.disk_label,
                                status = EXCLUDED.status
                        """, (rel_path, size, mtime, disk_name, None, 'indexed'))

                        files_count += 1
                        total_size += size

                        # Dynamic progress display - update every 100 files
                        if files_count % 100 == 0:
                            elapsed = time.time() - start_time
                            rate = files_count / elapsed if elapsed > 0 else 0
                            # Truncate path for display
                            display_path = str(file_path)
                            if len(display_path) > 60:
                                display_path = '...' + display_path[-57:]

                            # Use \r to overwrite the line
                            print(f"\rIndexing: {files_count:,} files | {self.format_size(total_size)} | {rate:.0f} files/s | {display_path}", end='', flush=True)

                        # Commit every 1000 files for performance
                        if files_count % 1000 == 0:
                            conn.commit()

                    except Exception as e:
                        conn.rollback()
                        logger.warning(f"\nSkipping {file_path}: {e}")
                        continue

            conn.commit()
            print()  # New line after progress display
            logger.info(f"Completed indexing {disk_name}: {files_count} files, {self.format_size(total_size)}")

        finally:
            cursor.close()
            conn.close()

    def calculate_disk_usage(self) -> Dict[str, Dict]:
        """Calculate current usage per disk"""
        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            cursor.execute("""
                SELECT disk_label, SUM(size) as total_size, COUNT(*) as file_count
                FROM files
                GROUP BY disk_label
            """)

            usage = {}
            for row in cursor.fetchall():
                disk = row[0]
                size = int(row[1] or 0)
                count = int(row[2])
                usage[disk] = {
                    'size': size,
                    'count': count,
                    'formatted_size': self.format_size(size)
                }

            return usage
        finally:
            cursor.close()
            conn.close()

    def plan_migration(self, target_disk: str, destination_disks: List[str]) -> Dict:
        """
        Create a migration plan to free up target_disk
        :param target_disk: Disk to free up (e.g., 'D:')
        :param destination_disks: List of disks to move files to
        :return: Migration plan dictionary
        """
        logger.info(f"Planning migration to free up {target_disk}")

        usage = self.calculate_disk_usage()

        if target_disk not in usage:
            logger.error(f"Target disk {target_disk} not found in index!")
            return {}

        # Get files on target disk
        conn = self.get_connection()
        cursor = conn.cursor()

        cursor.execute(
            "SELECT path, size, modified_time FROM files WHERE disk_label = %s ORDER BY size DESC",
            (target_disk,)
        )
        files_to_move = cursor.fetchall()
        cursor.close()
        conn.close()

        target_disk_usage = usage[target_disk]['size']
        logger.info(f"Need to move {len(files_to_move)} files, {self.format_size(target_disk_usage)}")

        # Calculate available space on destination disks
        dest_availability = []
        for disk in destination_disks:
            if disk not in usage:
                # Assume empty disk
                available = float('inf')
            else:
                # In real scenario, query actual disk free space
                available = float('inf')  # Placeholder

            dest_availability.append({
                'disk': disk,
                'available': available,
                'planned_usage': 0
            })

        # Generate move plan
        plan = {
            'target_disk': target_disk,
            'total_size': target_disk_usage,
            'file_count': len(files_to_move),
            'operations': [],
            'destination_disks': destination_disks
        }

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            for file_info in files_to_move:
                rel_path, size, mtime = file_info

                # Find best destination (simple round-robin for balance)
                dest_disk = destination_disks[len(plan['operations']) % len(destination_disks)]

                # Record operation
                op = {
                    'source_disk': target_disk,
                    'source_path': rel_path,
                    'dest_disk': dest_disk,
                    'target_path': rel_path,  # Keep same relative path
                    'size': int(size)
                }
                plan['operations'].append(op)

                # Store in database
                cursor.execute(
                    "INSERT INTO operations (source_path, target_path, operation_type, status) VALUES (%s, %s, %s, %s)",
                    (f"{target_disk}:{rel_path}", f"{dest_disk}:{rel_path}", 'move', 'pending')
                )

            conn.commit()
        finally:
            cursor.close()
            conn.close()

        # Save plan to JSON
        plan_file = f"migration_plan_{target_disk}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(plan_file, 'w') as f:
            json.dump(plan, f, indent=2)

        logger.info(f"Plan created with {len(plan['operations'])} operations")
        logger.info(f"Plan saved to {plan_file}")

        return plan

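    # For reference, the plan JSON written above (and read back by
    # execute_migration) has this shape (values illustrative):
    #
    #   {
    #     "target_disk": "DISK1",
    #     "total_size": 123456789,
    #     "file_count": 2,
    #     "destination_disks": ["SMT", "LLM"],
    #     "operations": [
    #       {"source_disk": "DISK1", "source_path": "docs/a.pdf",
    #        "dest_disk": "SMT", "target_path": "docs/a.pdf", "size": 1048576}
    #     ]
    #   }
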
    def verify_operation(self, source: Path, dest: Path) -> bool:
        """Verify file was copied correctly (size + optional checksum)"""
        if not dest.exists():
            return False

        try:
            source_stat = source.stat()
            dest_stat = dest.stat()

            if source_stat.st_size != dest_stat.st_size:
                return False

            # Optional: checksum verification for critical files
            # if source_stat.st_size < 100*1024*1024:  # Only for files < 100MB
            #     return self.file_checksum(source) == self.file_checksum(dest)

            return True
        except Exception as e:
            logger.error(f"Verification error: {e}")
            return False

    @staticmethod
    def file_checksum(path: Path) -> str:
        """Calculate MD5 checksum of file"""
        hash_md5 = hashlib.md5()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def execute_migration(self, plan_file: str, dry_run: bool = True):
        """
        Execute migration plan
        :param plan_file: Path to plan JSON file
        :param dry_run: If True, only simulate operations
        """
        logger.info(f"{'DRY RUN' if dry_run else 'EXECUTING'} migration from {plan_file}")

        with open(plan_file, 'r') as f:
            plan = json.load(f)

        operations = plan['operations']
        logger.info(f"Processing {len(operations)} operations...")

        success_count = 0
        error_count = 0
        start_time = time.time()

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            for i, op in enumerate(operations, 1):
                source_disk = op['source_disk']
                source_path = op['source_path']
                dest_disk = op['dest_disk']
                target_path = op['target_path']

                source_full = Path(source_disk) / source_path
                dest_full = Path(dest_disk) / target_path

                # Dynamic progress display
                elapsed = time.time() - start_time
                rate = i / elapsed if elapsed > 0 else 0
                eta = (len(operations) - i) / rate if rate > 0 else 0
                display_path = str(source_path)
                if len(display_path) > 50:
                    display_path = '...' + display_path[-47:]

                print(f"\r[{i}/{len(operations)}] {success_count} OK, {error_count} ERR | {rate:.1f} files/s | ETA: {int(eta)}s | {display_path}", end='', flush=True)

                if dry_run:
                    # Simulate
                    if source_full.exists():
                        success_count += 1
                    else:
                        logger.warning(f"\n Source does not exist: {source_full}")
                        error_count += 1
                    continue

                try:
                    # Create destination directory
                    dest_full.parent.mkdir(parents=True, exist_ok=True)

                    # Move file (copy + verify + delete)
                    if source_full.exists():
                        # Copy with metadata
                        shutil.copy2(source_full, dest_full)

                        # Verify
                        if self.verify_operation(source_full, dest_full):
                            # Update database
                            cursor.execute(
                                "UPDATE files SET disk_label = %s, status = 'moved' WHERE path = %s AND disk_label = %s",
                                (dest_disk, source_path, source_disk)
                            )

                            # Safe delete (could be made optional)
                            # source_full.unlink()

                            # Log operation as executed
                            cursor.execute(
                                "UPDATE operations SET executed = 1, executed_at = CURRENT_TIMESTAMP WHERE source_path = %s",
                                (f"{source_disk}:{source_path}",)
                            )

                            success_count += 1
                        else:
                            raise Exception("Verification failed")
                    else:
                        logger.warning(f"\n Source missing: {source_full}")
                        error_count += 1

                except Exception as e:
                    logger.error(f"\n Error processing {source_path}: {e}")
                    cursor.execute(
                        "UPDATE operations SET error = %s WHERE source_path = %s",
                        (str(e), f"{source_disk}:{source_path}")
                    )
                    error_count += 1

                # Commit every 10 operations
                if i % 10 == 0:
                    conn.commit()

            conn.commit()
            print()  # New line after progress display

        finally:
            cursor.close()
            conn.close()

        logger.info(f"Migration complete: {success_count} success, {error_count} errors")

        if not dry_run and error_count == 0:
            logger.info(f"✓ Disk {plan['target_disk']} is ready for Linux installation!")
            logger.info(f" Remember to safely delete original files from {plan['target_disk']}")

    def run_deduplication(self, disk: Optional[str] = None, use_chunks: bool = True):
        """Compute missing checksums (Phase 1) and mark duplicate files (Phase 2).

        :param disk: Optionally restrict hashing to a single disk label
        :param use_chunks: Reserved for chunk-level deduplication (currently unused)
        """
        logger.info(f"Starting deduplication{' for disk ' + disk if disk else ''}")

        disk_mount_map = {
            'SMT': '/media/mike/SMT',
            'DISK1': '/media/mike/DISK1',
            'LLM': '/media/mike/LLM'
        }

        conn = self.get_connection()
        cursor = conn.cursor()

        def hash_file_local(file_path: Path) -> str:
            hasher = hashlib.sha256()
            with open(file_path, 'rb') as f:
                while chunk := f.read(65536):
                    hasher.update(chunk)
            return hasher.hexdigest()

        try:
            if disk:
                cursor.execute("SELECT path, size, disk_label FROM files WHERE disk_label = %s AND checksum IS NULL ORDER BY size DESC", (disk,))
            else:
                cursor.execute("SELECT path, size, disk_label FROM files WHERE checksum IS NULL ORDER BY size DESC")

            files_to_process = cursor.fetchall()
            total = len(files_to_process)
            logger.info(f"Found {total} files to hash")

            processed = 0
            skipped = 0
            start_time = time.time()
            batch = []

            print("Phase 1: Computing checksums...")

            for idx, (path_str, size, disk_label) in enumerate(files_to_process, 1):
                try:
                    mount_point = disk_mount_map.get(disk_label, disk_label)
                    full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)

                    if not full_path.exists():
                        skipped += 1
                        if idx % 100 == 0:
                            elapsed = time.time() - start_time
                            rate = (processed + skipped) / elapsed if elapsed > 0 else 0
                            remaining = (total - idx) / rate if rate > 0 else 0
                            pct = 100 * idx / total
                            print(f"\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining/60)}m{int(remaining%60):02d}s | Skip: {skipped:,}", end='', flush=True)
                        continue

                    checksum = hash_file_local(full_path)
                    batch.append((checksum, path_str))

                    processed += 1
                    if len(batch) >= 1000:
                        try:
                            cursor.executemany("UPDATE files SET checksum = %s WHERE path = %s", batch)
                            conn.commit()
                            batch.clear()
                        except Exception as e:
                            conn.rollback()
                            batch.clear()
                            print(f"\nBatch update failed: {e}")

                    if idx % 100 == 0:
                        elapsed = time.time() - start_time
                        rate = (processed + skipped) / elapsed if elapsed > 0 else 0
                        remaining = (total - idx) / rate if rate > 0 else 0
                        pct = 100 * idx / total
                        print(f"\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining/60)}m{int(remaining%60):02d}s | Skip: {skipped:,}", end='', flush=True)

                except Exception as e:
                    skipped += 1
                    if idx <= 5:
                        print(f"\nDebug: {full_path} - {e}")

            if batch:
                try:
                    cursor.executemany("UPDATE files SET checksum = %s WHERE path = %s", batch)
                    conn.commit()
                except Exception as e:
                    conn.rollback()
                    print(f"\nFinal batch failed: {e}")

            print()
            elapsed = time.time() - start_time
            logger.info(f"Phase 1 done: {processed:,} files in {int(elapsed/60)}m{int(elapsed%60):02d}s ({skipped:,} skipped)")

            print("Phase 2: Finding duplicates...")
            # Each file whose checksum matches a lexicographically smaller path
            # is marked as a duplicate of the smallest such path.
            cursor.execute("""
                UPDATE files f1 SET duplicate_of = (
                    SELECT MIN(path) FROM files f2
                    WHERE f2.checksum = f1.checksum AND f2.path < f1.path
                )
                WHERE checksum IS NOT NULL
            """)
            conn.commit()

            cursor.execute("SELECT COUNT(*) FROM files WHERE duplicate_of IS NOT NULL")
            dup_count = cursor.fetchone()[0]
            logger.info(f"Phase 2 done: Found {dup_count:,} duplicates")

        finally:
            cursor.close()
            conn.close()

    def plan_merge(self, sources: List[str], target: str, output_file: str,
                   filter_system: bool = False, network_target: str = None):
        """Plan merge of multiple source disks to target with deduplication"""
        logger.info(f"Planning merge: {', '.join(sources)} → {target or network_target}")

        if filter_system:
            sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            from filters import GitignoreFilter
            file_filter = GitignoreFilter()
            logger.info("System/build file filtering enabled")

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            placeholders = ','.join(['%s'] * len(sources))
            cursor.execute(f"""
                SELECT path, size, checksum, disk_label, duplicate_of
                FROM files
                WHERE disk_label IN ({placeholders})
                ORDER BY size DESC
            """, tuple(sources))

            files = cursor.fetchall()
            total_files = len(files)
            total_size = sum(int(f[1]) for f in files)

            unique_files = {}
            duplicate_count = 0
            duplicate_size = 0
            filtered_count = 0
            filtered_size = 0

            for path, size, checksum, disk_label, duplicate_of in files:
                if filter_system and file_filter.should_exclude(path):
                    filtered_count += 1
                    filtered_size += int(size)
                    continue

                if checksum and checksum in unique_files:
                    duplicate_count += 1
                    duplicate_size += int(size)
                else:
                    # Files without a checksum are skipped entirely (run dedupe first)
                    if checksum:
                        unique_files[checksum] = (path, int(size), disk_label)

            unique_count = len(unique_files)
            unique_size = sum(f[1] for f in unique_files.values())

            plan = {
                'sources': sources,
                'target': target or network_target,
                'network': network_target is not None,
                'total_files': total_files,
                'total_size': total_size,
                'unique_files': unique_count,
                'unique_size': unique_size,
                'duplicate_files': duplicate_count,
                'duplicate_size': duplicate_size,
                'filtered_files': filtered_count if filter_system else 0,
                'filtered_size': filtered_size if filter_system else 0,
                'space_saved': duplicate_size + (filtered_size if filter_system else 0),
                'operations': []
            }

            for checksum, (path, size, disk_label) in unique_files.items():
                plan['operations'].append({
                    'source_disk': disk_label,
                    'source_path': path,
                    'target_disk': target or network_target,
                    'target_path': path,
                    'size': size,
                    'checksum': checksum
                })

            with open(output_file, 'w') as f:
                json.dump(plan, f, indent=2)

            logger.info(f"Merge plan saved to {output_file}")
            print("\n=== MERGE PLAN SUMMARY ===")
            print(f"Sources: {', '.join(sources)}")
            print(f"Target: {target or network_target}")
            print(f"Total files: {total_files:,} ({self.format_size(total_size)})")
            if filter_system:
                print(f"Filtered (system/build): {filtered_count:,} ({self.format_size(filtered_size)})")
            print(f"Unique files: {unique_count:,} ({self.format_size(unique_size)})")
            print(f"Duplicates: {duplicate_count:,} ({self.format_size(duplicate_size)})")
            print(f"Total space saved: {self.format_size(plan['space_saved'])}")
            print(f"Space needed on target: {self.format_size(unique_size)}")

        finally:
            cursor.close()
            conn.close()

    def generate_report(self, format='text', show_duplicates=False, preview_merge=None):
        """Generate status report (the format argument is accepted but currently unused)."""
        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            if preview_merge:
                # Load and display merge plan
                with open(preview_merge, 'r') as f:
                    plan = json.load(f)

                print("\n=== MERGE PLAN PREVIEW ===")
                print(f"Sources: {', '.join(plan['sources'])}")
                print(f"Target: {plan['target']}")
                print(f"Total files: {plan['total_files']:,} ({self.format_size(plan['total_size'])})")
                print(f"Unique files: {plan['unique_files']:,} ({self.format_size(plan['unique_size'])})")
                print(f"Duplicates: {plan['duplicate_files']:,} ({self.format_size(plan['duplicate_size'])})")
                print(f"Space saved: {self.format_size(plan['space_saved'])}")
                print(f"Space needed on target: {self.format_size(plan['unique_size'])}")
                return

            cursor.execute("""
                SELECT status, COUNT(*), SUM(size) FROM files GROUP BY status
            """)

            print("\n=== FILE MIGRATION REPORT ===")
            for row in cursor.fetchall():
                status, count, size = row
                print(f"{status:15}: {count:6} files, {self.format_size(int(size or 0))}")

            # Disk usage summary
            cursor.execute("""
                SELECT disk_label, COUNT(*), SUM(size) FROM files GROUP BY disk_label
            """)

            print("\n=== DISK USAGE ===")
            for row in cursor.fetchall():
                disk, count, size = row
                print(f"{disk:20}: {count:6} files, {self.format_size(int(size or 0))}")

            # Deduplication stats
            cursor.execute("""
                SELECT COUNT(*), SUM(size) FROM files WHERE checksum IS NOT NULL
            """)
            hashed_count, hashed_size = cursor.fetchone()

            cursor.execute("""
                SELECT COUNT(*), SUM(size) FROM files WHERE duplicate_of IS NOT NULL
            """)
            dup_count, dup_size = cursor.fetchone()

            print("\n=== DEDUPLICATION STATS ===")
            print(f"Files with checksums: {hashed_count or 0:6}")
            print(f"Duplicate files: {dup_count or 0:6} ({self.format_size(int(dup_size or 0))})")

            if show_duplicates and dup_count:
                print("\n=== DUPLICATE FILES ===")
                cursor.execute("""
                    SELECT path, size, duplicate_of FROM files
                    WHERE duplicate_of IS NOT NULL
                    ORDER BY size DESC
                    LIMIT 20
                """)
                for path, size, dup_of in cursor.fetchall():
                    print(f"  {path} ({self.format_size(int(size))}) → {dup_of}")

            cursor.execute("""
                SELECT operation_type, executed, verified, COUNT(*) FROM operations GROUP BY operation_type, executed, verified
            """)

            print("\n=== OPERATIONS REPORT ===")
            for row in cursor.fetchall():
                op_type, executed, verified, count = row
                status = "EXECUTED" if executed else "PENDING"
                if verified:
                    status += "+VERIFIED"
                print(f"{op_type:10} {status:15}: {count} operations")

        finally:
            cursor.close()
            conn.close()

    def profile_content(self, disk: Optional[str] = None, update_db: bool = False, limit: Optional[int] = None):
        """Profile file contents (kind, processability) and optionally store profiles in metadata."""
        from content.profiler import ContentProfiler

        profiler = ContentProfiler()
        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            query = "SELECT path, size, disk_label FROM files WHERE 1=1"
            params = []
            if disk:
                query += " AND disk_label = %s"
                params.append(disk)
            if limit:
                query += f" LIMIT {limit}"

            cursor.execute(query, params)
            files = cursor.fetchall()
            total = len(files)
            logger.info(f"Profiling {total:,} files...")

            kind_stats = {}
            processable = 0
            batch = []

            for idx, (path, size, disk_label) in enumerate(files, 1):
                mount_point = disk_mount_map.get(disk_label, disk_label)
                full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)

                if not full_path.exists():
                    continue

                profile = profiler.profile_file(full_path)

                if 'error' not in profile:
                    kind = profile['kind']
                    if kind not in kind_stats:
                        kind_stats[kind] = {'count': 0, 'processable': 0}
                    kind_stats[kind]['count'] += 1
                    if profile['processable']:
                        kind_stats[kind]['processable'] += 1
                        processable += 1

                    if update_db:
                        profile_json = json.dumps(profile)
                        batch.append((kind, profile_json, path))

                        if len(batch) >= 500:
                            cursor.executemany(
                                "UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s",
                                [(pj, p) for k, pj, p in batch]
                            )
                            conn.commit()
                            batch.clear()

                if idx % 100 == 0:
                    print(f"\rProfiled: {idx:,}/{total:,}", end='', flush=True)

            if update_db and batch:
                cursor.executemany(
                    "UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s",
                    [(pj, p) for k, pj, p in batch]
                )
                conn.commit()

            print()
            print("\n=== CONTENT PROFILE SUMMARY ===")
            print(f"Total files: {total:,}")
            print(f"Processable: {processable:,}\n")
            print(f"{'Kind':<15} {'Total':<10} {'Processable':<12} {'Extractor'}")
            print("-" * 60)
            for kind in sorted(kind_stats.keys()):
                stats = kind_stats[kind]
                extractor = profiler._suggest_extractor(kind, '')
                print(f"{kind:<15} {stats['count']:<10,} {stats['processable']:<12,} {extractor or 'none'}")

        finally:
            cursor.close()
            conn.close()

    def extract_content(self, kind: Optional[str] = None, limit: int = 10):
        """Extract content from files previously marked processable by the profiler."""
        from content.profiler import ContentProfiler
        from content.extractors import ContentExtractor

        profiler = ContentProfiler()
        extractor = ContentExtractor()
        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            query = "SELECT path, size, disk_label, metadata FROM files WHERE metadata->'profile'->>'processable' = 'true'"
            params = []
            if kind:
                query += " AND metadata->'profile'->>'kind' = %s"
                params.append(kind)
            query += f" LIMIT {limit}"

            cursor.execute(query, params)
            files = cursor.fetchall()

            print("\n=== EXTRACTING CONTENT ===")
            print(f"Processing {len(files)} files\n")

            for path, size, disk_label, metadata in files:
                mount_point = disk_mount_map.get(disk_label, disk_label)
                full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)

                if not full_path.exists():
                    continue

                profile = metadata.get('profile', {}) if metadata else {}
                extractor_type = profile.get('extractor')

                if not extractor_type:
                    continue

                print(f"Extracting: {path}")
                print(f" Type: {profile.get('kind')} | Extractor: {extractor_type}")

                result = extractor.extract(full_path, extractor_type)

                if 'text' in result:
                    preview = result['text'][:200]
                    print(f" Preview: {preview}...")
                elif 'pipeline' in result:
                    print(f" Pipeline: {' → '.join(result['pipeline'])}")
                    print(f" Status: {result.get('status', 'pending')}")

                print()

        finally:
            cursor.close()
            conn.close()

    def classify_files(self, disk: Optional[str] = None, update_db: bool = False):
        """Classify indexed files into categories and optionally persist the category."""
        from classification.classifier import FileClassifier

        classifier = FileClassifier()
        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            if disk:
                cursor.execute("SELECT path, size, disk_label FROM files WHERE disk_label = %s", (disk,))
            else:
                cursor.execute("SELECT path, size, disk_label FROM files")

            files = cursor.fetchall()
            total = len(files)
            logger.info(f"Classifying {total:,} files...")

            categories = {}
            build_artifacts = 0
            batch = []

            for idx, (path, size, disk_label) in enumerate(files, 1):
                labels, category, is_build = classifier.classify_path(path, int(size))

                if is_build:
                    build_artifacts += 1

                if category not in categories:
                    categories[category] = {'count': 0, 'size': 0}
                categories[category]['count'] += 1
                categories[category]['size'] += int(size)

                if update_db:
                    labels_str = ','.join(labels)
                    # Note: only the category is persisted below; labels_str is not written to the database
                    batch.append((category, labels_str, path))

                    if len(batch) >= 1000:
                        cursor.executemany("UPDATE files SET category = %s WHERE path = %s", [(cat, p) for cat, lbl, p in batch])
                        conn.commit()
                        batch.clear()

                if idx % 1000 == 0:
                    print(f"\rClassified: {idx:,}/{total:,}", end='', flush=True)

            if update_db and batch:
                cursor.executemany("UPDATE files SET category = %s WHERE path = %s", [(cat, p) for cat, lbl, p in batch])
                conn.commit()

            print()
            print("\n=== CLASSIFICATION SUMMARY ===")
            print(f"Total files: {total:,}")
            print(f"Build artifacts: {build_artifacts:,}")
            print("\nCategories:")
            for category in sorted(categories.keys()):
                info = categories[category]
                print(f"  {category:30}: {info['count']:8,} files, {self.format_size(info['size'])}")

        finally:
            cursor.close()
            conn.close()

    def review_migration(self, category: Optional[str] = None, show_build: bool = False):
        """Preview the proposed target structure for up to 100 indexed files."""
        from classification.classifier import FileClassifier

        classifier = FileClassifier()
        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            query = "SELECT path, size, category FROM files WHERE 1=1"
            params = []

            if category:
                query += " AND category = %s"
                params.append(category)

            if not show_build:
                # Literal % signs are doubled because the query is passed to
                # cursor.execute() together with a parameter list
                query += " AND (metadata->>'labels' IS NULL OR metadata->>'labels' NOT LIKE '%%build-artifact%%')"

            query += " ORDER BY category, size DESC LIMIT 100"

            cursor.execute(query, params)
            files = cursor.fetchall()

            if not files:
                print("No files found matching criteria")
                return

            print("\n=== MIGRATION PREVIEW ===")
            print(f"Showing {len(files)} files\n")

            current_category = None
            for path, size, cat in files:
                if cat != current_category:
                    current_category = cat
                    print(f"\n{cat}:")

                labels, suggested_cat, is_build = classifier.classify_path(path, int(size))
                target = classifier.suggest_target_path(path, suggested_cat, labels)
                print(f"  {path}")
                print(f"    → {target} ({self.format_size(int(size))})")

        finally:
            cursor.close()
            conn.close()

    @staticmethod
    def format_size(size: int) -> str:
        """Format a byte count as a human-readable string, e.g. 1536 -> '1.5KB'."""
        for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
            if size < 1024:
                return f"{size:.1f}{unit}"
            size /= 1024
        return f"{size:.1f}PB"

def main():
    parser = argparse.ArgumentParser(description='Disk Reorganizer - Free up a disk for Linux dual-boot')
    subparsers = parser.add_subparsers(dest='command', required=True)

    # Index command
    index_parser = subparsers.add_parser('index', help='Index files on a disk')
    index_parser.add_argument('disk_root', help='Root path of disk (e.g., D:\\\\)')
    index_parser.add_argument('disk_name', help='Logical name for the disk')

    # Plan command
    plan_parser = subparsers.add_parser('plan', help='Create migration plan')
    plan_parser.add_argument('target_disk', help='Disk to free up')
    plan_parser.add_argument('dest_disks', nargs='+', help='Destination disks')

    # Execute command
    exec_parser = subparsers.add_parser('execute', help='Execute migration plan')
    exec_parser.add_argument('plan_file', help='Path to plan JSON file')
    exec_parser.add_argument('--dry-run', action='store_true', help='Simulate without actual file operations')

    # Dedupe command
    dedupe_parser = subparsers.add_parser('dedupe', help='Deduplicate files and compute checksums')
    dedupe_parser.add_argument('--disk', help='Optional: Only dedupe specific disk')
    dedupe_parser.add_argument('--no-chunks', action='store_true', help='Disable chunk-level deduplication')

    # Merge command
    merge_parser = subparsers.add_parser('merge', help='Plan multi-disk merge with deduplication')
    merge_parser.add_argument('--sources', nargs='+', required=True, help='Source disks to merge')
    merge_parser.add_argument('--target', required=True, help='Target disk')
    merge_parser.add_argument('--output', default='merge_plan.json', help='Output plan file')
    merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
    merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')

    # Profile command
    profile_parser = subparsers.add_parser('profile', help='Create content profiles (inventory + triage)')
    profile_parser.add_argument('--disk', help='Profile specific disk')
    profile_parser.add_argument('--update', action='store_true', help='Update database with profiles')
    profile_parser.add_argument('--limit', type=int, help='Limit number of files')

    # Extract command
    extract_parser = subparsers.add_parser('extract', help='Extract content from files')
    extract_parser.add_argument('--kind', help='Extract specific kind (pdf, image, audio, video)')
    extract_parser.add_argument('--limit', type=int, default=10, help='Limit extraction batch')

    # Classify command
    classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization')
    classify_parser.add_argument('--disk', help='Classify specific disk')
    classify_parser.add_argument('--update', action='store_true', help='Update database with classifications')

    # Review command
    review_parser = subparsers.add_parser('review', help='Review proposed migration structure')
    review_parser.add_argument('--category', help='Review specific category')
    review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts')

    # Report command
    report_parser = subparsers.add_parser('report', help='Show current status')
    report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
    report_parser.add_argument('--show-duplicates', action='store_true', help='Show duplicate files')
    report_parser.add_argument('--preview-merge', help='Preview merge plan from file')

    args = parser.parse_args()
    tool = DiskReorganizer()

    if args.command == 'index':
        tool.index_disk(args.disk_root, args.disk_name)

    elif args.command == 'dedupe':
        tool.run_deduplication(disk=args.disk, use_chunks=not args.no_chunks)

    elif args.command == 'merge':
        tool.plan_merge(sources=args.sources, target=args.target, output_file=args.output,
                        filter_system=args.filter_system, network_target=args.network)

    elif args.command == 'plan':
        plan = tool.plan_migration(args.target_disk, args.dest_disks)
        if plan:
            print(f"\nPlan generated: {plan['file_count']} files, {tool.format_size(plan['total_size'])}")
            print(f"Destination disks: {', '.join(plan['destination_disks'])}")

    elif args.command == 'execute':
        tool.execute_migration(args.plan_file, dry_run=args.dry_run)

    elif args.command == 'profile':
        tool.profile_content(disk=args.disk, update_db=args.update, limit=args.limit)

    elif args.command == 'extract':
        tool.extract_content(kind=args.kind, limit=args.limit)

    elif args.command == 'classify':
        tool.classify_files(disk=args.disk, update_db=args.update)

    elif args.command == 'review':
        tool.review_migration(category=args.category, show_build=args.show_build)

    elif args.command == 'report':
        tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)


if __name__ == '__main__':
    main()