This commit is contained in:
mike
2025-12-13 11:56:06 +01:00
commit 2b2c575385
57 changed files with 6505 additions and 0 deletions

18
.aiignore Normal file

@@ -0,0 +1,18 @@
.DS_Store
*.log
*.tmp
dist/
build/
out/
.idea
node_modules/
.vscode/
.git
.github
scripts
.pytest_cache/
__pycache__
.aiignore
*.iml
.env
.bundle.md

44
.gitignore vendored Normal file

@@ -0,0 +1,44 @@
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
*.sqlite3
*.db
*.log
coverage.xml
*.coverage
.coverage
.coverage.*
.cache
nosetests.xml
pytest.xml
htmlcov/
.tox/
.pytest_cache/
.mypy_cache/
.pyre/
.idea
*.iml
backup_*.dump

340
ARCHITECTURE.md Normal file

@@ -0,0 +1,340 @@
# Data Reorganization Architecture: "Project Defrag"
## Executive Summary
This document outlines the architecture for reorganizing 20TB of backup data across multiple NVMe drives and servers. The solution implements intelligent deduplication, systematic categorization, and optimized storage patterns for enhanced performance and maintainability.
## System Architecture Overview
```mermaid
graph TB
subgraph "Source Environment"
A["Local Machine<br/>8x NVMe + 1 HDD<br/>~10TB"]
B["Server Machine<br/>Mixed Storage<br/>~10TB"]
end
subgraph "Processing Layer"
C["Discovery Engine"]
D["Classification Engine"]
E["Deduplication Engine"]
F["Migration Engine"]
end
subgraph "Target Architecture"
G["App Volumes"]
H["Gitea Repository"]
I["Build Cache (.maven, pycache)"]
J["Artifactories"]
K["Databases"]
L["Backups"]
M["LLM Model Cache"]
N["Git Infrastructure"]
end
A --> C
B --> C
C --> D
D --> E
E --> F
F --> G
F --> H
F --> I
F --> J
F --> K
F --> L
F --> M
F --> N
```
## Data Flow Architecture
### Phase 1: Discovery & Assessment
```mermaid
sequenceDiagram
participant D as Discovery Engine
participant FS as File System Scanner
participant DB as Metadata Database
participant API as System APIs
D->>FS: Scan directory structures
FS->>FS: Identify file types, sizes, dates
FS->>DB: Store file metadata
D->>API: Query system information
API->>DB: Store system context
DB->>D: Return analysis summary
```
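A minimal sketch of what this discovery pass could look like, assuming a throwaway SQLite metadata store and a plain `os.scandir` walk (table and column names here are illustrative, not the project's actual schema):
```python
import os
import sqlite3

def discover(root: str, db_path: str = "metadata.db") -> None:
    """Walk a directory tree and record basic file metadata (illustrative sketch)."""
    conn = sqlite3.connect(db_path)
    conn.execute(
        "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime REAL)"
    )
    stack = [root]
    while stack:
        current = stack.pop()
        try:
            with os.scandir(current) as entries:
                for entry in entries:
                    if entry.is_dir(follow_symlinks=False):
                        stack.append(entry.path)
                    elif entry.is_file(follow_symlinks=False):
                        st = entry.stat(follow_symlinks=False)
                        conn.execute(
                            "INSERT OR REPLACE INTO files VALUES (?, ?, ?)",
                            (entry.path, st.st_size, st.st_mtime),
                        )
        except PermissionError:
            continue  # unreadable directories are skipped, not fatal
    conn.commit()
    conn.close()
```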
### Phase 2: Classification & Deduplication
```mermaid
sequenceDiagram
participant C as Classifier
participant DH as Deduplication Hash
participant CDB as Canonical DB
participant MAP as Mapping Store
C->>C: Analyze file signatures
C->>DH: Generate content hashes
DH->>CDB: Check for duplicates
CDB->>DH: Return canonical reference
DH->>MAP: Store deduplication map
C->>C: Apply categorization rules
```
## Target Directory Structure
```
/mnt/organized/
├── apps/
│ ├── volumes/
│ │ ├── docker-volumes/
│ │ ├── app-data/
│ │ └── user-profiles/
│ └── runtime/
├── development/
│ ├── gitea/
│ │ ├── repositories/
│ │ ├── lfs-objects/
│ │ └── avatars/
│ ├── git-infrastructure/
│ │ ├── hooks/
│ │ ├── templates/
│ │ └── config/
│ └── build-tools/
│ ├── .maven/repository/
│ ├── gradle-cache/
│ └── sbt-cache/
├── artifacts/
│ ├── java/
│ │ ├── maven-central-cache/
│ │ ├── jfrog-artifactory/
│ │ └── gradle-build-cache/
│ ├── python/
│ │ ├── pypi-cache/
│ │ ├── wheelhouse/
│ │ └── pip-cache/
│ ├── node/
│ │ ├── npm-registry/
│ │ ├── yarn-cache/
│ │ └── pnpm-store/
│ └── go/
│ ├── goproxy-cache/
│ ├── module-cache/
│ └── sumdb-cache/
├── cache/
│ ├── llm-models/
│ │ ├── hugging-face/
│ │ ├── openai-cache/
│ │ └── local-llm/
│ ├── pycache/
│ ├── node_modules-archive/
│ └── browser-cache/
├── databases/
│ ├── postgresql/
│ ├── mysql/
│ ├── mongodb/
│ └── redis/
├── backups/
│ ├── system/
│ ├── application/
│ ├── database/
│ └── archive/
└── temp/
├── processing/
├── staging/
└── cleanup/
```
## Technology Stack Recommendation
### Primary Language: **Python 3.11+**
**Rationale:**
- Excellent file system handling capabilities
- Rich ecosystem for data processing (pandas, pyarrow)
- Built-in multiprocessing for I/O operations
- Superior hash library support for deduplication
- Cross-platform compatibility
### Key Libraries:
```python
# Core processing
import asyncio
import hashlib
import multiprocessing as mp
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
# Data handling
import pandas as pd
import pyarrow as pa
import sqlite3
import json
# File analysis
import magic # python-magic
import mimetypes
import filetype
# System integration
import psutil
import shutil
import os
```
## Deduplication Strategy
### Algorithm Selection: **Variable-Size Chunking with Rabin Fingerprinting**
```python
class AdvancedDeduplication:
def __init__(self, avg_chunk_size=8192):
self.chunker = RabinChunker(avg_chunk_size)
self.hash_store = HashStore()
def deduplicate_file(self, file_path):
chunks = self.chunker.chunk_file(file_path)
file_hash = self.compute_file_hash(chunks)
if self.hash_store.exists(file_hash):
return self.create_reference(file_hash)
else:
self.store_canonical(file_path, file_hash)
return file_hash
```
### Performance Optimization:
- **Parallel Processing**: Utilize all CPU cores for hashing
- **Memory Mapping**: For large files (>100MB)
- **Incremental Hashing**: Process files in streams
- **Cache Layer**: Redis for frequently accessed hashes
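A minimal sketch of the streaming vs. memory-mapping decision above (the 100 MB threshold follows the bullet list; chunk size is illustrative):
```python
import hashlib
import mmap
from pathlib import Path

LARGE_FILE_THRESHOLD = 100 * 1024 * 1024  # switch to mmap above ~100 MB

def hash_path(path: Path, chunk_size: int = 1 << 20) -> str:
    """Stream small files; memory-map large ones to avoid extra buffering."""
    hasher = hashlib.sha256()
    size = path.stat().st_size
    with open(path, "rb") as f:
        if size > LARGE_FILE_THRESHOLD:
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mapped:
                for offset in range(0, size, chunk_size):
                    hasher.update(mapped[offset:offset + chunk_size])
        else:
            for block in iter(lambda: f.read(chunk_size), b""):
                hasher.update(block)
    return hasher.hexdigest()
```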
## Classification Engine
### Rule-Based Classification System:
```yaml
classification_rules:
build_artifacts:
patterns:
- "**/target/**"
- "**/build/**"
- "**/dist/**"
- "**/node_modules/**"
action: categorize_as_build_cache
development_tools:
patterns:
- "**/.maven/**"
- "**/.gradle/**"
- "**/.npm/**"
- "**/.cache/**"
action: categorize_as_tool_cache
repositories:
patterns:
- "**/.git/**"
- "**/repositories/**"
- "**/gitea/**"
action: categorize_as_vcs
database_files:
patterns:
- "**/*.db"
- "**/*.sqlite"
- "**/postgresql/**"
- "**/mysql/**"
action: categorize_as_database
model_files:
patterns:
- "**/*.bin"
- "**/*.onnx"
- "**/models/**"
- "**/llm*/**"
action: categorize_as_ai_model
```
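A sketch of how such a rule file could be evaluated; it assumes the rules are stored as YAML on disk and uses `fnmatch`, the same matching primitive the repository's `RuleBasedClassifier` relies on (PyYAML is an assumed dependency here):
```python
import fnmatch
from pathlib import Path

import yaml  # PyYAML, assumed available for this sketch

def classify(path: Path, rules_file: str = "classification_rules.yaml") -> str | None:
    """Return the action of the first rule whose pattern matches the path."""
    with open(rules_file) as f:
        rules = yaml.safe_load(f)["classification_rules"]
    path_str = str(path)
    for rule in rules.values():
        if any(fnmatch.fnmatch(path_str, pattern) for pattern in rule["patterns"]):
            return rule["action"]
    return None
```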
## Performance Considerations
### NVMe Optimization Strategies:
1. **Parallel I/O Operations**
- Queue depth optimization (32-64 operations)
- Async I/O with io_uring where available
- Multi-threaded directory traversal
2. **Memory Management**
- Streaming processing for large files
- Memory-mapped file access
- Buffer pool for frequent operations
3. **CPU Optimization**
- SIMD instructions for hashing (AVX2/NEON)
- Process pool for parallel processing
- NUMA-aware memory allocation
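As one possible shape of the multi-threaded traversal mentioned above (a sketch; queue-depth tuning and io_uring live outside the standard library and are omitted, and the worker count is illustrative):
```python
import os
from concurrent.futures import ThreadPoolExecutor

def scan_tree(root: str, workers: int = 8) -> list[str]:
    """Breadth-first directory scan, one thread-pool batch per tree level."""
    files: list[str] = []

    def scan_dir(directory: str) -> list[str]:
        subdirs: list[str] = []
        try:
            with os.scandir(directory) as entries:
                for entry in entries:
                    if entry.is_dir(follow_symlinks=False):
                        subdirs.append(entry.path)
                    else:
                        files.append(entry.path)  # list.append is thread-safe under the GIL
        except PermissionError:
            pass
        return subdirs

    pending = [root]
    with ThreadPoolExecutor(max_workers=workers) as pool:
        while pending:
            pending = [d for subdirs in pool.map(scan_dir, pending) for d in subdirs]
    return files
```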
## Migration Strategy
### Three-Phase Approach:
```mermaid
graph LR
A[Phase 1: Analysis] --> B[Phase 2: Staging]
B --> C[Phase 3: Migration]
A --> A1[Discovery Scan]
A --> A2[Deduplication Analysis]
A --> A3[Space Calculation]
B --> B1[Create Target Structure]
B --> B2[Hard Link Staging]
B --> B3[Validation Check]
C --> C1[Atomic Move Operations]
C --> C2[Symlink Updates]
C --> C3[Cleanup Verification]
```
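One way the hard-link staging and atomic promotion steps could be realized (a sketch: `os.link` only works within a single filesystem, so a copy fallback is included, and validation here is reduced to a size check):
```python
import os
import shutil
from pathlib import Path

def stage_and_migrate(source: Path, target: Path, dry_run: bool = True) -> None:
    """Hard-link into a staging path, validate, then promote atomically."""
    staging = target.parent / (target.name + ".staging")
    if dry_run:
        print(f"[dry-run] {source} -> {target}")
        return
    staging.parent.mkdir(parents=True, exist_ok=True)
    try:
        os.link(source, staging)       # no data copied; same filesystem only
    except OSError:
        shutil.copy2(source, staging)  # cross-device fallback: real copy
    if staging.stat().st_size != source.stat().st_size:
        staging.unlink()
        raise RuntimeError(f"Validation failed for {source}")
    os.replace(staging, target)        # atomic rename within a filesystem
```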
## Monitoring & Validation
### Key Metrics:
- **Processing Rate**: Files/second, GB/hour
- **Deduplication Ratio**: Original vs. Final size
- **Error Rate**: Failed operations percentage
- **Resource Usage**: CPU, Memory, I/O utilization
### Validation Checks:
- File integrity verification (hash comparison)
- Directory structure validation
- Symlink resolution testing
- Permission preservation audit
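The hash-comparison check, for instance, can stay this small (sketch):
```python
import hashlib
from pathlib import Path

def verify_copy(source: Path, target: Path, chunk_size: int = 1 << 20) -> bool:
    """Confirm a migrated file by comparing SHA-256 digests of both copies."""
    def digest(path: Path) -> str:
        hasher = hashlib.sha256()
        with open(path, "rb") as f:
            for block in iter(lambda: f.read(chunk_size), b""):
                hasher.update(block)
        return hasher.hexdigest()

    return digest(source) == digest(target)
```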
## Risk Mitigation
### Safety Measures:
1. **Read-First Approach**: Never modify source until validation
2. **Incremental Processing**: Process in small batches
3. **Backup Verification**: Ensure backup integrity before operations
4. **Rollback Capability**: Maintain reverse mapping for recovery
5. **Dry-Run Mode**: Preview all operations before execution
## Implementation Timeline
### Phase 1: Tool Development (2-3 weeks)
- Core discovery engine
- Classification system
- Basic deduplication
- Testing framework
### Phase 2: Staging & Validation (1-2 weeks)
- Target structure creation
- Sample data processing
- Performance optimization
- Safety verification
### Phase 3: Production Migration (2-4 weeks)
- Full data processing
- Continuous monitoring
- Issue resolution
- Final validation
This architecture provides a robust, scalable solution for the data reorganization effort while maintaining data integrity and optimizing for the NVMe storage infrastructure.

38
Dockerfile Normal file

@@ -0,0 +1,38 @@
# Dockerfile for Project Defrag with PostgreSQL integration
FROM python:3.11-slim
# Install system dependencies
RUN apt-get update && apt-get install -y \
gcc \
g++ \
libpq-dev \
postgresql-client \
&& rm -rf /var/lib/apt/lists/*
# Set working directory
WORKDIR /app
# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PYTHONPATH=/app
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY . .
# Create non-root user
RUN useradd -m -u 1000 appuser && \
chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
CMD python -c "import psycopg2; psycopg2.connect(dbname='${POSTGRES_DB:-disk_reorganizer_db}', user='${POSTGRES_USER:-disk_reorg_user}', password='${POSTGRES_PASSWORD}', host='${DB_HOST:-db}', port='${DB_PORT:-5432}')" || exit 1
# Default command (can be overridden in docker-compose)
CMD ["python", "app/main.py", "--help"]

114
README.md Normal file

@@ -0,0 +1,114 @@
# Disk Reorganizer — Architectural Summary
## Core Outcome
Migration from **SQLite → PostgreSQL** completed.
System is now **network-capable**, **auditable**, **scalable**, and offers **real-time operational telemetry**.
---
## Architecture
### Database Layer (PostgreSQL)
* Central DB: `disk_reorganizer_db`
* User: `disk_reorg_user`
* Tables: `files`, `operations`
* Features: indexes, triggers, conflict-upserts, audit fields
* Deployment: SQL + Windows/Linux setup scripts
### Application Layer
* Python driver migrated to **psycopg2**
* Unified DB config + connection pooling
* Refactored CRUD + batch commits
* Robust error handling + transactional execution
### Operational Layer
* **Dynamic in-screen logging** during indexing + migration
* File/sec, GB processed, ETA, success/error counters
* Clean single-line, non-spamming UI updates
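A sketch of the single-line update pattern (field layout is illustrative; the repository's `ProgressLogger` may format these differently):
```python
import sys
import time

def report_progress(done: int, total: int, started_at: float) -> None:
    """Overwrite one status line instead of appending a new log line per file."""
    elapsed = time.time() - started_at
    rate = done / elapsed if elapsed > 0 else 0.0
    eta = (total - done) / rate if rate > 0 else float("inf")
    sys.stdout.write(f"\r{done}/{total} files | {rate:,.0f} files/s | ETA {eta:,.0f}s")
    sys.stdout.flush()
```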
---
## Workflow
1. **Setup**
```json
{
"host": "192.168.1.159",
"port": 5432,
"database": "disk_reorganizer_db",
"user": "disk_reorg_user",
"password": "heel-goed-wachtwoord"
}
```
```bash
./setup_database.sh # or setup_database.bat
pip install -r requirements.txt
```
2. **Index**
```bash
python app/main.py index "D:\\" disk_d
```
3. **Plan**
```bash
python app/main.py plan disk_d disk_e
```
4. **Dry-Run**
```bash
python app/main.py execute plan.json --dry-run
```
5. **Execute**
```bash
python app/main.py execute plan.json
```
6. **Report**
```bash
python app/main.py report
```
---
## Guarantees
* No destructive actions by default
* Originals preserved
* Every action logged in DB
* Error-resilient, continues safely
* Suitable for millions of file records
---
## Failure Points to Check
* PostgreSQL reachable on 5432
* Correct credentials
* Disk permissions
* Python + psycopg2 installed
---
## Essence
A lean, safe, high-visibility disk migration tool running on a proper relational backbone, engineered for clarity, scale, and operational certainty.


@@ -0,0 +1,63 @@
from pathlib import Path
from typing import Dict, Set, List
from collections import Counter
class FolderAnalyzer:
def __init__(self):
self.manifest_files = {'java': ['pom.xml', 'build.gradle', 'build.gradle.kts'], 'javascript': ['package.json', 'yarn.lock', 'package-lock.json'], 'python': ['pyproject.toml', 'setup.py', 'requirements.txt', 'Pipfile'], 'go': ['go.mod', 'go.sum'], 'rust': ['Cargo.toml', 'Cargo.lock'], 'docker': ['Dockerfile', 'docker-compose.yml', 'docker-compose.yaml'], 'k8s': ['helm', 'kustomization.yaml', 'deployment.yaml']}
self.intent_keywords = {'infrastructure': ['infra', 'deploy', 'k8s', 'docker', 'terraform', 'ansible'], 'application': ['app', 'service', 'api', 'server', 'client'], 'data': ['data', 'dataset', 'models', 'training', 'ml'], 'documentation': ['docs', 'documentation', 'wiki', 'readme'], 'testing': ['test', 'tests', 'spec', 'e2e', 'integration'], 'build': ['build', 'dist', 'target', 'out', 'bin'], 'config': ['config', 'conf', 'settings', 'env']}
def analyze_folder(self, folder_path: Path, files: List[Dict]) -> Dict:
files_list = [Path(f['path']) for f in files]
has_readme = any(('readme' in f.name.lower() for f in files_list))
has_git = any(('.git' in str(f) for f in files_list))
manifest_types = self._detect_manifests(files_list)
has_manifest = len(manifest_types) > 0
file_types = Counter((f.suffix.lower() for f in files_list if f.suffix))
dominant_types = dict(file_types.most_common(10))
intent = self._infer_intent(folder_path.name.lower(), files_list)
project_type = self._infer_project_type(manifest_types, dominant_types)
structure = {'depth': len(folder_path.parts), 'has_src': any(('src' in str(f) for f in files_list[:20])), 'has_tests': any(('test' in str(f) for f in files_list[:20])), 'has_docs': any(('doc' in str(f) for f in files_list[:20]))}
return {'has_readme': has_readme, 'has_git': has_git, 'has_manifest': has_manifest, 'manifest_types': manifest_types, 'dominant_file_types': dominant_types, 'project_type': project_type, 'intent': intent, 'structure': structure}
def _detect_manifests(self, files: List[Path]) -> List[str]:
detected = []
file_names = {f.name for f in files}
for tech, manifests in self.manifest_files.items():
if any((m in file_names for m in manifests)):
detected.append(tech)
return detected
def _infer_intent(self, folder_name: str, files: List[Path]) -> str:
file_str = ' '.join((str(f) for f in files[:50]))
for intent, keywords in self.intent_keywords.items():
if any((kw in folder_name or kw in file_str.lower() for kw in keywords)):
return intent
return 'unknown'
def _infer_project_type(self, manifests: List[str], file_types: Dict) -> str:
if manifests:
return manifests[0]
if '.py' in file_types and file_types.get('.py', 0) > 5:
return 'python'
if '.js' in file_types or '.ts' in file_types:
return 'javascript'
if '.java' in file_types:
return 'java'
if '.go' in file_types:
return 'go'
return 'mixed'
def generate_summary(self, folder_analysis: Dict, readme_text: str=None) -> str:
parts = []
if folder_analysis.get('project_type'):
parts.append(f"{folder_analysis['project_type']} project")
if folder_analysis.get('intent'):
parts.append(f"for {folder_analysis['intent']}")
if folder_analysis.get('manifest_types'):
parts.append(f"using {', '.join(folder_analysis['manifest_types'])}")
if readme_text:
first_para = readme_text.split('\n\n')[0][:200]
parts.append(f'Description: {first_para}')
return ' '.join(parts) if parts else 'Mixed content folder'


@@ -0,0 +1,2 @@
from .classifier import FileClassifier
__all__ = ['FileClassifier']


@@ -0,0 +1,30 @@
from typing import Protocol, Optional
from pathlib import Path
from dataclasses import dataclass
@dataclass
class ClassificationRule:
name: str
category: str
patterns: list[str]
priority: int = 0
description: str = ''
class IClassifier(Protocol):
def classify(self, path: Path, file_type: Optional[str]=None) -> Optional[str]:
...
def get_category_rules(self, category: str) -> list[ClassificationRule]:
...
class IRuleEngine(Protocol):
def add_rule(self, rule: ClassificationRule) -> None:
...
def remove_rule(self, rule_name: str) -> None:
...
def match_path(self, path: Path) -> Optional[str]:
...


@@ -0,0 +1,74 @@
from pathlib import Path
from typing import List, Set, Dict, Tuple
import re
class FileClassifier:
def __init__(self):
self.build_patterns = {'node_modules', '__pycache__', '.pytest_cache', 'target', 'build', 'dist', '.gradle', 'bin', 'obj', '.next', '.nuxt', 'vendor', '.venv', 'venv', 'site-packages', 'bower_components', 'jspm_packages'}
self.artifact_patterns = {'java': {'.jar', '.war', '.ear', '.class'}, 'python': {'.pyc', '.pyo', '.whl', '.egg'}, 'node': {'node_modules'}, 'go': {'vendor', 'pkg'}, 'rust': {'target'}, 'docker': {'.dockerignore', 'Dockerfile'}}
self.category_keywords = {'apps': {'app', 'application', 'service', 'api', 'server', 'client'}, 'infra': {'infrastructure', 'devops', 'docker', 'kubernetes', 'terraform', 'ansible', 'gitea', 'jenkins'}, 'dev': {'project', 'workspace', 'repo', 'src', 'code', 'dev'}, 'cache': {'cache', 'temp', 'tmp', '.cache'}, 'databases': {'postgres', 'mysql', 'redis', 'mongo', 'db', 'database'}, 'backups': {'backup', 'bak', 'snapshot', 'archive'}, 'user': {'documents', 'pictures', 'videos', 'downloads', 'desktop', 'music'}, 'artifacts': {'build', 'dist', 'release', 'output'}, 'temp': {'tmp', 'temp', 'staging', 'processing'}}
self.media_extensions = {'video': {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv', '.webm'}, 'audio': {'.mp3', '.flac', '.wav', '.ogg', '.m4a', '.aac'}, 'image': {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'}, 'document': {'.pdf', '.doc', '.docx', '.txt', '.md', '.odt'}, 'spreadsheet': {'.xls', '.xlsx', '.csv', '.ods'}, 'presentation': {'.ppt', '.pptx', '.odp'}}
self.code_extensions = {'.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h', '.cs', '.rb', '.php', '.swift', '.kt', '.scala', '.clj', '.r'}
def classify_path(self, path: str, size: int=0) -> Tuple[Set[str], str, bool]:
p = Path(path)
labels = set()
primary_category = 'misc'
is_build_artifact = False
parts = p.parts
name_lower = p.name.lower()
for part in parts:
part_lower = part.lower()
if part_lower in self.build_patterns:
is_build_artifact = True
labels.add('build-artifact')
break
if is_build_artifact:
for artifact_type, patterns in self.artifact_patterns.items():
if any((part.lower() in patterns for part in parts)) or p.suffix in patterns:
primary_category = f'artifacts/{artifact_type}'
labels.add('artifact')
return (labels, primary_category, is_build_artifact)
if '.git' in parts:
labels.add('vcs')
primary_category = 'infra/git-infrastructure'
return (labels, primary_category, False)
for category, keywords in self.category_keywords.items():
if any((kw in name_lower or any((kw in part.lower() for part in parts)) for kw in keywords)):
labels.add(category)
primary_category = category
break
for media_type, extensions in self.media_extensions.items():
if p.suffix.lower() in extensions:
labels.add(media_type)
labels.add('media')
primary_category = f'user/{media_type}'
break
if p.suffix.lower() in self.code_extensions:
labels.add('code')
if primary_category == 'misc':
primary_category = 'dev'
if size > 100 * 1024 * 1024:
labels.add('large-file')
if any((kw in name_lower for kw in ['test', 'spec', 'mock'])):
labels.add('test')
if any((kw in name_lower for kw in ['config', 'settings', 'env'])):
labels.add('config')
return (labels, primary_category, is_build_artifact)
def suggest_target_path(self, source_path: str, category: str, labels: Set[str]) -> str:
p = Path(source_path)
if 'build-artifact' in labels:
return f'trash/build-artifacts/{source_path}'
if category.startswith('artifacts/'):
artifact_type = category.split('/')[-1]
return f'artifacts/{artifact_type}/{p.name}'
if category.startswith('user/'):
media_type = category.split('/')[-1]
return f'user/{media_type}/{p.name}'
parts = [part for part in p.parts if part not in self.build_patterns]
if len(parts) > 3:
project_name = parts[0] if parts else 'misc'
return f"{category}/{project_name}/{'/'.join(parts[1:])}"
return f'{category}/{source_path}'


@@ -0,0 +1,148 @@
from pathlib import Path
from typing import Optional, Callable
import psycopg2
from .rules import RuleBasedClassifier
from .ml import create_ml_classifier, DummyMLClassifier
from ..shared.models import ProcessingStats
from ..shared.config import DatabaseConfig
from ..shared.logger import ProgressLogger
class ClassificationEngine:
def __init__(self, db_config: DatabaseConfig, logger: ProgressLogger, use_ml: bool=False):
self.db_config = db_config
self.logger = logger
self.rule_classifier = RuleBasedClassifier()
self.ml_classifier = create_ml_classifier() if use_ml else None
self.use_ml = use_ml and (not isinstance(self.ml_classifier, DummyMLClassifier))
self._connection = None
def _get_connection(self):
if self._connection is None or self._connection.closed:
self._connection = psycopg2.connect(host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password)
return self._connection
def classify_all(self, disk: Optional[str]=None, batch_size: int=1000, progress_callback: Optional[Callable[[int, int, ProcessingStats], None]]=None) -> ProcessingStats:
self.logger.section('Starting Classification')
conn = self._get_connection()
cursor = conn.cursor()
if disk:
cursor.execute('\n SELECT path, checksum\n FROM files\n WHERE disk_label = %s AND category IS NULL\n ', (disk,))
else:
cursor.execute('\n SELECT path, checksum\n FROM files\n WHERE category IS NULL\n ')
files_to_classify = cursor.fetchall()
total_files = len(files_to_classify)
self.logger.info(f'Found {total_files} files to classify')
stats = ProcessingStats()
batch = []
for path_str, checksum in files_to_classify:
path = Path(path_str)
category = self.rule_classifier.classify(path)
if category is None and self.use_ml and self.ml_classifier:
category = self.ml_classifier.classify(path)
if category is None:
category = 'temp/processing'
batch.append((category, str(path)))
stats.files_processed += 1
if len(batch) >= batch_size:
self._update_categories(cursor, batch)
conn.commit()
batch.clear()
if progress_callback:
progress_callback(stats.files_processed, total_files, stats)
if stats.files_processed % (batch_size * 10) == 0:
self.logger.progress(stats.files_processed, total_files, prefix='Files classified', elapsed_seconds=stats.elapsed_seconds)
if batch:
self._update_categories(cursor, batch)
conn.commit()
stats.files_succeeded = stats.files_processed
cursor.close()
self.logger.info(f'Classification complete: {stats.files_processed} files in {stats.elapsed_seconds:.1f}s')
return stats
def _update_categories(self, cursor, batch: list[tuple[str, str]]):
from psycopg2.extras import execute_batch
query = '\n UPDATE files\n SET category = %s\n WHERE path = %s\n '
execute_batch(cursor, query, batch)
def classify_path(self, path: Path) -> Optional[str]:
category = self.rule_classifier.classify(path)
if category is None and self.use_ml and self.ml_classifier:
category = self.ml_classifier.classify(path)
return category
def get_category_stats(self) -> dict[str, dict]:
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute('\n SELECT\n category,\n COUNT(*) as file_count,\n SUM(size) as total_size\n FROM files\n WHERE category IS NOT NULL\n GROUP BY category\n ORDER BY total_size DESC\n ')
stats = {}
for category, file_count, total_size in cursor.fetchall():
stats[category] = {'file_count': file_count, 'total_size': total_size}
cursor.close()
return stats
def get_uncategorized_count(self) -> int:
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute('SELECT COUNT(*) FROM files WHERE category IS NULL')
count = cursor.fetchone()[0]
cursor.close()
return count
def reclassify_category(self, old_category: str, new_category: str) -> int:
self.logger.info(f'Reclassifying {old_category} -> {new_category}')
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute('\n UPDATE files\n SET category = %s\n WHERE category = %s\n ', (new_category, old_category))
count = cursor.rowcount
conn.commit()
cursor.close()
self.logger.info(f'Reclassified {count} files')
return count
def train_ml_classifier(self, min_samples: int=10) -> bool:
if not self.use_ml or self.ml_classifier is None:
self.logger.warning('ML classifier not available')
return False
self.logger.subsection('Training ML Classifier')
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute('\n SELECT path, category\n FROM files\n WHERE category IS NOT NULL\n ')
training_data = [(Path(path), category) for path, category in cursor.fetchall()]
cursor.close()
if not training_data:
self.logger.warning('No training data available')
return False
category_counts = {}
for _, category in training_data:
category_counts[category] = category_counts.get(category, 0) + 1
filtered_data = [(path, category) for path, category in training_data if category_counts[category] >= min_samples]
if not filtered_data:
self.logger.warning(f'No categories with >= {min_samples} samples')
return False
self.logger.info(f'Training with {len(filtered_data)} samples')
try:
self.ml_classifier.train(filtered_data)
self.logger.info('ML classifier trained successfully')
return True
except Exception as e:
self.logger.error(f'Failed to train ML classifier: {e}')
return False
def get_all_categories(self) -> list[str]:
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute('\n SELECT DISTINCT category\n FROM files\n WHERE category IS NOT NULL\n ORDER BY category\n ')
categories = [row[0] for row in cursor.fetchall()]
cursor.close()
return categories
def close(self):
if self._connection and (not self._connection.closed):
self._connection.close()
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()

127
app/classification/ml.py Normal file

@@ -0,0 +1,127 @@
from pathlib import Path
from typing import Optional, List, Tuple
import pickle
try:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
SKLEARN_AVAILABLE = True
except ImportError:
SKLEARN_AVAILABLE = False
class MLClassifier:
def __init__(self):
if not SKLEARN_AVAILABLE:
raise ImportError('scikit-learn is required for ML classification. Install with: pip install scikit-learn')
self.model: Optional[Pipeline] = None
self.categories: List[str] = []
self._is_trained = False
def _extract_features(self, path: Path) -> str:
parts = path.parts
extension = path.suffix
filename = path.name
features = []
features.extend(parts)
if extension:
features.append(f'ext:{extension}')
name_parts = filename.replace('-', ' ').replace('_', ' ').replace('.', ' ').split()
features.extend([f'name:{part}' for part in name_parts])
return ' '.join(features)
def train(self, training_data: List[Tuple[Path, str]]) -> None:
if not training_data:
raise ValueError('Training data cannot be empty')
X = [self._extract_features(path) for path, _ in training_data]
y = [category for _, category in training_data]
self.categories = sorted(set(y))
self.model = Pipeline([('tfidf', TfidfVectorizer(max_features=1000, ngram_range=(1, 2), min_df=1)), ('classifier', MultinomialNB())])
self.model.fit(X, y)
self._is_trained = True
def classify(self, path: Path, file_type: Optional[str]=None) -> Optional[str]:
if not self._is_trained or self.model is None:
return None
features = self._extract_features(path)
try:
prediction = self.model.predict([features])[0]
return prediction
except Exception:
return None
def predict_proba(self, path: Path) -> dict[str, float]:
if not self._is_trained or self.model is None:
return {}
features = self._extract_features(path)
try:
probabilities = self.model.predict_proba([features])[0]
return {category: float(prob) for category, prob in zip(self.categories, probabilities)}
except Exception:
return {}
def save_model(self, model_path: Path) -> None:
if not self._is_trained:
raise ValueError('Cannot save untrained model')
model_data = {'model': self.model, 'categories': self.categories, 'is_trained': self._is_trained}
with open(model_path, 'wb') as f:
pickle.dump(model_data, f)
def load_model(self, model_path: Path) -> None:
with open(model_path, 'rb') as f:
model_data = pickle.load(f)
self.model = model_data['model']
self.categories = model_data['categories']
self._is_trained = model_data['is_trained']
@property
def is_trained(self) -> bool:
return self._is_trained
class DummyMLClassifier:
def __init__(self):
pass
def train(self, training_data: List[Tuple[Path, str]]) -> None:
raise NotImplementedError('ML classification requires scikit-learn. Install with: pip install scikit-learn')
def classify(self, path: Path, file_type: Optional[str]=None) -> Optional[str]:
return None
def predict_proba(self, path: Path) -> dict[str, float]:
return {}
def save_model(self, model_path: Path) -> None:
raise NotImplementedError('ML classification not available')
def load_model(self, model_path: Path) -> None:
raise NotImplementedError('ML classification not available')
@property
def is_trained(self) -> bool:
return False
def create_ml_classifier() -> MLClassifier | DummyMLClassifier:
if SKLEARN_AVAILABLE:
return MLClassifier()
else:
return DummyMLClassifier()
def train_from_database(db_connection, min_samples_per_category: int=10) -> MLClassifier | DummyMLClassifier:
classifier = create_ml_classifier()
if isinstance(classifier, DummyMLClassifier):
return classifier
cursor = db_connection.cursor()
cursor.execute('\n SELECT path, category\n FROM files\n WHERE category IS NOT NULL\n ')
training_data = [(Path(path), category) for path, category in cursor.fetchall()]
cursor.close()
if not training_data:
return classifier
category_counts = {}
for _, category in training_data:
category_counts[category] = category_counts.get(category, 0) + 1
filtered_data = [(path, category) for path, category in training_data if category_counts[category] >= min_samples_per_category]
if filtered_data:
classifier.train(filtered_data)
return classifier


@@ -0,0 +1,60 @@
from pathlib import Path
from typing import Optional
import fnmatch
from ._protocols import ClassificationRule
class RuleBasedClassifier:
def __init__(self):
self.rules: list[ClassificationRule] = []
self._load_default_rules()
def _load_default_rules(self):
self.add_rule(ClassificationRule(name='maven_cache', category='artifacts/java/maven', patterns=['**/.m2/**', '**/.maven/**', '**/maven-central-cache/**'], priority=10, description='Maven repository and cache'))
self.add_rule(ClassificationRule(name='gradle_cache', category='artifacts/java/gradle', patterns=['**/.gradle/**', '**/gradle-cache/**', '**/gradle-build-cache/**'], priority=10, description='Gradle cache and artifacts'))
self.add_rule(ClassificationRule(name='python_cache', category='cache/pycache', patterns=['**/__pycache__/**', '**/*.pyc', '**/*.pyo'], priority=10, description='Python cache files'))
self.add_rule(ClassificationRule(name='python_artifacts', category='artifacts/python', patterns=['**/pip-cache/**', '**/pypi-cache/**', '**/wheelhouse/**'], priority=10, description='Python package artifacts'))
self.add_rule(ClassificationRule(name='node_modules', category='cache/node_modules-archive', patterns=['**/node_modules/**'], priority=10, description='Node.js modules'))
self.add_rule(ClassificationRule(name='node_cache', category='artifacts/node', patterns=['**/.npm/**', '**/npm-registry/**', '**/yarn-cache/**', '**/pnpm-store/**'], priority=10, description='Node.js package managers cache'))
self.add_rule(ClassificationRule(name='go_cache', category='artifacts/go', patterns=['**/goproxy-cache/**', '**/go/pkg/mod/**', '**/go-module-cache/**'], priority=10, description='Go module cache'))
self.add_rule(ClassificationRule(name='git_repos', category='development/git-infrastructure', patterns=['**/.git/**', '**/gitea/repositories/**'], priority=15, description='Git repositories and infrastructure'))
self.add_rule(ClassificationRule(name='gitea', category='development/gitea', patterns=['**/gitea/**'], priority=12, description='Gitea server data'))
self.add_rule(ClassificationRule(name='postgresql', category='databases/postgresql', patterns=['**/postgresql/**', '**/postgres/**', '**/*.sql'], priority=10, description='PostgreSQL databases'))
self.add_rule(ClassificationRule(name='mysql', category='databases/mysql', patterns=['**/mysql/**', '**/mariadb/**'], priority=10, description='MySQL/MariaDB databases'))
self.add_rule(ClassificationRule(name='mongodb', category='databases/mongodb', patterns=['**/mongodb/**', '**/mongo/**'], priority=10, description='MongoDB databases'))
self.add_rule(ClassificationRule(name='redis', category='databases/redis', patterns=['**/redis/**', '**/*.rdb'], priority=10, description='Redis databases'))
self.add_rule(ClassificationRule(name='sqlite', category='databases/sqlite', patterns=['**/*.db', '**/*.sqlite', '**/*.sqlite3'], priority=8, description='SQLite databases'))
self.add_rule(ClassificationRule(name='llm_models', category='cache/llm-models', patterns=['**/hugging-face/**', '**/huggingface/**', '**/.cache/huggingface/**', '**/models/**/*.bin', '**/models/**/*.onnx', '**/models/**/*.safetensors', '**/llm*/**', '**/openai-cache/**'], priority=12, description='LLM and AI model files'))
self.add_rule(ClassificationRule(name='docker_volumes', category='apps/volumes/docker-volumes', patterns=['**/docker/volumes/**', '**/var/lib/docker/volumes/**'], priority=10, description='Docker volumes'))
self.add_rule(ClassificationRule(name='app_data', category='apps/volumes/app-data', patterns=['**/app-data/**', '**/application-data/**'], priority=8, description='Application data'))
self.add_rule(ClassificationRule(name='build_output', category='development/build-tools', patterns=['**/target/**', '**/build/**', '**/dist/**', '**/out/**'], priority=5, description='Build output directories'))
self.add_rule(ClassificationRule(name='system_backups', category='backups/system', patterns=['**/backup/**', '**/backups/**', '**/*.bak', '**/*.backup'], priority=10, description='System backups'))
self.add_rule(ClassificationRule(name='database_backups', category='backups/database', patterns=['**/*.sql.gz', '**/*.dump', '**/db-backup/**'], priority=11, description='Database backups'))
self.add_rule(ClassificationRule(name='archives', category='backups/archive', patterns=['**/*.tar', '**/*.tar.gz', '**/*.tgz', '**/*.zip', '**/*.7z'], priority=5, description='Archive files'))
def add_rule(self, rule: ClassificationRule) -> None:
self.rules.append(rule)
self.rules.sort(key=lambda r: r.priority, reverse=True)
def remove_rule(self, rule_name: str) -> None:
self.rules = [r for r in self.rules if r.name != rule_name]
def match_path(self, path: Path) -> Optional[str]:
path_str = str(path)
for rule in self.rules:
for pattern in rule.patterns:
if fnmatch.fnmatch(path_str, pattern):
return rule.category
return None
def classify(self, path: Path, file_type: Optional[str]=None) -> Optional[str]:
return self.match_path(path)
def get_category_rules(self, category: str) -> list[ClassificationRule]:
return [r for r in self.rules if r.category == category]
def get_all_categories(self) -> set[str]:
return {r.category for r in self.rules}
def get_rules_by_priority(self, min_priority: int=0) -> list[ClassificationRule]:
return [r for r in self.rules if r.priority >= min_priority]

3
app/content/__init__.py Normal file

@@ -0,0 +1,3 @@
from .profiler import ContentProfiler
from .extractors import ContentExtractor
__all__ = ['ContentProfiler', 'ContentExtractor']

62
app/content/extractors.py Normal file

@@ -0,0 +1,62 @@
from pathlib import Path
from typing import Dict, Optional
import json
class ContentExtractor:
def __init__(self):
self.extractors = {'pdf_text': self._extract_pdf, 'ocr+caption': self._extract_image, 'transcribe': self._extract_audio, 'transcribe+scenes': self._extract_video, 'office_text': self._extract_document, 'read': self._extract_text, 'read+syntax': self._extract_code}
def extract(self, file_path: Path, extractor_type: str) -> Dict:
extractor = self.extractors.get(extractor_type)
if not extractor:
return {'error': f'Unknown extractor: {extractor_type}'}
try:
return extractor(file_path)
except Exception as e:
return {'error': str(e)}
def _extract_text(self, file_path: Path) -> Dict:
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read(1024 * 1024)
return {'text': content, 'char_count': len(content), 'needs_llm': False}
except Exception as e:
return {'error': str(e)}
def _extract_code(self, file_path: Path) -> Dict:
result = self._extract_text(file_path)
if 'error' not in result:
result['type'] = 'code'
result['needs_llm'] = True
return result
def _extract_pdf(self, file_path: Path) -> Dict:
try:
import PyPDF2
text_parts = []
with open(file_path, 'rb') as f:
pdf = PyPDF2.PdfReader(f)
for page in pdf.pages[:10]:
text_parts.append(page.extract_text())
text = '\n'.join(text_parts)
return {'text': text, 'pages_extracted': len(text_parts), 'needs_llm': len(text.strip()) > 100, 'type': 'document'}
except Exception as e:
return {'error': str(e), 'needs_ocr': True}
def _extract_image(self, file_path: Path) -> Dict:
return {'type': 'image', 'needs_ocr': True, 'needs_caption': True, 'needs_llm': True, 'pipeline': ['ocr', 'caption', 'embedding'], 'status': 'pending'}
def _extract_audio(self, file_path: Path) -> Dict:
return {'type': 'audio', 'needs_transcription': True, 'needs_llm': True, 'pipeline': ['transcribe', 'summarize'], 'status': 'pending'}
def _extract_video(self, file_path: Path) -> Dict:
return {'type': 'video', 'needs_transcription': True, 'needs_scene_detection': True, 'needs_llm': True, 'pipeline': ['transcribe', 'scenes', 'summarize'], 'status': 'pending'}
def _extract_document(self, file_path: Path) -> Dict:
try:
import textract
text = textract.process(str(file_path)).decode('utf-8')
return {'text': text, 'type': 'document', 'needs_llm': len(text.strip()) > 100}
except:
return {'error': 'textract failed', 'needs_llm': True}

108
app/content/profiler.py Normal file

@@ -0,0 +1,108 @@
from pathlib import Path
from typing import Dict, Optional, Tuple
import mimetypes
import magic
import json
from datetime import datetime
class ContentProfiler:
def __init__(self):
self.mime_detector = magic.Magic(mime=True)
self.kind_mapping = {'text': ['text/plain', 'text/html', 'text/css', 'text/javascript', 'text/markdown'], 'code': ['application/x-python', 'application/javascript', 'text/x-java', 'text/x-c'], 'pdf': ['application/pdf'], 'image': ['image/jpeg', 'image/png', 'image/gif', 'image/webp', 'image/svg+xml'], 'audio': ['audio/mpeg', 'audio/wav', 'audio/ogg', 'audio/flac'], 'video': ['video/mp4', 'video/x-matroska', 'video/avi', 'video/webm'], 'archive': ['application/zip', 'application/x-tar', 'application/gzip', 'application/x-7z-compressed'], 'document': ['application/msword', 'application/vnd.openxmlformats-officedocument'], 'spreadsheet': ['application/vnd.ms-excel', 'text/csv']}
self.text_exts = {'.txt', '.md', '.rst', '.log', '.json', '.xml', '.yaml', '.yml', '.toml', '.ini', '.cfg'}
self.code_exts = {'.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h', '.cs', '.rb', '.php'}
self.processable_kinds = {'text', 'code', 'pdf', 'image', 'audio', 'video', 'document'}
def profile_file(self, file_path: Path) -> Dict:
try:
stat = file_path.stat()
size = stat.st_size
mtime = datetime.fromtimestamp(stat.st_mtime)
mime_type = self._detect_mime(file_path)
kind = self._determine_kind(file_path, mime_type)
profile = {'path': str(file_path), 'size': size, 'mtime': mtime.isoformat(), 'mime': mime_type, 'kind': kind, 'processable': kind in self.processable_kinds, 'extractor': self._suggest_extractor(kind, mime_type), 'hints': self._extract_hints(file_path, kind, mime_type, size)}
return profile
except Exception as e:
return {'path': str(file_path), 'error': str(e), 'processable': False}
def _detect_mime(self, file_path: Path) -> str:
try:
return self.mime_detector.from_file(str(file_path))
except:
guess = mimetypes.guess_type(str(file_path))[0]
return guess or 'application/octet-stream'
def _determine_kind(self, file_path: Path, mime_type: str) -> str:
for kind, mimes in self.kind_mapping.items():
if any((mime in mime_type for mime in mimes)):
return kind
suffix = file_path.suffix.lower()
if suffix in self.text_exts:
return 'text'
if suffix in self.code_exts:
return 'code'
return 'unknown'
def _suggest_extractor(self, kind: str, mime_type: str) -> Optional[str]:
extractors = {'pdf': 'pdf_text', 'image': 'ocr+caption', 'audio': 'transcribe', 'video': 'transcribe+scenes', 'document': 'office_text', 'text': 'read', 'code': 'read+syntax'}
return extractors.get(kind)
def _extract_hints(self, file_path: Path, kind: str, mime_type: str, size: int) -> Dict:
hints = {}
if kind == 'text' or kind == 'code':
hints['language'] = self._guess_language(file_path)
if size < 1024 * 1024:
hints['lines'] = self._count_lines(file_path)
if kind == 'pdf':
hints['page_count'] = self._get_pdf_pages(file_path)
if kind in ['audio', 'video']:
hints['duration'] = self._get_media_duration(file_path)
if kind == 'image':
hints['has_exif'] = self._has_exif(file_path)
hints['dimensions'] = self._get_image_dimensions(file_path)
return hints
def _guess_language(self, file_path: Path) -> Optional[str]:
lang_map = {'.py': 'python', '.js': 'javascript', '.ts': 'typescript', '.java': 'java', '.go': 'go', '.rs': 'rust', '.c': 'c', '.cpp': 'cpp', '.cs': 'csharp', '.rb': 'ruby', '.php': 'php'}
return lang_map.get(file_path.suffix.lower())
def _count_lines(self, file_path: Path) -> Optional[int]:
try:
with open(file_path, 'rb') as f:
return sum((1 for _ in f))
except:
return None
def _get_pdf_pages(self, file_path: Path) -> Optional[int]:
try:
import PyPDF2
with open(file_path, 'rb') as f:
pdf = PyPDF2.PdfReader(f)
return len(pdf.pages)
except:
return None
def _get_media_duration(self, file_path: Path) -> Optional[float]:
try:
import ffmpeg
probe = ffmpeg.probe(str(file_path))
return float(probe['format']['duration'])
except:
return None
def _has_exif(self, file_path: Path) -> bool:
try:
from PIL import Image
img = Image.open(file_path)
return hasattr(img, '_getexif') and img._getexif() is not None
except:
return False
def _get_image_dimensions(self, file_path: Path) -> Optional[Tuple[int, int]]:
try:
from PIL import Image
with Image.open(file_path) as img:
return img.size
except:
return None


@@ -0,0 +1,21 @@
"""Deduplication package exports"""
from .chunker import (
RabinChunker,
SimpleChunker,
hash_chunk,
hash_file,
compute_file_signature
)
from .store import HashStore, MemoryHashStore
from .engine import DeduplicationEngine
__all__ = [
'RabinChunker',
'SimpleChunker',
'hash_chunk',
'hash_file',
'compute_file_signature',
'HashStore',
'MemoryHashStore',
'DeduplicationEngine',
]


@@ -0,0 +1,241 @@
"""Rabin fingerprint chunker for content-defined chunking"""
import hashlib
from pathlib import Path
from typing import Iterator, Optional
class RabinChunker:
"""Content-defined chunking using Rabin fingerprinting
Uses a rolling hash to identify chunk boundaries based on content,
allowing for efficient deduplication even when data is modified.
"""
def __init__(
self,
avg_chunk_size: int = 8192,
min_chunk_size: Optional[int] = None,
max_chunk_size: Optional[int] = None,
window_size: int = 48
):
"""Initialize Rabin chunker
Args:
avg_chunk_size: Target average chunk size in bytes
min_chunk_size: Minimum chunk size (default: avg_chunk_size // 4)
max_chunk_size: Maximum chunk size (default: avg_chunk_size * 8)
window_size: Rolling hash window size
"""
self.avg_chunk_size = avg_chunk_size
self.min_chunk_size = min_chunk_size or (avg_chunk_size // 4)
self.max_chunk_size = max_chunk_size or (avg_chunk_size * 8)
self.window_size = window_size
# Calculate mask for boundary detection
# For avg_chunk_size, we want boundaries at 1/avg_chunk_size probability
bits = 0
size = avg_chunk_size
while size > 1:
bits += 1
size >>= 1
self.mask = (1 << bits) - 1
# Polynomial for rolling hash (prime number)
self.poly = 0x3DA3358B4DC173
def chunk_file(self, file_path: Path, chunk_size: Optional[int] = None) -> Iterator[bytes]:
"""Chunk a file using Rabin fingerprinting
Args:
file_path: Path to file to chunk
chunk_size: If provided, use fixed-size chunking instead
Yields:
Chunk data as bytes
"""
if chunk_size:
# Use fixed-size chunking
yield from self._chunk_fixed(file_path, chunk_size)
else:
# Use content-defined chunking
yield from self._chunk_rabin(file_path)
def _chunk_fixed(self, file_path: Path, chunk_size: int) -> Iterator[bytes]:
"""Fixed-size chunking
Args:
file_path: Path to file
chunk_size: Chunk size in bytes
Yields:
Fixed-size chunks
"""
with open(file_path, 'rb') as f:
while True:
chunk = f.read(chunk_size)
if not chunk:
break
yield chunk
def _chunk_rabin(self, file_path: Path) -> Iterator[bytes]:
"""Content-defined chunking using Rabin fingerprinting
Args:
file_path: Path to file
Yields:
Variable-size chunks based on content
"""
with open(file_path, 'rb') as f:
chunk_data = bytearray()
window = bytearray()
hash_value = 0
while True:
byte = f.read(1)
if not byte:
# End of file - yield remaining data
if chunk_data:
yield bytes(chunk_data)
break
chunk_data.extend(byte)
window.extend(byte)
# Maintain window size
if len(window) > self.window_size:
window.pop(0)
# Update rolling hash
hash_value = self._rolling_hash(window)
# Check if we should create a boundary
should_break = (
len(chunk_data) >= self.min_chunk_size and
(
(hash_value & self.mask) == 0 or
len(chunk_data) >= self.max_chunk_size
)
)
if should_break:
yield bytes(chunk_data)
chunk_data = bytearray()
window = bytearray()
hash_value = 0
def _rolling_hash(self, window: bytearray) -> int:
"""Calculate rolling hash for window
Args:
window: Byte window
Returns:
Hash value
"""
hash_value = 0
for byte in window:
hash_value = ((hash_value << 1) + byte) & 0xFFFFFFFFFFFFFFFF
return hash_value
class SimpleChunker:
"""Simple fixed-size chunker for comparison"""
def __init__(self, chunk_size: int = 8192):
"""Initialize simple chunker
Args:
chunk_size: Fixed chunk size in bytes
"""
self.chunk_size = chunk_size
def chunk_file(self, file_path: Path) -> Iterator[bytes]:
"""Chunk file into fixed-size pieces
Args:
file_path: Path to file
Yields:
Fixed-size chunks
"""
with open(file_path, 'rb') as f:
while True:
chunk = f.read(self.chunk_size)
if not chunk:
break
yield chunk
def hash_chunk(chunk: bytes, algorithm: str = 'sha256') -> str:
"""Hash a chunk of data
Args:
chunk: Chunk data
algorithm: Hash algorithm (default: sha256)
Returns:
Hex digest of hash
"""
hasher = hashlib.new(algorithm)
hasher.update(chunk)
return hasher.hexdigest()
def hash_file(file_path: Path, algorithm: str = 'sha256', chunk_size: int = 65536) -> str:
"""Hash entire file
Args:
file_path: Path to file
algorithm: Hash algorithm (default: sha256)
chunk_size: Size of chunks to read
Returns:
Hex digest of file hash
"""
hasher = hashlib.new(algorithm)
with open(file_path, 'rb') as f:
while True:
chunk = f.read(chunk_size)
if not chunk:
break
hasher.update(chunk)
return hasher.hexdigest()
def compute_file_signature(
file_path: Path,
use_rabin: bool = True,
avg_chunk_size: int = 8192
) -> tuple[str, list[str]]:
"""Compute file signature with chunk hashes
Args:
file_path: Path to file
use_rabin: Whether to use Rabin chunking (vs fixed-size)
avg_chunk_size: Average chunk size for Rabin or fixed size
Returns:
Tuple of (file_hash, list of chunk hashes)
"""
if use_rabin:
chunker = RabinChunker(avg_chunk_size=avg_chunk_size)
else:
chunker = SimpleChunker(chunk_size=avg_chunk_size)
chunk_hashes = []
file_hasher = hashlib.sha256()
for chunk in chunker.chunk_file(file_path):
# Hash individual chunk
chunk_hash = hash_chunk(chunk)
chunk_hashes.append(chunk_hash)
# Update file hash
file_hasher.update(chunk)
file_hash = file_hasher.hexdigest()
return file_hash, chunk_hashes

353
app/deduplication/engine.py Normal file

@@ -0,0 +1,353 @@
"""Deduplication engine"""
from pathlib import Path
from typing import Optional, Callable
from concurrent.futures import ThreadPoolExecutor, as_completed
import psycopg2
from .chunker import compute_file_signature, hash_file
from .store import HashStore
from ..shared.models import FileRecord, ProcessingStats
from ..shared.config import DatabaseConfig, ProcessingConfig
from ..shared.logger import ProgressLogger
class DeduplicationEngine:
"""Engine for deduplicating files"""
def __init__(
self,
db_config: DatabaseConfig,
processing_config: ProcessingConfig,
logger: ProgressLogger
):
"""Initialize deduplication engine
Args:
db_config: Database configuration
processing_config: Processing configuration
logger: Progress logger
"""
self.db_config = db_config
self.processing_config = processing_config
self.logger = logger
self.hash_store = HashStore(db_config)
self._connection = None
def _get_connection(self):
"""Get or create database connection"""
if self._connection is None or self._connection.closed:
self._connection = psycopg2.connect(
host=self.db_config.host,
port=self.db_config.port,
database=self.db_config.database,
user=self.db_config.user,
password=self.db_config.password
)
return self._connection
def deduplicate_all(
self,
disk: Optional[str] = None,
use_chunks: bool = True,
progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
) -> ProcessingStats:
"""Deduplicate all files in database
Args:
disk: Optional disk filter
use_chunks: Whether to use chunk-level deduplication
progress_callback: Optional callback for progress updates
Returns:
ProcessingStats with deduplication statistics
"""
self.logger.section("Starting Deduplication")
conn = self._get_connection()
cursor = conn.cursor()
# Get files without checksums
if disk:
cursor.execute("""
SELECT path, size
FROM files
WHERE disk_label = %s AND checksum IS NULL
ORDER BY size DESC
""", (disk,))
else:
cursor.execute("""
SELECT path, size
FROM files
WHERE checksum IS NULL
ORDER BY size DESC
""")
files_to_process = cursor.fetchall()
total_files = len(files_to_process)
self.logger.info(f"Found {total_files} files to process")
stats = ProcessingStats()
# Process files with thread pool
with ThreadPoolExecutor(max_workers=self.processing_config.parallel_workers) as executor:
futures = {}
for path_str, size in files_to_process:
path = Path(path_str)
future = executor.submit(self._process_file, path, use_chunks)
futures[future] = (path, size)
# Process completed futures
for future in as_completed(futures):
path, size = futures[future]
try:
checksum, duplicate_of = future.result()
if checksum:
# Update database
cursor.execute("""
UPDATE files
SET checksum = %s, duplicate_of = %s
WHERE path = %s
""", (checksum, duplicate_of, str(path)))
stats.files_succeeded += 1
stats.bytes_processed += size
stats.files_processed += 1
# Commit periodically
if stats.files_processed % self.processing_config.commit_interval == 0:
conn.commit()
# Progress callback
if progress_callback:
progress_callback(stats.files_processed, total_files, stats)
# Log progress
self.logger.progress(
stats.files_processed,
total_files,
prefix="Files processed",
bytes_processed=stats.bytes_processed,
elapsed_seconds=stats.elapsed_seconds
)
except Exception as e:
self.logger.warning(f"Failed to process {path}: {e}")
stats.files_failed += 1
stats.files_processed += 1
# Final commit
conn.commit()
cursor.close()
self.logger.info(
f"Deduplication complete: {stats.files_succeeded}/{total_files} files, "
f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
)
return stats
def _process_file(
self,
path: Path,
use_chunks: bool
) -> tuple[Optional[str], Optional[str]]:
"""Process a single file for deduplication
Args:
path: Path to file
use_chunks: Whether to use chunk-level deduplication
Returns:
Tuple of (checksum, duplicate_of_path)
"""
if not path.exists():
return None, None
try:
if use_chunks:
# Compute file signature with chunks
checksum, chunk_hashes = compute_file_signature(
path,
use_rabin=True,
avg_chunk_size=self.processing_config.chunk_size
)
else:
# Just compute file hash
checksum = hash_file(
path,
algorithm=self.processing_config.hash_algorithm
)
chunk_hashes = None
# Check if hash exists
if self.hash_store.exists(checksum):
# Duplicate found
canonical_path = self.hash_store.get_canonical(checksum)
return checksum, canonical_path
else:
# New unique file
size = path.stat().st_size
self.hash_store.store_canonical(
checksum,
path,
size,
chunk_hashes
)
return checksum, None
except Exception as e:
self.logger.debug(f"Error processing {path}: {e}")
raise
def find_duplicates(
self,
disk: Optional[str] = None
) -> dict[str, list[str]]:
"""Find all duplicate files
Args:
disk: Optional disk filter
Returns:
Dictionary mapping canonical path to list of duplicate paths
"""
self.logger.subsection("Finding Duplicates")
conn = self._get_connection()
cursor = conn.cursor()
# Query for duplicates
if disk:
cursor.execute("""
SELECT checksum, array_agg(path ORDER BY path) as paths
FROM files
WHERE disk_label = %s AND checksum IS NOT NULL
GROUP BY checksum
HAVING COUNT(*) > 1
""", (disk,))
else:
cursor.execute("""
SELECT checksum, array_agg(path ORDER BY path) as paths
FROM files
WHERE checksum IS NOT NULL
GROUP BY checksum
HAVING COUNT(*) > 1
""")
duplicates = {}
for checksum, paths in cursor.fetchall():
canonical = paths[0]
duplicates[canonical] = paths[1:]
cursor.close()
self.logger.info(f"Found {len(duplicates)} sets of duplicates")
return duplicates
def get_deduplication_stats(self) -> dict:
"""Get deduplication statistics
Returns:
Dictionary with statistics
"""
conn = self._get_connection()
cursor = conn.cursor()
stats = {}
# Total files
cursor.execute("SELECT COUNT(*) FROM files WHERE checksum IS NOT NULL")
stats['total_files'] = cursor.fetchone()[0]
# Unique files
cursor.execute("SELECT COUNT(DISTINCT checksum) FROM files WHERE checksum IS NOT NULL")
stats['unique_files'] = cursor.fetchone()[0]
# Duplicate files
stats['duplicate_files'] = stats['total_files'] - stats['unique_files']
# Total size
cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE checksum IS NOT NULL")
stats['total_size'] = cursor.fetchone()[0]
# Unique size
cursor.execute("""
SELECT COALESCE(SUM(size), 0)
FROM (
SELECT DISTINCT ON (checksum) size
FROM files
WHERE checksum IS NOT NULL
) AS unique_files
""")
stats['unique_size'] = cursor.fetchone()[0]
# Wasted space
stats['wasted_space'] = stats['total_size'] - stats['unique_size']
# Deduplication ratio
if stats['total_size'] > 0:
stats['dedup_ratio'] = stats['unique_size'] / stats['total_size']
else:
stats['dedup_ratio'] = 1.0
# Space saved percentage
if stats['total_size'] > 0:
stats['space_saved_percent'] = (stats['wasted_space'] / stats['total_size']) * 100
else:
stats['space_saved_percent'] = 0.0
cursor.close()
return stats
def mark_canonical_files(self) -> int:
"""Mark canonical (first occurrence) files in database
Returns:
Number of canonical files marked
"""
self.logger.subsection("Marking Canonical Files")
conn = self._get_connection()
cursor = conn.cursor()
# Find first occurrence of each checksum and mark as canonical
cursor.execute("""
WITH canonical AS (
SELECT DISTINCT ON (checksum) path, checksum
FROM files
WHERE checksum IS NOT NULL
ORDER BY checksum, path
)
UPDATE files
SET duplicate_of = NULL
WHERE path IN (SELECT path FROM canonical)
""")
count = cursor.rowcount
conn.commit()
cursor.close()
self.logger.info(f"Marked {count} canonical files")
return count
def close(self):
"""Close connections"""
self.hash_store.close()
if self._connection and not self._connection.closed:
self._connection.close()
def __enter__(self):
"""Context manager entry"""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit"""
self.close()

412
app/deduplication/store.py Normal file

@@ -0,0 +1,412 @@
"""Hash store for deduplication with optional Redis support"""
from typing import Optional, Dict, Set
from pathlib import Path
import psycopg2
from psycopg2.extras import execute_batch
from ..shared.config import DatabaseConfig
class HashStore:
"""PostgreSQL-based hash store for deduplication"""
def __init__(self, db_config: DatabaseConfig):
"""Initialize hash store
Args:
db_config: Database configuration
"""
self.db_config = db_config
self._connection = None
def _get_connection(self):
"""Get or create database connection"""
if self._connection is None or self._connection.closed:
self._connection = psycopg2.connect(
host=self.db_config.host,
port=self.db_config.port,
database=self.db_config.database,
user=self.db_config.user,
password=self.db_config.password
)
return self._connection
def _ensure_tables(self):
"""Ensure hash store tables exist"""
conn = self._get_connection()
cursor = conn.cursor()
# Create hashes table for file-level deduplication
cursor.execute("""
CREATE TABLE IF NOT EXISTS file_hashes (
checksum TEXT PRIMARY KEY,
canonical_path TEXT NOT NULL,
size BIGINT NOT NULL,
first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
ref_count INTEGER DEFAULT 1
)
""")
# Create chunk hashes table for chunk-level deduplication
cursor.execute("""
CREATE TABLE IF NOT EXISTS chunk_hashes (
chunk_hash TEXT PRIMARY KEY,
size INTEGER NOT NULL,
first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
ref_count INTEGER DEFAULT 1
)
""")
# Create file-chunk mapping table
cursor.execute("""
CREATE TABLE IF NOT EXISTS file_chunks (
id SERIAL PRIMARY KEY,
file_checksum TEXT NOT NULL,
chunk_hash TEXT NOT NULL,
chunk_index INTEGER NOT NULL,
FOREIGN KEY (file_checksum) REFERENCES file_hashes(checksum),
FOREIGN KEY (chunk_hash) REFERENCES chunk_hashes(chunk_hash),
UNIQUE (file_checksum, chunk_index)
)
""")
# Create indexes
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_file_chunks_file
ON file_chunks(file_checksum)
""")
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_file_chunks_chunk
ON file_chunks(chunk_hash)
""")
conn.commit()
cursor.close()
def exists(self, checksum: str) -> bool:
"""Check if hash exists in store
Args:
checksum: File hash to check
Returns:
True if hash exists
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute(
"SELECT 1 FROM file_hashes WHERE checksum = %s LIMIT 1",
(checksum,)
)
exists = cursor.fetchone() is not None
cursor.close()
return exists
def get_canonical(self, checksum: str) -> Optional[str]:
"""Get canonical path for a hash
Args:
checksum: File hash
Returns:
Canonical file path or None if not found
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute(
"SELECT canonical_path FROM file_hashes WHERE checksum = %s",
(checksum,)
)
result = cursor.fetchone()
cursor.close()
return result[0] if result else None
def store_canonical(
self,
checksum: str,
path: Path,
size: int,
chunk_hashes: Optional[list[str]] = None
) -> None:
"""Store canonical reference for a hash
Args:
checksum: File hash
path: Canonical file path
size: File size in bytes
chunk_hashes: Optional list of chunk hashes
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
try:
# Store file hash
cursor.execute("""
INSERT INTO file_hashes (checksum, canonical_path, size)
VALUES (%s, %s, %s)
ON CONFLICT (checksum) DO UPDATE SET
ref_count = file_hashes.ref_count + 1
""", (checksum, str(path), size))
# Store chunk hashes if provided
if chunk_hashes:
# Insert chunk hashes
chunk_data = [(chunk_hash, 0) for chunk_hash in chunk_hashes]
execute_batch(cursor, """
INSERT INTO chunk_hashes (chunk_hash, size)
VALUES (%s, %s)
ON CONFLICT (chunk_hash) DO UPDATE SET
ref_count = chunk_hashes.ref_count + 1
""", chunk_data, page_size=1000)
# Create file-chunk mappings
mapping_data = [
(checksum, chunk_hash, idx)
for idx, chunk_hash in enumerate(chunk_hashes)
]
execute_batch(cursor, """
INSERT INTO file_chunks (file_checksum, chunk_hash, chunk_index)
VALUES (%s, %s, %s)
ON CONFLICT (file_checksum, chunk_index) DO NOTHING
""", mapping_data, page_size=1000)
conn.commit()
except Exception as e:
conn.rollback()
raise
finally:
cursor.close()
def get_chunk_hashes(self, checksum: str) -> list[str]:
"""Get chunk hashes for a file
Args:
checksum: File hash
Returns:
List of chunk hashes in order
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute("""
SELECT chunk_hash
FROM file_chunks
WHERE file_checksum = %s
ORDER BY chunk_index
""", (checksum,))
chunk_hashes = [row[0] for row in cursor.fetchall()]
cursor.close()
return chunk_hashes
def get_duplicates(self) -> Dict[str, list[str]]:
"""Get all duplicate file groups
Returns:
Dictionary mapping canonical path to list of duplicate paths
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
# Get all files with their hashes
cursor.execute("""
SELECT f.path, f.checksum
FROM files f
WHERE f.checksum IS NOT NULL
""")
# Group by checksum
hash_to_paths: Dict[str, list[str]] = {}
for path, checksum in cursor.fetchall():
if checksum not in hash_to_paths:
hash_to_paths[checksum] = []
hash_to_paths[checksum].append(path)
cursor.close()
# Filter to only duplicates (more than one file)
duplicates = {
paths[0]: paths[1:]
for checksum, paths in hash_to_paths.items()
if len(paths) > 1
}
return duplicates
def get_stats(self) -> Dict[str, int]:
"""Get hash store statistics
Returns:
Dictionary with statistics
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
stats = {}
# Count unique file hashes
cursor.execute("SELECT COUNT(*) FROM file_hashes")
stats['unique_files'] = cursor.fetchone()[0]
# Count unique chunk hashes
cursor.execute("SELECT COUNT(*) FROM chunk_hashes")
stats['unique_chunks'] = cursor.fetchone()[0]
# Count total references
cursor.execute("SELECT COALESCE(SUM(ref_count), 0) FROM file_hashes")
stats['total_file_refs'] = cursor.fetchone()[0]
# Count total chunk references
cursor.execute("SELECT COALESCE(SUM(ref_count), 0) FROM chunk_hashes")
stats['total_chunk_refs'] = cursor.fetchone()[0]
# Calculate deduplication ratio
if stats['total_file_refs'] > 0:
stats['dedup_ratio'] = stats['unique_files'] / stats['total_file_refs']
else:
stats['dedup_ratio'] = 1.0
cursor.close()
return stats
def find_similar_files(self, checksum: str, threshold: float = 0.8) -> list[tuple[str, float]]:
"""Find files similar to given hash based on chunk overlap
Args:
checksum: File hash to compare
threshold: Similarity threshold (0.0 to 1.0)
Returns:
List of tuples (other_checksum, similarity_score)
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
# Get chunks for the target file
target_chunks = set(self.get_chunk_hashes(checksum))
if not target_chunks:
cursor.close()
return []
# Find files sharing chunks
cursor.execute("""
SELECT DISTINCT fc.file_checksum
FROM file_chunks fc
WHERE fc.chunk_hash = ANY(%s)
AND fc.file_checksum != %s
""", (list(target_chunks), checksum))
similar_files = []
for (other_checksum,) in cursor.fetchall():
other_chunks = set(self.get_chunk_hashes(other_checksum))
# Calculate Jaccard similarity
intersection = len(target_chunks & other_chunks)
union = len(target_chunks | other_chunks)
if union > 0:
similarity = intersection / union
if similarity >= threshold:
similar_files.append((other_checksum, similarity))
cursor.close()
# Sort by similarity descending
similar_files.sort(key=lambda x: x[1], reverse=True)
return similar_files
def close(self):
"""Close database connection"""
if self._connection and not self._connection.closed:
self._connection.close()
def __enter__(self):
"""Context manager entry"""
self._ensure_tables()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit"""
self.close()
class MemoryHashStore:
"""In-memory hash store for testing and small datasets"""
def __init__(self):
"""Initialize in-memory hash store"""
self.hashes: Dict[str, tuple[str, int]] = {}
self.chunks: Dict[str, int] = {}
self.file_chunks: Dict[str, list[str]] = {}
def exists(self, checksum: str) -> bool:
"""Check if hash exists"""
return checksum in self.hashes
def get_canonical(self, checksum: str) -> Optional[str]:
"""Get canonical path"""
return self.hashes.get(checksum, (None, 0))[0]
def store_canonical(
self,
checksum: str,
path: Path,
size: int,
chunk_hashes: Optional[list[str]] = None
) -> None:
"""Store canonical reference"""
self.hashes[checksum] = (str(path), size)
if chunk_hashes:
self.file_chunks[checksum] = chunk_hashes
for chunk_hash in chunk_hashes:
self.chunks[chunk_hash] = self.chunks.get(chunk_hash, 0) + 1
def get_chunk_hashes(self, checksum: str) -> list[str]:
"""Get chunk hashes"""
return self.file_chunks.get(checksum, [])
def get_stats(self) -> Dict[str, int]:
"""Get statistics"""
return {
'unique_files': len(self.hashes),
'unique_chunks': len(self.chunks),
'total_file_refs': len(self.hashes),
'total_chunk_refs': sum(self.chunks.values()),
'dedup_ratio': 1.0
}
def close(self):
"""No-op for compatibility"""
pass
def __enter__(self):
"""Context manager entry"""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit"""
pass

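A minimal usage sketch of the store above, using the in-memory variant so no PostgreSQL instance is required; the import path is assumed from this commit's package layout:

```python
from pathlib import Path

from app.deduplication.store import MemoryHashStore  # path assumed from this commit

store = MemoryHashStore()

# First occurrence of a checksum becomes the canonical copy.
store.store_canonical(
    checksum="abc123",
    path=Path("/mnt/organized/docs/report.pdf"),
    size=1_048_576,
    chunk_hashes=["c1", "c2", "c3"],
)

# A later file with the same checksum is a duplicate of the canonical path.
if store.exists("abc123"):
    print("duplicate of", store.get_canonical("abc123"))

print(store.get_chunk_hashes("abc123"))  # ['c1', 'c2', 'c3']
print(store.get_stats())                 # unique file/chunk counts
```

The PostgreSQL-backed `HashStore` exposes the same calls plus `get_duplicates()` and `find_similar_files()`; only the constructor differs (it takes a `DatabaseConfig`).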
5
app/discovery/__init__.py Normal file
View File

@@ -0,0 +1,5 @@
from .scanner import FileScanner, FilteredScanner
from .system import SystemAPI
from .engine import DiscoveryEngine
from ._protocols import FileMeta, MountInfo, DiskInfo, IFileScanner, ISystemAPI
__all__ = ['FileScanner', 'FilteredScanner', 'SystemAPI', 'DiscoveryEngine', 'FileMeta', 'MountInfo', 'DiskInfo', 'IFileScanner', 'ISystemAPI']

37
app/discovery/_protocols.py Normal file
View File

@@ -0,0 +1,37 @@
from typing import Iterator, Protocol, Any
from pathlib import Path
from dataclasses import dataclass
@dataclass
class FileMeta:
path: Path
size: int
modified_time: float
created_time: float
@dataclass
class MountInfo:
device: str
mount_point: str
fs_type: str
options: str
@dataclass
class DiskInfo:
device: str
model: str
size: int
serial: str
class IFileScanner(Protocol):
def scan(self, root: Path) -> Iterator[FileMeta]:
...
class ISystemAPI(Protocol):
def query_mounts(self) -> list[MountInfo]:
...
def query_nvmes(self) -> list[DiskInfo]:
...

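Because `IFileScanner` and `ISystemAPI` are `typing.Protocol` classes, any object with matching method signatures satisfies them structurally. A small hypothetical sketch (`InMemoryScanner` is illustrative only; the import path is assumed from the package layout above):

```python
from pathlib import Path
from typing import Iterator

# Import path assumed: app/discovery/__init__.py re-exports the protocol types.
from app.discovery import FileMeta, IFileScanner


class InMemoryScanner:
    """Hypothetical scanner for tests: yields pre-baked entries."""

    def __init__(self, entries: list[tuple[str, int]]):
        self.entries = entries

    def scan(self, root: Path) -> Iterator[FileMeta]:
        for name, size in self.entries:
            yield FileMeta(path=root / name, size=size,
                           modified_time=0.0, created_time=0.0)


# No inheritance needed: the annotation documents structural conformance.
scanner: IFileScanner = InMemoryScanner([("a.txt", 10), ("b.bin", 2048)])
for meta in scanner.scan(Path("/tmp")):
    print(meta.path, meta.size)
```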
133
app/discovery/engine.py Normal file
View File

@@ -0,0 +1,133 @@
from pathlib import Path
from typing import Optional, Callable
from datetime import datetime
import psycopg2
from psycopg2.extras import execute_batch
from .scanner import FileScanner
from .system import SystemAPI
from ._protocols import FileMeta
from ..shared.models import FileRecord, DiskInfo, ProcessingStats
from ..shared.config import DatabaseConfig
from ..shared.logger import ProgressLogger
class DiscoveryEngine:
def __init__(self, db_config: DatabaseConfig, logger: ProgressLogger, batch_size: int=1000):
self.db_config = db_config
self.logger = logger
self.batch_size = batch_size
self.system_api = SystemAPI()
self._connection = None
def _get_connection(self):
if self._connection is None or self._connection.closed:
self._connection = psycopg2.connect(host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password)
return self._connection
def _ensure_tables(self):
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute("\n CREATE TABLE IF NOT EXISTS files (\n id SERIAL PRIMARY KEY,\n path TEXT NOT NULL UNIQUE,\n size BIGINT NOT NULL,\n modified_time DOUBLE PRECISION NOT NULL,\n created_time DOUBLE PRECISION NOT NULL,\n disk_label TEXT NOT NULL,\n checksum TEXT,\n status TEXT DEFAULT 'indexed',\n category TEXT,\n duplicate_of TEXT,\n discovered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n )\n ")
cursor.execute('\n CREATE INDEX IF NOT EXISTS idx_files_path ON files(path)\n ')
cursor.execute('\n CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label)\n ')
cursor.execute('\n CREATE INDEX IF NOT EXISTS idx_files_checksum ON files(checksum)\n ')
conn.commit()
cursor.close()
def discover_path(self, root: Path, scanner: Optional[FileScanner]=None, progress_callback: Optional[Callable[[int, int, ProcessingStats], None]]=None) -> ProcessingStats:
self.logger.section(f'Discovering: {root}')
self._ensure_tables()
if scanner is None:
scanner = FileScanner(error_handler=lambda e, p: self.logger.warning(f'Error scanning {p}: {e}'))
disk = self.system_api.get_disk_for_path(root)
if disk is None:
disk = str(root)
stats = ProcessingStats()
batch = []
conn = self._get_connection()
cursor = conn.cursor()
try:
for file_meta in scanner.scan(root):
record = FileRecord(path=file_meta.path, size=file_meta.size, modified_time=file_meta.modified_time, created_time=file_meta.created_time, disk_label=disk)
batch.append(record)
stats.files_processed += 1
stats.bytes_processed += record.size
if len(batch) >= self.batch_size:
self._insert_batch(cursor, batch)
conn.commit()
batch.clear()
if progress_callback:
progress_callback(stats.files_processed, 0, stats)
if stats.files_processed % (self.batch_size * 10) == 0:
self.logger.progress(stats.files_processed, stats.files_processed, prefix='Files discovered', bytes_processed=stats.bytes_processed, elapsed_seconds=stats.elapsed_seconds)
if batch:
self._insert_batch(cursor, batch)
conn.commit()
stats.files_succeeded = stats.files_processed
except Exception as e:
conn.rollback()
self.logger.error(f'Discovery failed: {e}')
raise
finally:
cursor.close()
self.logger.info(f'Discovery complete: {stats.files_processed} files, {stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s')
return stats
def _insert_batch(self, cursor, batch: list[FileRecord]):
query = '\n INSERT INTO files (path, size, modified_time, created_time, disk_label, checksum, status, category, duplicate_of)\n VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)\n ON CONFLICT (path) DO UPDATE SET\n size = EXCLUDED.size,\n modified_time = EXCLUDED.modified_time,\n updated_at = CURRENT_TIMESTAMP\n '
data = [(str(record.path), record.size, record.modified_time, record.created_time, record.disk_label, record.checksum, record.status, record.category, record.duplicate_of) for record in batch]
execute_batch(cursor, query, data, page_size=self.batch_size)
def get_disk_info(self) -> list[DiskInfo]:
self.logger.subsection('Querying disk information')
disks = []
for disk_info in self.system_api.query_nvmes():
mount_point = None
fs_type = 'unknown'
for mount in self.system_api.query_mounts():
if mount.device == disk_info.device:
mount_point = Path(mount.mount_point)
fs_type = mount.fs_type
break
if mount_point:
total, used, free = self.system_api.get_disk_usage(mount_point)
else:
total = disk_info.size
used = 0
free = disk_info.size
disk = DiskInfo(name=disk_info.device, device=disk_info.device, mount_point=mount_point or Path('/'), total_size=total, used_size=used, free_size=free, fs_type=fs_type)
disks.append(disk)
self.logger.info(f' {disk.name}: {disk.usage_percent:.1f}% used ({disk.used_size:,} / {disk.total_size:,} bytes)')
return disks
def get_file_count(self, disk: Optional[str]=None) -> int:
conn = self._get_connection()
cursor = conn.cursor()
if disk:
cursor.execute('SELECT COUNT(*) FROM files WHERE disk_label = %s', (disk,))
else:
cursor.execute('SELECT COUNT(*) FROM files')
count = cursor.fetchone()[0]
cursor.close()
return count
def get_total_size(self, disk: Optional[str]=None) -> int:
conn = self._get_connection()
cursor = conn.cursor()
if disk:
cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files WHERE disk_label = %s', (disk,))
else:
cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files')
total = cursor.fetchone()[0]
cursor.close()
return total
def close(self):
if self._connection and (not self._connection.closed):
self._connection.close()
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()

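A hedged usage sketch of `DiscoveryEngine`. `DatabaseConfig` and `ProgressLogger` live in `app.shared` and are not shown in this excerpt, so the constructor arguments below are assumptions based on how the engine uses them:

```python
from pathlib import Path

# Assumed constructors: DatabaseConfig holding the connection fields the engine
# reads, ProgressLogger taking no arguments.
from app.discovery import DiscoveryEngine
from app.shared.config import DatabaseConfig
from app.shared.logger import ProgressLogger

db = DatabaseConfig(host="localhost", port=5432, database="disk_reorganizer_db",
                    user="disk_reorg_user", password="...")
log = ProgressLogger()

with DiscoveryEngine(db, log, batch_size=1000) as engine:
    stats = engine.discover_path(Path("/media/mike/DISK1"))
    print(f"{stats.files_processed:,} files, {stats.bytes_processed:,} bytes indexed")
    print("rows in files table:", engine.get_file_count())
```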
112
app/discovery/scanner.py Normal file
View File

@@ -0,0 +1,112 @@
import os
from pathlib import Path
from typing import Iterator, Optional, Callable
from datetime import datetime
from ._protocols import FileMeta
class FileScanner:
def __init__(self, follow_symlinks: bool=False, skip_hidden: bool=True, error_handler: Optional[Callable[[Exception, Path], None]]=None):
self.follow_symlinks = follow_symlinks
self.skip_hidden = skip_hidden
self.error_handler = error_handler
self._files_scanned = 0
self._bytes_scanned = 0
self._errors = 0
def scan(self, root: Path) -> Iterator[FileMeta]:
if not root.exists():
error = FileNotFoundError(f'Path does not exist: {root}')
if self.error_handler:
self.error_handler(error, root)
else:
raise error
return
if not root.is_dir():
try:
yield self._get_file_meta(root)
except Exception as e:
self._errors += 1
if self.error_handler:
self.error_handler(e, root)
else:
raise
return
for dirpath, dirnames, filenames in os.walk(root, followlinks=self.follow_symlinks):
current_dir = Path(dirpath)
if self.skip_hidden:
dirnames[:] = [d for d in dirnames if not d.startswith('.')]
for filename in filenames:
if self.skip_hidden and filename.startswith('.'):
continue
file_path = current_dir / filename
try:
if file_path.is_symlink() and (not file_path.exists()):
continue
meta = self._get_file_meta(file_path)
self._files_scanned += 1
self._bytes_scanned += meta.size
yield meta
except PermissionError as e:
self._errors += 1
if self.error_handler:
self.error_handler(e, file_path)
continue
except Exception as e:
self._errors += 1
if self.error_handler:
self.error_handler(e, file_path)
continue
def _get_file_meta(self, path: Path) -> FileMeta:
stat = path.stat()
created_time = stat.st_ctime
if hasattr(stat, 'st_birthtime'):
created_time = stat.st_birthtime
return FileMeta(path=path, size=stat.st_size, modified_time=stat.st_mtime, created_time=created_time)
@property
def files_scanned(self) -> int:
return self._files_scanned
@property
def bytes_scanned(self) -> int:
return self._bytes_scanned
@property
def errors(self) -> int:
return self._errors
def reset_stats(self) -> None:
self._files_scanned = 0
self._bytes_scanned = 0
self._errors = 0
class FilteredScanner(FileScanner):
def __init__(self, min_size: Optional[int]=None, max_size: Optional[int]=None, extensions: Optional[list[str]]=None, exclude_patterns: Optional[list[str]]=None, **kwargs):
super().__init__(**kwargs)
self.min_size = min_size
self.max_size = max_size
self.extensions = {ext.lower() for ext in extensions} if extensions else None
self.exclude_patterns = exclude_patterns or []
def scan(self, root: Path) -> Iterator[FileMeta]:
for meta in super().scan(root):
if self.min_size is not None and meta.size < self.min_size:
continue
if self.max_size is not None and meta.size > self.max_size:
continue
if self.extensions is not None:
if meta.path.suffix.lower() not in self.extensions:
continue
if self._should_exclude(meta.path):
continue
yield meta
def _should_exclude(self, path: Path) -> bool:
path_str = str(path)
for pattern in self.exclude_patterns:
if pattern in path_str:
return True
return False

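A short usage sketch of `FilteredScanner`, combining the size, extension, and exclude-pattern filters with a non-fatal error handler (the scan root and import layout are assumptions):

```python
from pathlib import Path

from app.discovery import FilteredScanner  # import path assumed from this commit

# Scan only sizeable source/document files, skip common build output, and log
# (rather than raise) permission errors.
scanner = FilteredScanner(
    min_size=1024,                        # ignore files under 1 KiB
    extensions=[".py", ".md", ".pdf"],
    exclude_patterns=["node_modules", "__pycache__", ".git"],
    error_handler=lambda exc, path: print(f"skip {path}: {exc}"),
)

for meta in scanner.scan(Path("/media/mike/DISK1")):
    print(meta.path, meta.size)

print(scanner.files_scanned, "files,", scanner.bytes_scanned, "bytes,",
      scanner.errors, "errors")
```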
119
app/discovery/system.py Normal file
View File

@@ -0,0 +1,119 @@
import os
import subprocess
from pathlib import Path
from typing import Optional
import psutil
from ._protocols import MountInfo, DiskInfo
class SystemAPI:
def query_mounts(self) -> list[MountInfo]:
mounts = []
for partition in psutil.disk_partitions(all=False):
mount_info = MountInfo(device=partition.device, mount_point=partition.mountpoint, fs_type=partition.fstype, options=partition.opts)
mounts.append(mount_info)
return mounts
def query_nvmes(self) -> list[DiskInfo]:
disks = []
try:
result = subprocess.run(['lsblk', '-ndo', 'NAME,MODEL,SIZE,SERIAL', '-b'], capture_output=True, text=True, check=False)
if result.returncode == 0:
for line in result.stdout.strip().split('\n'):
if not line.strip():
continue
parts = line.split(maxsplit=3)
if len(parts) >= 3:
device = f'/dev/{parts[0]}'
model = parts[1] if len(parts) > 1 else 'Unknown'
size_str = parts[2] if len(parts) > 2 else '0'
serial = parts[3] if len(parts) > 3 else 'Unknown'
try:
size = int(size_str)
except ValueError:
size = 0
disk_info = DiskInfo(device=device, model=model, size=size, serial=serial)
disks.append(disk_info)
except FileNotFoundError:
pass
if not disks:
disks = self._query_disks_fallback()
return disks
def _query_disks_fallback(self) -> list[DiskInfo]:
disks = []
seen_devices = set()
for partition in psutil.disk_partitions(all=True):
device = partition.device
if not device.startswith('/dev/'):
continue
base_device = self._get_base_device(device)
if base_device in seen_devices:
continue
seen_devices.add(base_device)
try:
usage = psutil.disk_usage(partition.mountpoint)
size = usage.total
except (PermissionError, OSError):
size = 0
disk_info = DiskInfo(device=base_device, model='Unknown', size=size, serial='Unknown')
disks.append(disk_info)
return disks
def _get_base_device(self, device: str) -> str:
if 'nvme' in device:
if 'p' in device:
return device.rsplit('p', 1)[0]
return device
import re
match = re.match('(/dev/[a-z]+)', device)
if match:
return match.group(1)
return device
def get_disk_for_path(self, path: Path) -> Optional[str]:
path = path.resolve()
best_match = None
best_match_len = 0
for partition in psutil.disk_partitions():
mount_point = Path(partition.mountpoint)
try:
if path == mount_point or mount_point in path.parents:
mount_len = len(str(mount_point))
if mount_len > best_match_len:
best_match = partition.device
best_match_len = mount_len
except (ValueError, OSError):
continue
return best_match
def get_disk_usage(self, path: Path) -> tuple[int, int, int]:
try:
usage = psutil.disk_usage(str(path))
return (usage.total, usage.used, usage.free)
except (PermissionError, OSError):
return (0, 0, 0)
def get_mount_point(self, path: Path) -> Optional[Path]:
path = path.resolve()
best_match = None
best_match_len = 0
for partition in psutil.disk_partitions():
mount_point = Path(partition.mountpoint)
try:
if path == mount_point or mount_point in path.parents:
mount_len = len(str(mount_point))
if mount_len > best_match_len:
best_match = mount_point
best_match_len = mount_len
except (ValueError, OSError):
continue
return best_match
def is_same_filesystem(self, path1: Path, path2: Path) -> bool:
try:
stat1 = path1.stat()
stat2 = path2.stat()
return stat1.st_dev == stat2.st_dev
except (OSError, PermissionError):
return False

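A quick sketch of how `SystemAPI` might be exercised on a Linux host (import path assumed; disk enumeration uses `lsblk` when present and falls back to psutil otherwise):

```python
from pathlib import Path

from app.discovery import SystemAPI  # import path assumed from this commit

api = SystemAPI()

for mount in api.query_mounts():
    print(f"{mount.device} on {mount.mount_point} ({mount.fs_type})")

for disk in api.query_nvmes():  # lsblk output, psutil fallback
    print(disk.device, disk.model, disk.size, disk.serial)

root = Path("/media/mike/DISK1")  # example mount point used elsewhere in this commit
print("device:", api.get_disk_for_path(root))
print("total/used/free:", api.get_disk_usage(root))
print("same fs as /tmp:", api.is_same_filesystem(root, Path("/tmp")))
```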
59
app/enrichment/enricher.py Normal file
View File

@@ -0,0 +1,59 @@
from typing import Dict
import re
class ContentEnricher:
def __init__(self, llm_client=None):
self.llm_client = llm_client
self.pii_patterns = {
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
'ssn': r'\b\d{3}-\d{2}-\d{4}\b',
'credit_card': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b'
}
def enrich(self, text: str, use_llm: bool = False) -> Dict:
enrichment = {
'summary': self._basic_summary(text),
'word_count': len(text.split()),
'has_pii': self._detect_pii(text),
'quality': self._assess_quality(text),
'topics': self._extract_basic_topics(text)
}
if use_llm and self.llm_client:
llm_result = self.llm_client.classify_content(text)
if llm_result.get('success'):
enrichment['llm_classification'] = llm_result['text']
return enrichment
def _basic_summary(self, text: str) -> str:
sentences = re.split(r'[.!?]+', text)
return ' '.join(sentences[:3])[:200]
def _detect_pii(self, text: str) -> Dict:
detected = {}
for pii_type, pattern in self.pii_patterns.items():
matches = re.findall(pattern, text)
if matches:
detected[pii_type] = len(matches)
return detected
def _assess_quality(self, text: str) -> str:
if len(text.strip()) < 10:
return 'low'
special_char_ratio = sum(1 for c in text if not c.isalnum() and not c.isspace()) / len(text)
if special_char_ratio > 0.3:
return 'low'
return 'high' if len(text.split()) > 50 else 'medium'
def _extract_basic_topics(self, text: str) -> list:
words = re.findall(r'\b[A-Z][a-z]+\b', text)
word_freq = {}
for word in words:
if len(word) > 3:
word_freq[word] = word_freq.get(word, 0) + 1
return sorted(word_freq, key=word_freq.get, reverse=True)[:10]

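A minimal sketch of the rule-based enrichment path (`use_llm=False`, so no LLM client is needed); the import mirrors how `app/main.py` loads the class:

```python
# Import mirrors app/main.py ("from enrichment.enricher import ContentEnricher");
# adjust sys.path or use the app.* package prefix depending on where you run it.
from enrichment.enricher import ContentEnricher

enricher = ContentEnricher()
sample = (
    "Project Defrag migrates backup data across NVMe drives. "
    "Contact admin@example.com for access. "
    "The Discovery Engine feeds the Classification Engine before migration."
)

result = enricher.enrich(sample, use_llm=False)
print(result["summary"])     # first sentences, capped at 200 characters
print(result["word_count"])  # whitespace-split word count
print(result["has_pii"])     # {'email': 1} for the address above
print(result["quality"])     # 'low' / 'medium' / 'high' heuristic
print(result["topics"])      # capitalized words ranked by frequency
```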
View File

@@ -0,0 +1,54 @@
import requests
import json
from typing import Dict, Optional
class LLMClient:
def __init__(self, endpoint: str = 'http://192.168.1.74:1234', model: str = 'local'):
self.endpoint = endpoint
self.model = model
self.local_ollama = 'http://localhost:11434'
def summarize(self, text: str, max_length: int = 200) -> Dict:
prompt = f"Summarize the following in {max_length} chars or less:\n\n{text[:2000]}"
return self._query(prompt)
def extract_topics(self, text: str) -> Dict:
prompt = f"Extract 5-10 key topics/tags from this text. Return as comma-separated list:\n\n{text[:2000]}"
return self._query(prompt)
def classify_content(self, text: str) -> Dict:
prompt = f"Classify this content. Return: category, topics, has_pii (yes/no), quality (high/medium/low):\n\n{text[:1000]}"
return self._query(prompt)
def _query(self, prompt: str, use_local: bool = False) -> Dict:
try:
endpoint = self.local_ollama if use_local else self.endpoint
if use_local:
response = requests.post(
f'{endpoint}/api/generate',
json={'model': 'llama3.2', 'prompt': prompt, 'stream': False},
timeout=30
)
else:
response = requests.post(
f'{endpoint}/v1/chat/completions',
json={
'model': self.model,
'messages': [{'role': 'user', 'content': prompt}],
'max_tokens': 500
},
timeout=30
)
if response.status_code == 200:
data = response.json()
if use_local:
return {'success': True, 'text': data.get('response', '')}
else:
return {'success': True, 'text': data['choices'][0]['message']['content']}
else:
return {'success': False, 'error': f'HTTP {response.status_code}'}
except Exception as e:
return {'success': False, 'error': str(e)}

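A hedged usage sketch of `LLMClient` as defined above. The default endpoint is LAN-specific, so point it at whichever OpenAI-compatible server you actually run; every call returns a plain dict with a `success` flag:

```python
# LLMClient is defined directly above; 'http://192.168.1.74:1234' is a
# LAN-specific default, so substitute your own OpenAI-compatible endpoint.
client = LLMClient(endpoint="http://192.168.1.74:1234", model="local")

result = client.summarize("Project Defrag reorganizes 20TB of backup data "
                          "across NVMe drives with deduplication.")
if result["success"]:
    print(result["text"])
else:
    print("LLM unavailable:", result["error"])

# classify_content and extract_topics follow the same request/response shape.
print(client.classify_content("Quarterly invoice for ACME, total 4,200 EUR."))
```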
3
app/filters/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
from .gitignore import GitignoreFilter, DEFAULT_PATTERNS
__all__ = ['GitignoreFilter', 'DEFAULT_PATTERNS']

30
app/filters/gitignore.py Normal file
View File

@@ -0,0 +1,30 @@
from pathlib import Path
from typing import Optional, Set
import fnmatch
DEFAULT_PATTERNS = {
'node_modules/**', '__pycache__/**', '.git/**', 'build/**', 'dist/**',
'.cache/**', 'target/**', 'vendor/**', '.venv/**', 'venv/**',
'*.pyc', '*.pyo', '*.so', '*.dll', '*.dylib', '*.o', '*.a',
'.DS_Store', 'Thumbs.db', '.pytest_cache/**', '.tox/**',
'*.egg-info/**', '.mypy_cache/**', '.coverage', 'htmlcov/**',
'.gradle/**', 'bin/**', 'obj/**', '.vs/**', '.idea/**'
}
class GitignoreFilter:
    def __init__(self, patterns: Optional[Set[str]] = None):
self.patterns = patterns or DEFAULT_PATTERNS
def should_exclude(self, path: str) -> bool:
path_obj = Path(path)
for pattern in self.patterns:
if '**' in pattern:
clean_pattern = pattern.replace('/**', '').replace('**/', '')
if clean_pattern in path_obj.parts:
return True
elif fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(path_obj.name, pattern):
return True
return False
def filter_files(self, files: list) -> list:
return [f for f in files if not self.should_exclude(f)]

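A short sketch of `GitignoreFilter` with its default patterns; the import mirrors `app/main.py`, which adds the app directory to `sys.path` before importing:

```python
from filters import GitignoreFilter  # app/main.py imports it the same way

gi = GitignoreFilter()  # uses DEFAULT_PATTERNS

print(gi.should_exclude("project/node_modules/react/index.js"))  # True (node_modules dir)
print(gi.should_exclude("src/app.pyc"))                          # True (*.pyc)
print(gi.should_exclude("docs/ARCHITECTURE.md"))                 # False

candidates = ["a/__pycache__/mod.cpython-311.pyc", "src/main.py", ".DS_Store"]
print(gi.filter_files(candidates))  # ['src/main.py']
```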
918
app/main.py Normal file
View File

@@ -0,0 +1,918 @@
import os
import sys
from dataclasses import dataclass
import psycopg2
import shutil
import hashlib
import argparse
import json
from pathlib import Path
from typing import List, Dict, Optional
from datetime import datetime
import logging
import time
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.FileHandler('disk_reorganizer.log'), logging.StreamHandler(sys.stdout)])
logger = logging.getLogger(__name__)
@dataclass
class FileRecord:
path: str
size: int
modified_time: float
disk_label: str
checksum: Optional[str] = None
status: str = 'indexed'
class DiskReorganizer:
def __init__(self, db_config: Dict=None):
if db_config is None:
db_config = {'host': os.getenv('DB_HOST', '192.168.1.159'), 'port': int(os.getenv('DB_PORT', 5432)), 'database': os.getenv('DB_NAME', 'disk_reorganizer_db'), 'user': os.getenv('DB_USER', 'disk_reorg_user'), 'password': os.getenv('DB_PASSWORD', 'heel-goed-wachtwoord')}
self.db_config = db_config
self.init_database()
def get_connection(self):
return psycopg2.connect(**self.db_config)
def init_database(self):
try:
conn = self.get_connection()
cursor = conn.cursor()
cursor.execute("\n SELECT table_name FROM information_schema.tables\n WHERE table_schema = 'public' AND table_name IN ('files', 'operations')\n ")
tables = cursor.fetchall()
if len(tables) < 2:
logger.error('Database tables not found! Please run setup_database.sh first.')
raise Exception('Database not properly initialized. Run setup_database.sh')
cursor.close()
conn.close()
logger.info('Database connection verified successfully')
except psycopg2.Error as e:
logger.error(f'Database connection failed: {e}')
raise
def index_disk(self, disk_root: str, disk_name: str):
logger.info(f'Indexing disk: {disk_name} at {disk_root}')
disk_path = Path(disk_root)
if not disk_path.exists():
logger.error(f'Disk path {disk_root} does not exist!')
return
files_count = 0
total_size = 0
start_time = time.time()
conn = self.get_connection()
cursor = conn.cursor()
try:
for root, dirs, files in os.walk(disk_path):
dirs[:] = [d for d in dirs if not d.startswith(('$', 'System Volume Information', 'Recovery'))]
for file in files:
try:
file_path = Path(root) / file
if not file_path.is_file():
continue
stat = file_path.stat()
size = stat.st_size
mtime = datetime.fromtimestamp(stat.st_mtime)
rel_path = str(file_path.relative_to(disk_path))
cursor.execute('\n INSERT INTO files (path, size, modified_time, disk_label, checksum, status)\n VALUES (%s, %s, %s, %s, %s, %s)\n ON CONFLICT (path) DO UPDATE SET\n size = EXCLUDED.size,\n modified_time = EXCLUDED.modified_time,\n disk_label = EXCLUDED.disk_label,\n status = EXCLUDED.status\n ', (rel_path, size, mtime, disk_name, None, 'indexed'))
files_count += 1
total_size += size
if files_count % 100 == 0:
elapsed = time.time() - start_time
rate = files_count / elapsed if elapsed > 0 else 0
display_path = str(file_path)
if len(display_path) > 60:
display_path = '...' + display_path[-57:]
print(f'\rIndexing: {files_count:,} files | {self.format_size(total_size)} | {rate:.0f} files/s | {display_path}', end='', flush=True)
if files_count % 1000 == 0:
conn.commit()
except Exception as e:
conn.rollback()
logger.warning(f'\nSkipping {file_path}: {e}')
continue
conn.commit()
print()
logger.info(f'Completed indexing {disk_name}: {files_count} files, {self.format_size(total_size)}')
finally:
cursor.close()
conn.close()
def calculate_disk_usage(self) -> Dict[str, Dict]:
conn = self.get_connection()
cursor = conn.cursor()
try:
cursor.execute('\n SELECT disk_label, SUM(size) as total_size, COUNT(*) as file_count\n FROM files\n GROUP BY disk_label\n ')
usage = {}
for row in cursor.fetchall():
disk = row[0]
size = int(row[1] or 0)
count = int(row[2])
usage[disk] = {'size': size, 'count': count, 'formatted_size': self.format_size(size)}
return usage
finally:
cursor.close()
conn.close()
def plan_migration(self, target_disk: str, destination_disks: List[str]) -> Dict:
logger.info(f'Planning migration to free up {target_disk}')
usage = self.calculate_disk_usage()
if target_disk not in usage:
logger.error(f'Target disk {target_disk} not found in index!')
return {}
conn = self.get_connection()
cursor = conn.cursor()
cursor.execute('SELECT path, size, modified_time FROM files WHERE disk_label = %s ORDER BY size DESC', (target_disk,))
files_to_move = cursor.fetchall()
cursor.close()
conn.close()
target_disk_usage = usage[target_disk]['size']
logger.info(f'Need to move {len(files_to_move)} files, {self.format_size(target_disk_usage)}')
dest_availability = []
for disk in destination_disks:
if disk not in usage:
available = float('inf')
else:
available = float('inf')
dest_availability.append({'disk': disk, 'available': available, 'planned_usage': 0})
plan = {'target_disk': target_disk, 'total_size': target_disk_usage, 'file_count': len(files_to_move), 'operations': [], 'destination_disks': destination_disks}
conn = self.get_connection()
cursor = conn.cursor()
try:
for file_info in files_to_move:
rel_path, size, mtime = file_info
dest_disk = destination_disks[len(plan['operations']) % len(destination_disks)]
op = {'source_disk': target_disk, 'source_path': rel_path, 'dest_disk': dest_disk, 'target_path': rel_path, 'size': int(size)}
plan['operations'].append(op)
cursor.execute('INSERT INTO operations (source_path, target_path, operation_type, status) VALUES (%s, %s, %s, %s)', (f'{target_disk}:{rel_path}', f'{dest_disk}:{rel_path}', 'move', 'pending'))
conn.commit()
finally:
cursor.close()
conn.close()
plan_file = f"migration_plan_{target_disk}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(plan_file, 'w') as f:
json.dump(plan, f, indent=2)
logger.info(f"Plan created with {len(plan['operations'])} operations")
logger.info(f'Plan saved to {plan_file}')
return plan
def verify_operation(self, source: Path, dest: Path) -> bool:
if not dest.exists():
return False
try:
source_stat = source.stat()
dest_stat = dest.stat()
if source_stat.st_size != dest_stat.st_size:
return False
return True
except Exception as e:
logger.error(f'Verification error: {e}')
return False
@staticmethod
def file_checksum(path: Path) -> str:
hash_md5 = hashlib.md5()
with open(path, 'rb') as f:
for chunk in iter(lambda: f.read(4096), b''):
hash_md5.update(chunk)
return hash_md5.hexdigest()
def execute_migration(self, plan_file: str, dry_run: bool=True):
logger.info(f"{('DRY RUN' if dry_run else 'EXECUTING')} migration from {plan_file}")
with open(plan_file, 'r') as f:
plan = json.load(f)
operations = plan['operations']
logger.info(f'Processing {len(operations)} operations...')
success_count = 0
error_count = 0
start_time = time.time()
conn = self.get_connection()
cursor = conn.cursor()
try:
for i, op in enumerate(operations, 1):
source_disk = op['source_disk']
source_path = op['source_path']
dest_disk = op['dest_disk']
target_path = op['target_path']
source_full = Path(source_disk) / source_path
dest_full = Path(dest_disk) / target_path
elapsed = time.time() - start_time
rate = i / elapsed if elapsed > 0 else 0
eta = (len(operations) - i) / rate if rate > 0 else 0
display_path = str(source_path)
if len(display_path) > 50:
display_path = '...' + display_path[-47:]
print(f'\r[{i}/{len(operations)}] {success_count} OK, {error_count} ERR | {rate:.1f} files/s | ETA: {int(eta)}s | {display_path}', end='', flush=True)
if dry_run:
if source_full.exists():
success_count += 1
else:
logger.warning(f'\n Source does not exist: {source_full}')
error_count += 1
continue
try:
dest_full.parent.mkdir(parents=True, exist_ok=True)
if source_full.exists():
shutil.copy2(source_full, dest_full)
if self.verify_operation(source_full, dest_full):
cursor.execute("UPDATE files SET disk_label = %s, status = 'moved' WHERE path = %s AND disk_label = %s", (dest_disk, source_path, source_disk))
cursor.execute('UPDATE operations SET executed = 1, executed_at = CURRENT_TIMESTAMP WHERE source_path = %s', (f'{source_disk}:{source_path}',))
success_count += 1
else:
raise Exception('Verification failed')
else:
logger.warning(f'\n Source missing: {source_full}')
error_count += 1
except Exception as e:
logger.error(f'\n Error processing {source_path}: {e}')
cursor.execute('UPDATE operations SET error = %s WHERE source_path = %s', (str(e), f'{source_disk}:{source_path}'))
error_count += 1
if i % 10 == 0:
conn.commit()
conn.commit()
print()
finally:
cursor.close()
conn.close()
logger.info(f'Migration complete: {success_count} success, {error_count} errors')
if not dry_run and error_count == 0:
logger.info(f"✓ Disk {plan['target_disk']} is ready for Linux installation!")
logger.info(f" Remember to safely delete original files from {plan['target_disk']}")
def run_deduplication(self, disk: Optional[str]=None, use_chunks: bool=True):
logger.info(f"Starting deduplication{(' for disk ' + disk if disk else '')}")
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
conn = self.get_connection()
cursor = conn.cursor()
def hash_file_local(file_path: Path) -> str:
hasher = hashlib.sha256()
with open(file_path, 'rb') as f:
while (chunk := f.read(65536)):
hasher.update(chunk)
return hasher.hexdigest()
try:
if disk:
cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s AND checksum IS NULL ORDER BY size DESC', (disk,))
else:
cursor.execute('SELECT path, size, disk_label FROM files WHERE checksum IS NULL ORDER BY size DESC')
files_to_process = cursor.fetchall()
total = len(files_to_process)
logger.info(f'Found {total} files to hash')
processed = 0
skipped = 0
start_time = time.time()
batch = []
print(f'Phase 1: Computing checksums...')
for idx, (path_str, size, disk_label) in enumerate(files_to_process, 1):
try:
mount_point = disk_mount_map.get(disk_label, disk_label)
full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)
if not full_path.exists():
skipped += 1
if idx % 100 == 0:
elapsed = time.time() - start_time
rate = (processed + skipped) / elapsed if elapsed > 0 else 0
remaining = (total - idx) / rate if rate > 0 else 0
pct = 100 * idx / total
print(f'\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining / 60)}m{int(remaining % 60):02d}s | Skip: {skipped:,}', end='', flush=True)
continue
checksum = hash_file_local(full_path)
batch.append((checksum, path_str))
processed += 1
if len(batch) >= 1000:
try:
cursor.executemany('UPDATE files SET checksum = %s WHERE path = %s', batch)
conn.commit()
batch.clear()
except Exception as e:
conn.rollback()
batch.clear()
print(f'\nBatch update failed: {e}')
if idx % 100 == 0:
elapsed = time.time() - start_time
rate = (processed + skipped) / elapsed if elapsed > 0 else 0
remaining = (total - idx) / rate if rate > 0 else 0
pct = 100 * idx / total
print(f'\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining / 60)}m{int(remaining % 60):02d}s | Skip: {skipped:,}', end='', flush=True)
except Exception as e:
skipped += 1
if idx <= 5:
print(f'\nDebug: {full_path} - {e}')
if batch:
try:
cursor.executemany('UPDATE files SET checksum = %s WHERE path = %s', batch)
conn.commit()
except Exception as e:
conn.rollback()
print(f'\nFinal batch failed: {e}')
print()
elapsed = time.time() - start_time
logger.info(f'Phase 1 done: {processed:,} files in {int(elapsed / 60)}m{int(elapsed % 60):02d}s ({skipped:,} skipped)')
print('Phase 2: Finding duplicates...')
cursor.execute('\n UPDATE files f1 SET duplicate_of = (\n SELECT MIN(path) FROM files f2\n WHERE f2.checksum = f1.checksum AND f2.path < f1.path\n )\n WHERE checksum IS NOT NULL\n ')
conn.commit()
cursor.execute('SELECT COUNT(*) FROM files WHERE duplicate_of IS NOT NULL')
dup_count = cursor.fetchone()[0]
logger.info(f'Phase 2 done: Found {dup_count:,} duplicates')
finally:
cursor.close()
conn.close()
def plan_merge(self, sources: List[str], target: str, output_file: str, filter_system: bool=False, network_target: str=None):
logger.info(f"Planning merge: {', '.join(sources)}{target or network_target}")
if filter_system:
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from filters import GitignoreFilter
file_filter = GitignoreFilter()
logger.info('System/build file filtering enabled')
conn = self.get_connection()
cursor = conn.cursor()
try:
placeholders = ','.join(['%s'] * len(sources))
cursor.execute(f'\n SELECT path, size, checksum, disk_label, duplicate_of\n FROM files\n WHERE disk_label IN ({placeholders})\n ORDER BY size DESC\n ', tuple(sources))
files = cursor.fetchall()
total_files = len(files)
total_size = sum((int(f[1]) for f in files))
unique_files = {}
duplicate_count = 0
duplicate_size = 0
filtered_count = 0
filtered_size = 0
for path, size, checksum, disk_label, duplicate_of in files:
if filter_system and file_filter.should_exclude(path):
filtered_count += 1
filtered_size += int(size)
continue
if checksum and checksum in unique_files:
duplicate_count += 1
duplicate_size += int(size)
elif checksum:
unique_files[checksum] = (path, int(size), disk_label)
unique_count = len(unique_files)
unique_size = sum((f[1] for f in unique_files.values()))
plan = {'sources': sources, 'target': target or network_target, 'network': network_target is not None, 'total_files': total_files, 'total_size': total_size, 'unique_files': unique_count, 'unique_size': unique_size, 'duplicate_files': duplicate_count, 'duplicate_size': duplicate_size, 'filtered_files': filtered_count if filter_system else 0, 'filtered_size': filtered_size if filter_system else 0, 'space_saved': duplicate_size + (filtered_size if filter_system else 0), 'operations': []}
for checksum, (path, size, disk_label) in unique_files.items():
plan['operations'].append({'source_disk': disk_label, 'source_path': path, 'target_disk': target or network_target, 'target_path': path, 'size': size, 'checksum': checksum})
with open(output_file, 'w') as f:
json.dump(plan, f, indent=2)
logger.info(f'Merge plan saved to {output_file}')
print(f'\n=== MERGE PLAN SUMMARY ===')
print(f"Sources: {', '.join(sources)}")
print(f'Target: {target or network_target}')
print(f'Total files: {total_files:,} ({self.format_size(total_size)})')
if filter_system:
print(f'Filtered (system/build): {filtered_count:,} ({self.format_size(filtered_size)})')
print(f'Unique files: {unique_count:,} ({self.format_size(unique_size)})')
print(f'Duplicates: {duplicate_count:,} ({self.format_size(duplicate_size)})')
print(f"Total space saved: {self.format_size(plan['space_saved'])}")
print(f'Space needed on target: {self.format_size(unique_size)}')
finally:
cursor.close()
conn.close()
def generate_report(self, format='text', show_duplicates=False, preview_merge=None):
conn = self.get_connection()
cursor = conn.cursor()
try:
if preview_merge:
with open(preview_merge, 'r') as f:
plan = json.load(f)
print('\n=== MERGE PLAN PREVIEW ===')
print(f"Sources: {', '.join(plan['sources'])}")
print(f"Target: {plan['target']}")
print(f"Total files: {plan['total_files']:,} ({self.format_size(plan['total_size'])})")
print(f"Unique files: {plan['unique_files']:,} ({self.format_size(plan['unique_size'])})")
print(f"Duplicates: {plan['duplicate_files']:,} ({self.format_size(plan['duplicate_size'])})")
print(f"Space saved: {self.format_size(plan['space_saved'])}")
print(f"Space needed on target: {self.format_size(plan['unique_size'])}")
return
cursor.execute('\n SELECT status, COUNT(*), SUM(size) FROM files GROUP BY status\n ')
print('\n=== FILE MIGRATION REPORT ===')
for row in cursor.fetchall():
status, count, size = row
print(f'{status:15}: {count:6} files, {self.format_size(int(size or 0))}')
cursor.execute('\n SELECT disk_label, COUNT(*), SUM(size) FROM files GROUP BY disk_label\n ')
print('\n=== DISK USAGE ===')
for row in cursor.fetchall():
disk, count, size = row
print(f'{disk:20}: {count:6} files, {self.format_size(int(size or 0))}')
cursor.execute('\n SELECT COUNT(*), SUM(size) FROM files WHERE checksum IS NOT NULL\n ')
hashed_count, hashed_size = cursor.fetchone()
cursor.execute('\n SELECT COUNT(*), SUM(size) FROM files WHERE duplicate_of IS NOT NULL\n ')
dup_count, dup_size = cursor.fetchone()
print('\n=== DEDUPLICATION STATS ===')
print(f'Files with checksums: {hashed_count or 0:6}')
print(f'Duplicate files: {dup_count or 0:6} ({self.format_size(int(dup_size or 0))})')
if show_duplicates and dup_count:
print('\n=== DUPLICATE FILES ===')
cursor.execute('\n SELECT path, size, duplicate_of FROM files\n WHERE duplicate_of IS NOT NULL\n ORDER BY size DESC\n LIMIT 20\n ')
for path, size, dup_of in cursor.fetchall():
print(f' {path} ({self.format_size(int(size))}) → {dup_of}')
cursor.execute('\n SELECT operation_type, executed, verified, COUNT(*) FROM operations GROUP BY operation_type, executed, verified\n ')
print('\n=== OPERATIONS REPORT ===')
for row in cursor.fetchall():
op_type, executed, verified, count = row
status = 'EXECUTED' if executed else 'PENDING'
if verified:
status += '+VERIFIED'
print(f'{op_type:10} {status:15}: {count} operations')
finally:
cursor.close()
conn.close()
def profile_content(self, disk: Optional[str]=None, update_db: bool=False, limit: Optional[int]=None):
from content.profiler import ContentProfiler
profiler = ContentProfiler()
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
conn = self.get_connection()
cursor = conn.cursor()
try:
query = 'SELECT path, size, disk_label FROM files WHERE 1=1'
params = []
if disk:
query += ' AND disk_label = %s'
params.append(disk)
if limit:
query += f' LIMIT {limit}'
cursor.execute(query, params)
files = cursor.fetchall()
total = len(files)
logger.info(f'Profiling {total:,} files...')
kind_stats = {}
processable = 0
batch = []
for idx, (path, size, disk_label) in enumerate(files, 1):
mount_point = disk_mount_map.get(disk_label, disk_label)
full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
if not full_path.exists():
continue
profile = profiler.profile_file(full_path)
if 'error' not in profile:
kind = profile['kind']
if kind not in kind_stats:
kind_stats[kind] = {'count': 0, 'processable': 0}
kind_stats[kind]['count'] += 1
if profile['processable']:
kind_stats[kind]['processable'] += 1
processable += 1
if update_db:
profile_json = json.dumps(profile)
batch.append((kind, profile_json, path))
if len(batch) >= 500:
cursor.executemany("UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s", [(pj, p) for k, pj, p in batch])
conn.commit()
batch.clear()
if idx % 100 == 0:
print(f'\rProfiled: {idx:,}/{total:,}', end='', flush=True)
if update_db and batch:
cursor.executemany("UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s", [(pj, p) for k, pj, p in batch])
conn.commit()
print()
print(f'\n=== CONTENT PROFILE SUMMARY ===')
print(f'Total files: {total:,}')
print(f'Processable: {processable:,}\n')
print(f"{'Kind':<15} {'Total':<10} {'Processable':<12} {'Extractor'}")
print('-' * 60)
for kind in sorted(kind_stats.keys()):
stats = kind_stats[kind]
extractor = profiler._suggest_extractor(kind, '')
print(f"{kind:<15} {stats['count']:<10,} {stats['processable']:<12,} {extractor or 'none'}")
finally:
cursor.close()
conn.close()
def extract_content(self, kind: Optional[str]=None, limit: int=10):
from content.profiler import ContentProfiler
from content.extractors import ContentExtractor
profiler = ContentProfiler()
extractor = ContentExtractor()
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
conn = self.get_connection()
cursor = conn.cursor()
try:
query = "SELECT path, size, disk_label, metadata FROM files WHERE metadata->'profile'->>'processable' = 'true'"
params = []
if kind:
query += " AND metadata->'profile'->>'kind' = %s"
params.append(kind)
query += f' LIMIT {limit}'
cursor.execute(query, params)
files = cursor.fetchall()
print(f'\n=== EXTRACTING CONTENT ===')
print(f'Processing {len(files)} files\n')
for path, size, disk_label, metadata in files:
mount_point = disk_mount_map.get(disk_label, disk_label)
full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
if not full_path.exists():
continue
profile = metadata.get('profile', {}) if metadata else {}
extractor_type = profile.get('extractor')
if not extractor_type:
continue
print(f'Extracting: {path}')
print(f" Type: {profile.get('kind')} | Extractor: {extractor_type}")
result = extractor.extract(full_path, extractor_type)
if 'text' in result:
preview = result['text'][:200]
print(f' Preview: {preview}...')
elif 'pipeline' in result:
print(f" Pipeline: {''.join(result['pipeline'])}")
print(f" Status: {result.get('status', 'pending')}")
print()
finally:
cursor.close()
conn.close()
def parse_files(self, kind: Optional[str] = None, limit: int = 100, update_db: bool = False):
from parsers.text_parser import TextParser
from parsers.code_parser import CodeParser
from parsers.pdf_parser import PDFParser
parsers = {'text': TextParser(), 'code': CodeParser(), 'pdf': PDFParser()}
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
conn = self.get_connection()
cursor = conn.cursor()
try:
query = "SELECT path, size, disk_label FROM files WHERE 1=1"
params = []
if kind:
suffix_map = {'text': "('.txt', '.md', '.log', '.json')", 'code': "('.py', '.js', '.java', '.go')", 'pdf': "('.pdf',)"}
if kind in suffix_map:
query += f" AND RIGHT(path, 4) IN {suffix_map[kind]} OR RIGHT(path, 3) IN {suffix_map[kind]}"
query += f" LIMIT {limit}"
cursor.execute(query, params)
files = cursor.fetchall()
print(f"\n=== PARSING FILES ===\nProcessing {len(files)} files\n")
parsed_count = 0
for path, size, disk_label in files:
mount_point = disk_mount_map.get(disk_label, disk_label)
full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
if not full_path.exists() or int(size) > 10 * 1024 * 1024:
continue
file_kind = 'pdf' if path.endswith('.pdf') else 'code' if any(path.endswith(e) for e in ['.py', '.js', '.java']) else 'text'
parser = parsers.get(file_kind)
if not parser:
continue
result = parser.parse(full_path)
if 'error' not in result:
text = result.get('text', '')
quality = result.get('quality', 'unknown')
print(f"{path[:60]} | {file_kind} | {len(text):,} chars")
if update_db and text:
cursor.execute("UPDATE files SET extracted_text = %s, text_quality = %s WHERE path = %s", (text[:50000], quality, path))
parsed_count += 1
if parsed_count % 10 == 0:
conn.commit()
if update_db:
conn.commit()
print(f"\nParsed {parsed_count} files")
finally:
cursor.close()
conn.close()
def enrich_files(self, limit: int = 10, llm_endpoint: str = None, use_local: bool = False):
from enrichment.enricher import ContentEnricher
enricher = ContentEnricher()
conn = self.get_connection()
cursor = conn.cursor()
try:
cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL LIMIT {limit}")
files = cursor.fetchall()
print(f"\n=== ENRICHING CONTENT ===\nProcessing {len(files)} files\n")
for path, text in files:
enrichment = enricher.enrich(text[:5000], use_llm=False)
print(f"{path[:60]}")
print(f" Quality: {enrichment.get('quality')} | Words: {enrichment.get('word_count'):,}")
print(f" PII: {list(enrichment.get('has_pii', {}).keys())}")
print(f" Topics: {', '.join(enrichment.get('topics', [])[:5])}\n")
cursor.execute("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", (json.dumps(enrichment), path))
conn.commit()
print(f"Enriched {len(files)} files")
finally:
cursor.close()
conn.close()
def classify_files(self, disk: Optional[str]=None, update_db: bool=False, resume: bool=True):
from classification.classifier import FileClassifier
classifier = FileClassifier()
conn = self.get_connection()
cursor = conn.cursor()
try:
task_name = f"classify_{disk or 'all'}"
skip_count = 0
if resume and update_db:
cursor.execute('SELECT last_processed_path, processed_count FROM processing_checkpoints WHERE task_name = %s', (task_name,))
checkpoint = cursor.fetchone()
if checkpoint:
last_path, skip_count = checkpoint
logger.info(f'Resuming from checkpoint: {skip_count:,} files already processed')
if disk:
cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s ORDER BY path', (disk,))
else:
cursor.execute('SELECT path, size, disk_label FROM files ORDER BY path')
files = cursor.fetchall()
total = len(files)
logger.info(f'Classifying {total:,} files...')
categories = {}
build_artifacts = 0
batch = []
processed = 0
for idx, (path, size, disk_label) in enumerate(files, 1):
if idx <= skip_count:
continue
labels, category, is_build = classifier.classify_path(path, int(size))
if is_build:
build_artifacts += 1
if category not in categories:
categories[category] = {'count': 0, 'size': 0}
categories[category]['count'] += 1
categories[category]['size'] += int(size)
if update_db:
labels_str = ','.join(labels)
batch.append((category, labels_str, path))
if len(batch) >= 1000:
cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch])
cursor.execute('''
INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at)
VALUES (%s, %s, %s, CURRENT_TIMESTAMP)
ON CONFLICT (task_name) DO UPDATE SET
last_processed_path = EXCLUDED.last_processed_path,
processed_count = EXCLUDED.processed_count,
updated_at = CURRENT_TIMESTAMP
''', (task_name, path, idx))
conn.commit()
batch.clear()
processed += 1
if idx % 1000 == 0:
print(f'\rClassified: {idx:,}/{total:,} ({100*idx/total:.1f}%)', end='', flush=True)
if update_db and batch:
cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch])
cursor.execute('''
INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at)
VALUES (%s, %s, %s, CURRENT_TIMESTAMP)
ON CONFLICT (task_name) DO UPDATE SET
last_processed_path = EXCLUDED.last_processed_path,
processed_count = EXCLUDED.processed_count,
updated_at = CURRENT_TIMESTAMP
''', (task_name, files[-1][0] if files else '', total))
conn.commit()
print()
print(f'\n=== CLASSIFICATION SUMMARY ===')
print(f'Total files: {total:,}')
print(f'Build artifacts: {build_artifacts:,}')
print(f'\nCategories:')
for category in sorted(categories.keys()):
info = categories[category]
print(f" {category:30}: {info['count']:8,} files, {self.format_size(info['size'])}")
finally:
cursor.close()
conn.close()
def analyze_folders(self, disk: Optional[str]=None, min_files: int=3):
from analysis.folder_analyzer import FolderAnalyzer
analyzer = FolderAnalyzer()
conn = self.get_connection()
cursor = conn.cursor()
try:
query = '''
SELECT DISTINCT SUBSTRING(path FROM 1 FOR POSITION('/' IN path || '/') - 1) as folder, disk_label
FROM files
WHERE 1=1
'''
params = []
if disk:
query += ' AND disk_label = %s'
params.append(disk)
cursor.execute(query, params)
potential_folders = cursor.fetchall()
logger.info(f'Found {len(potential_folders)} potential folders to analyze')
processed = 0
for folder_name, disk_label in potential_folders:
cursor.execute('''
SELECT path, size FROM files
WHERE disk_label = %s AND path LIKE %s
''', (disk_label, f'{folder_name}%'))
files = cursor.fetchall()
if len(files) < min_files:
continue
files_list = [{'path': f[0], 'size': int(f[1])} for f in files]
folder_path = Path(folder_name)
analysis = analyzer.analyze_folder(folder_path, files_list)
readme_text = None
for file_dict in files_list:
if 'readme' in file_dict['path'].lower():
readme_text = f"Found README at {file_dict['path']}"
break
summary = analyzer.generate_summary(analysis, readme_text)
cursor.execute('''
INSERT INTO folders (path, disk_label, file_count, total_size, project_type, intent, summary,
has_readme, has_git, has_manifest, manifest_types, dominant_file_types, structure)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (path) DO UPDATE SET
file_count = EXCLUDED.file_count,
total_size = EXCLUDED.total_size,
project_type = EXCLUDED.project_type,
intent = EXCLUDED.intent,
summary = EXCLUDED.summary,
has_readme = EXCLUDED.has_readme,
has_git = EXCLUDED.has_git,
has_manifest = EXCLUDED.has_manifest,
manifest_types = EXCLUDED.manifest_types,
dominant_file_types = EXCLUDED.dominant_file_types,
structure = EXCLUDED.structure,
updated_at = CURRENT_TIMESTAMP
''', (
str(folder_path), disk_label, len(files_list), sum(f['size'] for f in files_list),
analysis.get('project_type'), analysis.get('intent'), summary,
analysis.get('has_readme'), analysis.get('has_git'), analysis.get('has_manifest'),
analysis.get('manifest_types'), json.dumps(analysis.get('dominant_file_types', {})),
json.dumps(analysis.get('structure', {}))
))
processed += 1
if processed % 100 == 0:
conn.commit()
print(f'\rAnalyzed: {processed} folders', end='', flush=True)
conn.commit()
print()
logger.info(f'Completed folder analysis: {processed} folders')
cursor.execute('''
SELECT project_type, COUNT(*), SUM(file_count), SUM(total_size)
FROM folders
GROUP BY project_type
''')
print(f'\n=== FOLDER ANALYSIS SUMMARY ===')
for row in cursor.fetchall():
proj_type, count, files, size = row
print(f'{proj_type:20}: {count:6,} folders, {files:8,} files, {self.format_size(int(size or 0))}')
finally:
cursor.close()
conn.close()
def review_migration(self, category: Optional[str]=None, show_build: bool=False):
from classification.classifier import FileClassifier
classifier = FileClassifier()
conn = self.get_connection()
cursor = conn.cursor()
try:
query = 'SELECT path, size, category FROM files WHERE 1=1'
params = []
if category:
query += ' AND category = %s'
params.append(category)
if not show_build:
query += " AND (metadata->>'labels' IS NULL OR metadata->>'labels' NOT LIKE '%build-artifact%')"
query += ' ORDER BY category, size DESC LIMIT 100'
cursor.execute(query, params)
files = cursor.fetchall()
if not files:
print('No files found matching criteria')
return
print(f'\n=== MIGRATION PREVIEW ===')
print(f'Showing {len(files)} files\n')
current_category = None
for path, size, cat in files:
if cat != current_category:
current_category = cat
print(f'\n{cat}:')
labels, suggested_cat, is_build = classifier.classify_path(path, int(size))
target = classifier.suggest_target_path(path, suggested_cat, labels)
print(f' {path}')
                print(f'    → {target} ({self.format_size(int(size))})')
finally:
cursor.close()
conn.close()
@staticmethod
def format_size(size: int) -> str:
for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
if size < 1024:
return f'{size:.1f}{unit}'
size /= 1024
return f'{size:.1f}PB'
def main():
parser = argparse.ArgumentParser(description='Disk Reorganizer - Free up a disk for Linux dual-boot')
subparsers = parser.add_subparsers(dest='command', required=True)
index_parser = subparsers.add_parser('index', help='Index files on a disk')
index_parser.add_argument('disk_root', help='Root path of disk (e.g., D:\\\\)')
index_parser.add_argument('disk_name', help='Logical name for the disk')
plan_parser = subparsers.add_parser('plan', help='Create migration plan')
plan_parser.add_argument('target_disk', help='Disk to free up')
plan_parser.add_argument('dest_disks', nargs='+', help='Destination disks')
exec_parser = subparsers.add_parser('execute', help='Execute migration plan')
exec_parser.add_argument('plan_file', help='Path to plan JSON file')
exec_parser.add_argument('--dry-run', action='store_true', help='Simulate without actual file operations')
dedupe_parser = subparsers.add_parser('dedupe', help='Deduplicate files and compute checksums')
dedupe_parser.add_argument('--disk', help='Optional: Only dedupe specific disk')
dedupe_parser.add_argument('--no-chunks', action='store_true', help='Disable chunk-level deduplication')
merge_parser = subparsers.add_parser('merge', help='Plan multi-disk merge with deduplication')
merge_parser.add_argument('--sources', nargs='+', required=True, help='Source disks to merge')
merge_parser.add_argument('--target', required=True, help='Target disk')
merge_parser.add_argument('--output', default='merge_plan.json', help='Output plan file')
merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')
profile_parser = subparsers.add_parser('profile', help='Create content profiles (inventory + triage)')
profile_parser.add_argument('--disk', help='Profile specific disk')
profile_parser.add_argument('--update', action='store_true', help='Update database with profiles')
profile_parser.add_argument('--limit', type=int, help='Limit number of files')
extract_parser = subparsers.add_parser('extract', help='Extract content from files')
extract_parser.add_argument('--kind', help='Extract specific kind (pdf, image, audio, video)')
extract_parser.add_argument('--limit', type=int, default=10, help='Limit extraction batch')
parse_parser = subparsers.add_parser('parse', help='Parse files to extract text')
parse_parser.add_argument('--kind', help='Parse specific kind (text, code, pdf)')
parse_parser.add_argument('--limit', type=int, default=100, help='Limit parse batch')
parse_parser.add_argument('--update', action='store_true', help='Save extracted text to database')
enrich_parser = subparsers.add_parser('enrich', help='Enrich content with LLM analysis')
enrich_parser.add_argument('--limit', type=int, default=10, help='Limit enrichment batch')
enrich_parser.add_argument('--llm-endpoint', default='http://192.168.1.74:1234', help='LLM endpoint')
enrich_parser.add_argument('--local', action='store_true', help='Use local Ollama')
classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization')
classify_parser.add_argument('--disk', help='Classify specific disk')
classify_parser.add_argument('--update', action='store_true', help='Update database with classifications')
classify_parser.add_argument('--no-resume', action='store_true', help='Start from scratch instead of resuming')
folders_parser = subparsers.add_parser('analyze-folders', help='Analyze folder structure and infer project intent')
folders_parser.add_argument('--disk', help='Analyze specific disk')
folders_parser.add_argument('--min-files', type=int, default=3, help='Minimum files per folder')
review_parser = subparsers.add_parser('review', help='Review proposed migration structure')
review_parser.add_argument('--category', help='Review specific category')
review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts')
report_parser = subparsers.add_parser('report', help='Show current status')
report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
report_parser.add_argument('--show-duplicates', action='store_true', help='Show duplicate files')
report_parser.add_argument('--preview-merge', help='Preview merge plan from file')
args = parser.parse_args()
tool = DiskReorganizer()
if args.command == 'index':
tool.index_disk(args.disk_root, args.disk_name)
elif args.command == 'dedupe':
tool.run_deduplication(disk=args.disk, use_chunks=not args.no_chunks)
elif args.command == 'merge':
tool.plan_merge(sources=args.sources, target=args.target, output_file=args.output, filter_system=args.filter_system, network_target=args.network)
elif args.command == 'plan':
plan = tool.plan_migration(args.target_disk, args.dest_disks)
if plan:
print(f"\nPlan generated: {plan['file_count']} files, {tool.format_size(plan['total_size'])}")
print(f"Destination disks: {', '.join(plan['destination_disks'])}")
elif args.command == 'execute':
tool.execute_migration(args.plan_file, dry_run=args.dry_run)
elif args.command == 'profile':
tool.profile_content(disk=args.disk, update_db=args.update, limit=args.limit)
elif args.command == 'extract':
tool.extract_content(kind=args.kind, limit=args.limit)
elif args.command == 'parse':
tool.parse_files(kind=args.kind, limit=args.limit, update_db=args.update)
elif args.command == 'enrich':
tool.enrich_files(limit=args.limit, llm_endpoint=args.llm_endpoint, use_local=args.local)
elif args.command == 'classify':
tool.classify_files(disk=args.disk, update_db=args.update, resume=not args.no_resume)
elif args.command == 'analyze-folders':
tool.analyze_folders(disk=args.disk, min_files=args.min_files)
elif args.command == 'review':
tool.review_migration(category=args.category, show_build=args.show_build)
elif args.command == 'report':
tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)
if __name__ == '__main__':
main()

27
app/migration/__init__.py Normal file
View File

@@ -0,0 +1,27 @@
"""Migration package exports"""
from .copy import (
CopyMigrationStrategy,
FastCopyStrategy,
SafeCopyStrategy,
ReferenceCopyStrategy
)
from .hardlink import (
HardlinkMigrationStrategy,
SymlinkMigrationStrategy,
DedupHardlinkStrategy
)
from .engine import MigrationEngine
from ._protocols import IMigrationStrategy, IMigrationEngine
__all__ = [
'CopyMigrationStrategy',
'FastCopyStrategy',
'SafeCopyStrategy',
'ReferenceCopyStrategy',
'HardlinkMigrationStrategy',
'SymlinkMigrationStrategy',
'DedupHardlinkStrategy',
'MigrationEngine',
'IMigrationStrategy',
'IMigrationEngine',
]
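
A minimal usage sketch of the exports above, assuming the repository root is on `PYTHONPATH` so `app` resolves as a package; the paths are illustrative:

```python
# Sketch only: pick one of the exported strategies by the operation type used in plans.
from pathlib import Path

from app.migration import HardlinkMigrationStrategy, SafeCopyStrategy

def pick_strategy(operation_type: str):
    # 'copy' for unique files, 'hardlink' for duplicates on the same filesystem
    return {
        'copy': SafeCopyStrategy(),
        'hardlink': HardlinkMigrationStrategy(),
    }[operation_type]

src, dst = Path('/mnt/source/a.txt'), Path('/mnt/target/a.txt')
strategy = pick_strategy('copy')
if strategy.can_migrate(src, dst):
    strategy.migrate(src, dst, verify=True)
```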

107
app/migration/_protocols.py Normal file
View File

@@ -0,0 +1,107 @@
"""Protocol definitions for the migration package"""
from typing import Protocol
from pathlib import Path
from ..shared.models import OperationRecord
class IMigrationStrategy(Protocol):
"""Protocol for migration strategies"""
def migrate(
self,
source: Path,
destination: Path,
verify: bool = True
) -> bool:
"""Migrate a file from source to destination
Args:
source: Source file path
destination: Destination file path
verify: Whether to verify the operation
Returns:
True if migration successful
"""
...
def can_migrate(self, source: Path, destination: Path) -> bool:
"""Check if migration is possible
Args:
source: Source file path
destination: Destination file path
Returns:
True if migration is possible
"""
...
def estimate_time(self, source: Path) -> float:
"""Estimate migration time in seconds
Args:
source: Source file path
Returns:
Estimated time in seconds
"""
...
def cleanup(self, source: Path) -> bool:
"""Cleanup source file after successful migration
Args:
source: Source file path
Returns:
True if cleanup successful
"""
...
class IMigrationEngine(Protocol):
"""Protocol for migration engine"""
def plan_migration(
self,
disk: str,
target_base: Path
) -> list[OperationRecord]:
"""Plan migration for a disk
Args:
disk: Disk identifier
target_base: Target base directory
Returns:
List of planned operations
"""
...
def execute_migration(
self,
operations: list[OperationRecord],
dry_run: bool = False
) -> dict:
"""Execute migration operations
Args:
operations: List of operations to execute
dry_run: Whether to perform a dry run
Returns:
Dictionary with execution statistics
"""
...
def rollback(self, operation: OperationRecord) -> bool:
"""Rollback a migration operation
Args:
operation: Operation to rollback
Returns:
True if rollback successful
"""
...
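
Because `IMigrationStrategy` is a `typing.Protocol`, any class with matching method signatures satisfies it structurally; a minimal sketch (the `NoOpStrategy` name is illustrative):

```python
# Sketch only: a do-nothing strategy that structurally satisfies IMigrationStrategy.
from pathlib import Path

from app.migration import IMigrationStrategy


class NoOpStrategy:
    def migrate(self, source: Path, destination: Path, verify: bool = True) -> bool:
        return True  # pretend the file was migrated

    def can_migrate(self, source: Path, destination: Path) -> bool:
        return source.exists()

    def estimate_time(self, source: Path) -> float:
        return 0.0

    def cleanup(self, source: Path) -> bool:
        return True


# A static type checker accepts this assignment purely from the signatures above.
strategy: IMigrationStrategy = NoOpStrategy()
```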

268
app/migration/copy.py Normal file
View File

@@ -0,0 +1,268 @@
"""Copy-based migration strategy"""
import shutil
from pathlib import Path
from typing import Optional
import os
from ..shared.logger import ProgressLogger
class CopyMigrationStrategy:
"""Copy files to destination with verification"""
def __init__(
self,
logger: Optional[ProgressLogger] = None,
preserve_metadata: bool = True,
verify_checksums: bool = True
):
"""Initialize copy migration strategy
Args:
logger: Optional progress logger
preserve_metadata: Whether to preserve file metadata
verify_checksums: Whether to verify checksums after copy
"""
self.logger = logger
self.preserve_metadata = preserve_metadata
self.verify_checksums = verify_checksums
def migrate(
self,
source: Path,
destination: Path,
verify: bool = True
) -> bool:
"""Migrate file by copying
Args:
source: Source file path
destination: Destination file path
verify: Whether to verify the operation
Returns:
True if migration successful
"""
if not source.exists():
if self.logger:
self.logger.error(f"Source file does not exist: {source}")
return False
# Create destination directory
destination.parent.mkdir(parents=True, exist_ok=True)
try:
# Copy file
if self.preserve_metadata:
shutil.copy2(source, destination)
else:
shutil.copy(source, destination)
# Verify if requested
if verify and self.verify_checksums:
if not self._verify_copy(source, destination):
if self.logger:
self.logger.error(f"Verification failed: {source} -> {destination}")
destination.unlink()
return False
return True
except Exception as e:
if self.logger:
self.logger.error(f"Copy failed: {source} -> {destination}: {e}")
return False
def _verify_copy(self, source: Path, destination: Path) -> bool:
"""Verify copied file
Args:
source: Source file path
destination: Destination file path
Returns:
True if verification successful
"""
# Check size
source_size = source.stat().st_size
dest_size = destination.stat().st_size
if source_size != dest_size:
return False
# Compare checksums for files larger than 1MB
if source_size > 1024 * 1024:
from ..deduplication.chunker import hash_file
source_hash = hash_file(source)
dest_hash = hash_file(destination)
return source_hash == dest_hash
# For small files, compare content directly
with open(source, 'rb') as f1, open(destination, 'rb') as f2:
return f1.read() == f2.read()
def can_migrate(self, source: Path, destination: Path) -> bool:
"""Check if migration is possible
Args:
source: Source file path
destination: Destination file path
Returns:
True if migration is possible
"""
if not source.exists():
return False
# Check if destination directory is writable
dest_dir = destination.parent
if dest_dir.exists():
return os.access(dest_dir, os.W_OK)
# Check if parent directory exists and is writable
parent = dest_dir.parent
while not parent.exists() and parent != parent.parent:
parent = parent.parent
return parent.exists() and os.access(parent, os.W_OK)
def estimate_time(self, source: Path) -> float:
"""Estimate migration time in seconds
Args:
source: Source file path
Returns:
Estimated time in seconds
"""
if not source.exists():
return 0.0
size = source.stat().st_size
# Estimate based on typical copy speed (100 MB/s)
typical_speed = 100 * 1024 * 1024 # bytes per second
return size / typical_speed
def cleanup(self, source: Path) -> bool:
"""Cleanup source file after successful migration
Args:
source: Source file path
Returns:
True if cleanup successful
"""
try:
if source.exists():
source.unlink()
return True
except Exception as e:
if self.logger:
self.logger.warning(f"Failed to cleanup {source}: {e}")
return False
class FastCopyStrategy(CopyMigrationStrategy):
"""Fast copy strategy without verification"""
def __init__(self, logger: Optional[ProgressLogger] = None):
"""Initialize fast copy strategy"""
super().__init__(
logger=logger,
preserve_metadata=True,
verify_checksums=False
)
class SafeCopyStrategy(CopyMigrationStrategy):
"""Safe copy strategy with full verification"""
def __init__(self, logger: Optional[ProgressLogger] = None):
"""Initialize safe copy strategy"""
super().__init__(
logger=logger,
preserve_metadata=True,
verify_checksums=True
)
class ReferenceCopyStrategy:
"""Create reference copy using reflinks (CoW) if supported"""
def __init__(self, logger: Optional[ProgressLogger] = None):
"""Initialize reflink copy strategy"""
self.logger = logger
def migrate(
self,
source: Path,
destination: Path,
verify: bool = True
) -> bool:
"""Migrate using reflink (copy-on-write)
Args:
source: Source file path
destination: Destination file path
verify: Whether to verify the operation
Returns:
True if migration successful
"""
if not source.exists():
if self.logger:
self.logger.error(f"Source file does not exist: {source}")
return False
# Create destination directory
destination.parent.mkdir(parents=True, exist_ok=True)
try:
# Try reflink copy (works on btrfs, xfs, etc.)
import subprocess
result = subprocess.run(
['cp', '--reflink=auto', str(source), str(destination)],
capture_output=True,
check=False
)
if result.returncode != 0:
# Fallback to regular copy
shutil.copy2(source, destination)
return True
except Exception as e:
if self.logger:
self.logger.error(f"Reflink copy failed: {source} -> {destination}: {e}")
return False
def can_migrate(self, source: Path, destination: Path) -> bool:
"""Check if migration is possible"""
if not source.exists():
return False
dest_dir = destination.parent
if dest_dir.exists():
return os.access(dest_dir, os.W_OK)
return True
def estimate_time(self, source: Path) -> float:
"""Estimate migration time (reflinks are fast)"""
return 0.1 # Reflinks are nearly instant
def cleanup(self, source: Path) -> bool:
"""Cleanup source file"""
try:
if source.exists():
source.unlink()
return True
except Exception as e:
if self.logger:
self.logger.warning(f"Failed to cleanup {source}: {e}")
return False
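
A short sketch of the copy strategies above; the paths are illustrative and `app` is assumed importable as a package:

```python
# Sketch only: verified copy, then remove the source on success.
from pathlib import Path

from app.migration.copy import FastCopyStrategy, SafeCopyStrategy

src = Path('/mnt/source/reports/2023.pdf')
dst = Path('/mnt/target/documents/2023.pdf')

strategy = SafeCopyStrategy()    # verifies size and content/checksum after the copy
# strategy = FastCopyStrategy()  # skips checksum verification for speed

if strategy.can_migrate(src, dst) and strategy.migrate(src, dst, verify=True):
    strategy.cleanup(src)        # delete the source only after verification passed
```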

454
app/migration/engine.py Normal file
View File

@@ -0,0 +1,454 @@
"""Migration engine"""
from pathlib import Path
from typing import Optional, Callable
from datetime import datetime
import psycopg2
from psycopg2.extras import execute_batch
from .copy import CopyMigrationStrategy, SafeCopyStrategy
from .hardlink import HardlinkMigrationStrategy, SymlinkMigrationStrategy
from ..shared.models import OperationRecord, ProcessingStats, MigrationPlan
from ..shared.config import DatabaseConfig, ProcessingConfig
from ..shared.logger import ProgressLogger
class MigrationEngine:
"""Engine for migrating files"""
def __init__(
self,
db_config: DatabaseConfig,
processing_config: ProcessingConfig,
logger: ProgressLogger,
target_base: Path
):
"""Initialize migration engine
Args:
db_config: Database configuration
processing_config: Processing configuration
logger: Progress logger
target_base: Target base directory for migrations
"""
self.db_config = db_config
self.processing_config = processing_config
self.logger = logger
self.target_base = Path(target_base)
self._connection = None
# Initialize strategies
self.copy_strategy = SafeCopyStrategy(logger=logger)
self.hardlink_strategy = HardlinkMigrationStrategy(logger=logger)
self.symlink_strategy = SymlinkMigrationStrategy(logger=logger)
def _get_connection(self):
"""Get or create database connection"""
if self._connection is None or self._connection.closed:
self._connection = psycopg2.connect(
host=self.db_config.host,
port=self.db_config.port,
database=self.db_config.database,
user=self.db_config.user,
password=self.db_config.password
)
return self._connection
def _ensure_tables(self):
"""Ensure migration tables exist"""
conn = self._get_connection()
cursor = conn.cursor()
# Create operations table
cursor.execute("""
CREATE TABLE IF NOT EXISTS operations (
id SERIAL PRIMARY KEY,
source_path TEXT NOT NULL,
target_path TEXT NOT NULL,
operation_type TEXT NOT NULL,
size BIGINT DEFAULT 0,
status TEXT DEFAULT 'pending',
error TEXT,
executed_at TIMESTAMP,
verified BOOLEAN DEFAULT FALSE,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
# Create index on status
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_operations_status
ON operations(status)
""")
conn.commit()
cursor.close()
def plan_migration(
self,
disk: Optional[str] = None,
category: Optional[str] = None
) -> MigrationPlan:
"""Plan migration for files
Args:
disk: Optional disk filter
category: Optional category filter
Returns:
MigrationPlan with planned operations
"""
self.logger.section("Planning Migration")
conn = self._get_connection()
cursor = conn.cursor()
# Build query
conditions = ["category IS NOT NULL"]
params = []
if disk:
conditions.append("disk_label = %s")
params.append(disk)
if category:
conditions.append("category = %s")
params.append(category)
query = f"""
SELECT path, size, category, duplicate_of
FROM files
WHERE {' AND '.join(conditions)}
ORDER BY category, path
"""
cursor.execute(query, params)
files = cursor.fetchall()
self.logger.info(f"Found {len(files)} files to migrate")
operations = []
total_size = 0
for path_str, size, file_category, duplicate_of in files:
source = Path(path_str)
# Determine destination
target_path = self.target_base / file_category / source.name
# Determine operation type
if duplicate_of:
# Use hardlink for duplicates
operation_type = 'hardlink'
else:
# Use copy for unique files
operation_type = 'copy'
operation = OperationRecord(
source_path=source,
target_path=target_path,
operation_type=operation_type,
size=size
)
operations.append(operation)
total_size += size
cursor.close()
plan = MigrationPlan(
target_disk=str(self.target_base),
destination_disks=[str(self.target_base)],
operations=operations,
total_size=total_size,
file_count=len(operations)
)
self.logger.info(
f"Migration plan created: {plan.file_count} files, "
f"{plan.total_size:,} bytes"
)
return plan
def execute_migration(
self,
operations: list[OperationRecord],
dry_run: bool = False,
progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
) -> ProcessingStats:
"""Execute migration operations
Args:
operations: List of operations to execute
dry_run: Whether to perform a dry run
progress_callback: Optional callback for progress updates
Returns:
ProcessingStats with execution statistics
"""
self.logger.section("Executing Migration" + (" (DRY RUN)" if dry_run else ""))
self._ensure_tables()
stats = ProcessingStats()
total_ops = len(operations)
for operation in operations:
stats.files_processed += 1
if dry_run:
# In dry run, just log what would happen
self.logger.debug(
f"[DRY RUN] Would {operation.operation_type}: "
f"{operation.source_path} -> {operation.target_path}"
)
stats.files_succeeded += 1
else:
# Execute actual migration
success = self._execute_operation(operation)
if success:
stats.files_succeeded += 1
stats.bytes_processed += operation.size
else:
stats.files_failed += 1
# Progress callback
if progress_callback and stats.files_processed % 100 == 0:
progress_callback(stats.files_processed, total_ops, stats)
# Log progress
if stats.files_processed % 1000 == 0:
self.logger.progress(
stats.files_processed,
total_ops,
prefix="Operations executed",
bytes_processed=stats.bytes_processed,
elapsed_seconds=stats.elapsed_seconds
)
self.logger.info(
f"Migration {'dry run' if dry_run else 'execution'} complete: "
f"{stats.files_succeeded}/{total_ops} operations, "
f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
)
return stats
def _execute_operation(self, operation: OperationRecord) -> bool:
"""Execute a single migration operation
Args:
operation: Operation to execute
Returns:
True if successful
"""
operation.status = 'in_progress'
operation.executed_at = datetime.now()
try:
# Select strategy based on operation type
if operation.operation_type == 'copy':
strategy = self.copy_strategy
elif operation.operation_type == 'hardlink':
strategy = self.hardlink_strategy
elif operation.operation_type == 'symlink':
strategy = self.symlink_strategy
else:
raise ValueError(f"Unknown operation type: {operation.operation_type}")
# Execute migration
success = strategy.migrate(
operation.source_path,
operation.target_path,
verify=self.processing_config.verify_operations
)
if success:
operation.status = 'completed'
operation.verified = True
self._record_operation(operation)
return True
else:
operation.status = 'failed'
operation.error = "Migration failed"
self._record_operation(operation)
return False
except Exception as e:
operation.status = 'failed'
operation.error = str(e)
self._record_operation(operation)
self.logger.error(f"Operation failed: {operation.source_path}: {e}")
return False
def _record_operation(self, operation: OperationRecord):
"""Record operation in database
Args:
operation: Operation to record
"""
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute("""
INSERT INTO operations (
source_path, target_path, operation_type, size,
status, error, executed_at, verified
)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
""", (
str(operation.source_path),
str(operation.target_path),
operation.operation_type,
operation.size,
operation.status,
operation.error,
operation.executed_at,
operation.verified
))
conn.commit()
cursor.close()
def rollback(self, operation: OperationRecord) -> bool:
"""Rollback a migration operation
Args:
operation: Operation to rollback
Returns:
True if rollback successful
"""
self.logger.warning(f"Rolling back: {operation.target_path}")
try:
# Remove destination
if operation.target_path.exists():
operation.target_path.unlink()
# Update database
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute("""
UPDATE operations
SET status = 'rolled_back'
WHERE source_path = %s AND target_path = %s
""", (str(operation.source_path), str(operation.target_path)))
conn.commit()
cursor.close()
return True
except Exception as e:
self.logger.error(f"Rollback failed: {operation.target_path}: {e}")
return False
def get_migration_stats(self) -> dict:
"""Get migration statistics
Returns:
Dictionary with statistics
"""
conn = self._get_connection()
cursor = conn.cursor()
stats = {}
# Total operations
cursor.execute("SELECT COUNT(*) FROM operations")
stats['total_operations'] = cursor.fetchone()[0]
# Operations by status
cursor.execute("""
SELECT status, COUNT(*)
FROM operations
GROUP BY status
""")
for status, count in cursor.fetchall():
stats[f'{status}_operations'] = count
# Total size migrated
cursor.execute("""
SELECT COALESCE(SUM(size), 0)
FROM operations
WHERE status = 'completed'
""")
stats['total_size_migrated'] = cursor.fetchone()[0]
cursor.close()
return stats
def verify_migrations(self) -> dict:
"""Verify completed migrations
Returns:
Dictionary with verification results
"""
self.logger.subsection("Verifying Migrations")
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute("""
SELECT source_path, target_path, operation_type
FROM operations
WHERE status = 'completed' AND verified = FALSE
""")
operations = cursor.fetchall()
cursor.close()
results = {
'total': len(operations),
'verified': 0,
'failed': 0
}
for source_str, dest_str, op_type in operations:
source = Path(source_str)
dest = Path(dest_str)
# Verify destination exists
if not dest.exists():
results['failed'] += 1
self.logger.warning(f"Verification failed: {dest} does not exist")
continue
# Verify based on operation type
if op_type == 'hardlink':
# Check if hardlinked
if source.exists() and source.stat().st_ino == dest.stat().st_ino:
results['verified'] += 1
else:
results['failed'] += 1
else:
# Destination existence was checked above; compare sizes when the source is still present
if not source.exists() or source.stat().st_size == dest.stat().st_size:
results['verified'] += 1
else:
results['failed'] += 1
self.logger.info(
f"Verification complete: {results['verified']}/{results['total']} verified"
)
return results
def close(self):
"""Close database connection"""
if self._connection and not self._connection.closed:
self._connection.close()
def __enter__(self):
"""Context manager entry"""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit"""
self.close()
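
A dry-run sketch of the engine above, assuming the PostgreSQL instance from the configuration is reachable and the `files` table was populated by an earlier `index` run; the disk and category filters are examples:

```python
# Sketch only: plan a migration for one disk and simulate it without touching files.
from pathlib import Path

from app.migration.engine import MigrationEngine
from app.shared.config import load_config
from app.shared.logger import create_logger

config = load_config()
logger = create_logger(level=config.logging.level)

with MigrationEngine(
    db_config=config.database,
    processing_config=config.processing,
    logger=logger,
    target_base=Path('/mnt/organized'),
) as engine:
    plan = engine.plan_migration(disk='SMT', category='documents')
    stats = engine.execute_migration(plan.operations, dry_run=True)
    logger.info(f'{stats.files_succeeded}/{plan.file_count} operations would succeed')
```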

377
app/migration/hardlink.py Normal file
View File

@@ -0,0 +1,377 @@
"""Hardlink-based migration strategy"""
import os
from pathlib import Path
from typing import Optional
from ..shared.logger import ProgressLogger
class HardlinkMigrationStrategy:
"""Create hardlinks to files instead of copying"""
def __init__(self, logger: Optional[ProgressLogger] = None):
"""Initialize hardlink migration strategy
Args:
logger: Optional progress logger
"""
self.logger = logger
def migrate(
self,
source: Path,
destination: Path,
verify: bool = True
) -> bool:
"""Migrate file by creating hardlink
Args:
source: Source file path
destination: Destination file path
verify: Whether to verify the operation
Returns:
True if migration successful
"""
if not source.exists():
if self.logger:
self.logger.error(f"Source file does not exist: {source}")
return False
# Check if source and destination are on same filesystem
if not self._same_filesystem(source, destination.parent):
if self.logger:
self.logger.warning(
f"Cannot hardlink across filesystems: {source} -> {destination}"
)
return False
# Create destination directory
destination.parent.mkdir(parents=True, exist_ok=True)
try:
# Create hardlink
os.link(source, destination)
# Verify if requested
if verify:
if not self._verify_hardlink(source, destination):
if self.logger:
self.logger.error(f"Verification failed: {source} -> {destination}")
destination.unlink()
return False
return True
except FileExistsError:
if self.logger:
self.logger.warning(f"Destination already exists: {destination}")
return False
except Exception as e:
if self.logger:
self.logger.error(f"Hardlink failed: {source} -> {destination}: {e}")
return False
def _same_filesystem(self, path1: Path, path2: Path) -> bool:
"""Check if two paths are on the same filesystem
Args:
path1: First path
path2: Second path
Returns:
True if on same filesystem
"""
try:
# Get device IDs
stat1 = path1.stat()
stat2 = path2.stat()
return stat1.st_dev == stat2.st_dev
except Exception:
return False
def _verify_hardlink(self, source: Path, destination: Path) -> bool:
"""Verify hardlink
Args:
source: Source file path
destination: Destination file path
Returns:
True if verification successful
"""
try:
# Check if they have the same inode
source_stat = source.stat()
dest_stat = destination.stat()
return source_stat.st_ino == dest_stat.st_ino
except Exception:
return False
def can_migrate(self, source: Path, destination: Path) -> bool:
"""Check if migration is possible
Args:
source: Source file path
destination: Destination file path
Returns:
True if migration is possible
"""
if not source.exists():
return False
# Check if on same filesystem
dest_dir = destination.parent
if dest_dir.exists():
return self._same_filesystem(source, dest_dir)
# Check parent directories
parent = dest_dir.parent
while not parent.exists() and parent != parent.parent:
parent = parent.parent
return parent.exists() and self._same_filesystem(source, parent)
def estimate_time(self, source: Path) -> float:
"""Estimate migration time in seconds
Args:
source: Source file path
Returns:
Estimated time in seconds (hardlinks are instant)
"""
return 0.01 # Hardlinks are nearly instant
def cleanup(self, source: Path) -> bool:
"""Cleanup source file after successful migration
Note: For hardlinks, we typically don't remove the source
immediately as both links point to the same inode.
Args:
source: Source file path
Returns:
True (no cleanup needed for hardlinks)
"""
# For hardlinks, we don't remove the source
# Both source and destination point to the same data
return True
class SymlinkMigrationStrategy:
"""Create symbolic links to files"""
def __init__(
self,
logger: Optional[ProgressLogger] = None,
absolute_links: bool = True
):
"""Initialize symlink migration strategy
Args:
logger: Optional progress logger
absolute_links: Whether to create absolute symlinks
"""
self.logger = logger
self.absolute_links = absolute_links
def migrate(
self,
source: Path,
destination: Path,
verify: bool = True
) -> bool:
"""Migrate file by creating symlink
Args:
source: Source file path
destination: Destination file path
verify: Whether to verify the operation
Returns:
True if migration successful
"""
if not source.exists():
if self.logger:
self.logger.error(f"Source file does not exist: {source}")
return False
# Create destination directory
destination.parent.mkdir(parents=True, exist_ok=True)
try:
# Determine link target
if self.absolute_links:
target = source.resolve()
else:
# Create relative symlink
target = os.path.relpath(source, destination.parent)
# Create symlink
destination.symlink_to(target)
# Verify if requested
if verify:
if not self._verify_symlink(destination, source):
if self.logger:
self.logger.error(f"Verification failed: {source} -> {destination}")
destination.unlink()
return False
return True
except FileExistsError:
if self.logger:
self.logger.warning(f"Destination already exists: {destination}")
return False
except Exception as e:
if self.logger:
self.logger.error(f"Symlink failed: {source} -> {destination}: {e}")
return False
def _verify_symlink(self, symlink: Path, expected_target: Path) -> bool:
"""Verify symlink
Args:
symlink: Symlink path
expected_target: Expected target path
Returns:
True if verification successful
"""
try:
# Check if it's a symlink
if not symlink.is_symlink():
return False
# Resolve and compare
resolved = symlink.resolve()
expected = expected_target.resolve()
return resolved == expected
except Exception:
return False
def can_migrate(self, source: Path, destination: Path) -> bool:
"""Check if migration is possible
Args:
source: Source file path
destination: Destination file path
Returns:
True if migration is possible
"""
if not source.exists():
return False
# Check if destination directory is writable
dest_dir = destination.parent
if dest_dir.exists():
return os.access(dest_dir, os.W_OK)
return True
def estimate_time(self, source: Path) -> float:
"""Estimate migration time in seconds
Args:
source: Source file path
Returns:
Estimated time in seconds (symlinks are instant)
"""
return 0.01 # Symlinks are instant
def cleanup(self, source: Path) -> bool:
"""Cleanup source file after successful migration
Note: For symlinks, we don't remove the source as the
symlink points to it.
Args:
source: Source file path
Returns:
True (no cleanup needed for symlinks)
"""
# For symlinks, we don't remove the source
return True
class DedupHardlinkStrategy(HardlinkMigrationStrategy):
"""Hardlink strategy for deduplication
Creates hardlinks for duplicate files to save space.
"""
def __init__(self, logger: Optional[ProgressLogger] = None):
"""Initialize dedup hardlink strategy"""
super().__init__(logger=logger)
def deduplicate(
self,
canonical: Path,
duplicate: Path
) -> bool:
"""Replace duplicate with hardlink to canonical
Args:
canonical: Canonical file path
duplicate: Duplicate file path
Returns:
True if deduplication successful
"""
if not canonical.exists():
if self.logger:
self.logger.error(f"Canonical file does not exist: {canonical}")
return False
if not duplicate.exists():
if self.logger:
self.logger.error(f"Duplicate file does not exist: {duplicate}")
return False
# Check if already hardlinked
if self._verify_hardlink(canonical, duplicate):
return True
# Check if on same filesystem
if not self._same_filesystem(canonical, duplicate):
if self.logger:
self.logger.warning(
f"Cannot hardlink across filesystems: {canonical} -> {duplicate}"
)
return False
# Compute the temporary backup path before the try block so the
# exception handler can always reference it
backup = duplicate.with_suffix(duplicate.suffix + '.bak')
try:
duplicate.rename(backup)
# Create hardlink
os.link(canonical, duplicate)
# Remove backup
backup.unlink()
return True
except Exception as e:
if self.logger:
self.logger.error(f"Deduplication failed: {duplicate}: {e}")
# Restore from backup only if the duplicate was already moved aside
if backup.exists() and not duplicate.exists():
backup.rename(duplicate)
return False
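
A sketch of the deduplication path above; both files must live on the same filesystem, and the paths are illustrative:

```python
# Sketch only: replace a duplicate with a hardlink to its canonical copy.
from pathlib import Path

from app.migration.hardlink import DedupHardlinkStrategy

canonical = Path('/mnt/data/photos/2021/IMG_0001.jpg')
duplicate = Path('/mnt/data/backup-old/IMG_0001.jpg')

if DedupHardlinkStrategy().deduplicate(canonical, duplicate):
    # Both names now reference the same inode, so the content is stored once.
    assert canonical.stat().st_ino == duplicate.stat().st_ino
```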

View File

@@ -0,0 +1,44 @@
from pathlib import Path
from typing import Dict
import re
class CodeParser:
def __init__(self):
self.patterns = {
'python': {'imports': r'^import |^from .+ import', 'class': r'^class \w+', 'function': r'^def \w+'},
'javascript': {'imports': r'^import |^require\(', 'class': r'^class \w+', 'function': r'^function \w+|^const \w+ = '},
'java': {'package': r'^package ', 'imports': r'^import ', 'class': r'^public class \w+'},
'go': {'package': r'^package ', 'imports': r'^import ', 'function': r'^func \w+'}
}
def parse(self, file_path: Path) -> Dict:
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
text = f.read()
language = self._detect_language(file_path, text)
structure = self._extract_structure(text, language)
return {
'text': text,
'language': language,
'line_count': len(text.split('\n')),
'structure': structure,
'quality': 'high'
}
except Exception as e:
return {'error': str(e)}
def _detect_language(self, file_path: Path, text: str) -> str:
lang_map = {'.py': 'python', '.js': 'javascript', '.ts': 'typescript', '.java': 'java', '.go': 'go'}
return lang_map.get(file_path.suffix.lower(), 'unknown')
def _extract_structure(self, text: str, language: str) -> Dict:
patterns = self.patterns.get(language, {})
structure = {'type': 'code', 'language': language}
for key, pattern in patterns.items():
matches = re.findall(pattern, text, re.MULTILINE)
structure[key] = len(matches)
return structure
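
A sketch of calling the parser above; the `app.parsers.code_parser` module path is an assumption, since the file header is not shown for this hunk:

```python
# Sketch only: parse a Python source file and inspect the detected structure.
from pathlib import Path

# Assumed module path for the CodeParser class defined above.
from app.parsers.code_parser import CodeParser

result = CodeParser().parse(Path('app/shared/config.py'))
if 'error' not in result:
    print(result['language'], result['line_count'], result['structure'])
```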

View File

@@ -0,0 +1,42 @@
from pathlib import Path
from typing import Dict
class MediaParser:
def parse_audio(self, file_path: Path) -> Dict:
return {
'text': '[Audio transcription pending]',
'needs_transcription': True,
'transcription_service': 'whisper',
'structure': {'type': 'audio'},
'quality': 'pending'
}
def parse_video(self, file_path: Path) -> Dict:
return {
'text': '[Video transcription pending]',
'needs_transcription': True,
'needs_scene_detection': True,
'transcription_service': 'whisper',
'structure': {'type': 'video'},
'quality': 'pending'
}
def parse_image(self, file_path: Path) -> Dict:
try:
from PIL import Image
with Image.open(file_path) as img:
width, height = img.size
mode = img.mode
return {
'text': '[Image caption/OCR pending]',
'needs_ocr': True,
'needs_caption': True,
'dimensions': f'{width}x{height}',
'mode': mode,
'structure': {'type': 'image', 'width': width, 'height': height},
'quality': 'pending'
}
except Exception as e:
return {'error': str(e)}

31
app/parsers/pdf_parser.py Normal file
View File

@@ -0,0 +1,31 @@
from pathlib import Path
from typing import Dict, List
class PDFParser:
def parse(self, file_path: Path) -> Dict:
try:
import PyPDF2
pages = []
with open(file_path, 'rb') as f:
pdf = PyPDF2.PdfReader(f)
page_count = len(pdf.pages)
for i, page in enumerate(pdf.pages[:50]):
text = page.extract_text()
pages.append({'page': i + 1, 'text': text, 'char_count': len(text)})
full_text = '\n\n'.join([p['text'] for p in pages])
has_text_layer = sum(p['char_count'] for p in pages) > 100
return {
'text': full_text,
'page_count': page_count,
'pages_extracted': len(pages),
'has_text_layer': has_text_layer,
'needs_ocr': not has_text_layer,
'structure': {'type': 'document', 'pages': pages[:5]},
'quality': 'high' if has_text_layer else 'needs_ocr'
}
except Exception as e:
return {'error': str(e), 'needs_ocr': True}
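
A sketch of the PDF path above; PyPDF2 must be installed, and the file path is illustrative:

```python
# Sketch only: extract text from a PDF and decide whether it needs OCR.
from pathlib import Path

from app.parsers.pdf_parser import PDFParser

result = PDFParser().parse(Path('/mnt/source/manuals/router.pdf'))
if result.get('needs_ocr'):
    print('No usable text layer; queue for OCR')
else:
    print(f"Extracted {result['pages_extracted']} of {result['page_count']} pages")
```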

View File

@@ -0,0 +1,26 @@
from pathlib import Path
from typing import Dict, Optional
import chardet
class TextParser:
def parse(self, file_path: Path) -> Dict:
try:
with open(file_path, 'rb') as f:
raw_data = f.read(1024 * 1024)
encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
text = raw_data.decode(encoding, errors='ignore')
lines = text.split('\n')
return {
'text': text,
'encoding': encoding,
'line_count': len(lines),
'char_count': len(text),
'word_count': len(text.split()),
'structure': {'type': 'plain_text'},
'quality': 'high' if encoding == 'utf-8' else 'medium'
}
except Exception as e:
return {'error': str(e)}

51
app/setup.py Normal file
View File

@@ -0,0 +1,51 @@
#!/usr/bin/env python3
"""Setup script for defrag disk reorganizer"""
from setuptools import setup, find_packages
from pathlib import Path
# Read requirements
requirements_path = Path(__file__).parent / 'requirements.txt'
with open(requirements_path) as f:
requirements = [
line.strip()
for line in f
if line.strip() and not line.startswith('#')
]
# Read long description from README
readme_path = Path(__file__).parent / 'README.md'
long_description = ""
if readme_path.exists():
with open(readme_path) as f:
long_description = f.read()
setup(
name='defrag',
version='1.0.0',
description='Intelligent disk reorganization system for 20TB+ data with deduplication and classification',
long_description=long_description,
long_description_content_type='text/markdown',
author='Project Defrag',
author_email='defrag@example.com',
url='https://github.com/yourusername/defrag',
packages=find_packages(),
install_requires=requirements,
python_requires='>=3.9',
entry_points={
'console_scripts': [
'defrag=main:main',
],
},
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: System Administrators',
'Topic :: System :: Filesystems',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
'Programming Language :: Python :: 3.12',
],
keywords='disk management storage deduplication classification migration',
)

50
app/shared/__init__.py Normal file
View File

@@ -0,0 +1,50 @@
"""Shared package exports"""
from .models import (
FileRecord,
OperationRecord,
DiskInfo,
MigrationPlan,
ProcessingStats
)
from .config import (
Config,
DatabaseConfig,
ProcessingConfig,
LoggingConfig,
load_config
)
from .logger import (
ProgressLogger,
create_logger,
format_size,
format_rate,
format_time
)
from ._protocols import IDatabase, ILogger
__all__ = [
# Models
'FileRecord',
'OperationRecord',
'DiskInfo',
'MigrationPlan',
'ProcessingStats',
# Config
'Config',
'DatabaseConfig',
'ProcessingConfig',
'LoggingConfig',
'load_config',
# Logger
'ProgressLogger',
'create_logger',
'format_size',
'format_rate',
'format_time',
# Protocols
'IDatabase',
'ILogger',
]

67
app/shared/_protocols.py Normal file
View File

@@ -0,0 +1,67 @@
"""Protocol definitions for the shared package"""
from __future__ import annotations
from typing import Protocol, Any
from pathlib import Path
from dataclasses import dataclass
from datetime import datetime
@dataclass
class FileRecord:
"""Core file record with all metadata"""
path: Path
size: int
modified_time: float
created_time: float
disk_label: str
checksum: str | None = None
status: str = 'indexed' # indexed, planned, moved, verified
category: str | None = None
duplicate_of: str | None = None
@dataclass
class OperationRecord:
"""Record of a migration operation"""
source_path: Path
target_path: Path
operation_type: str # move, copy, hardlink, symlink
status: str = 'pending' # pending, in_progress, completed, failed
error: str | None = None
executed_at: datetime | None = None
verified: bool = False
class IDatabase(Protocol):
"""Protocol for database operations"""
def store_file(self, file_record: FileRecord) -> None:
"""Store a file record"""
...
def get_files_by_disk(self, disk: str) -> list[FileRecord]:
"""Get all files on a specific disk"""
...
def store_operation(self, operation: OperationRecord) -> None:
"""Store an operation record"""
...
def get_pending_operations(self) -> list[OperationRecord]:
"""Get all pending operations"""
...
class ILogger(Protocol):
"""Protocol for logging operations"""
def info(self, message: str) -> None:
...
def warning(self, message: str) -> None:
...
def error(self, message: str) -> None:
...
def debug(self, message: str) -> None:
...

110
app/shared/config.py Normal file
View File

@@ -0,0 +1,110 @@
"""Configuration management for disk reorganizer"""
import json
from pathlib import Path
from dataclasses import dataclass, asdict
from typing import Optional
@dataclass
class DatabaseConfig:
"""Database connection configuration"""
host: str = '192.168.1.159'
port: int = 5432
database: str = 'disk_reorganizer_db'
user: str = 'disk_reorg_user'
password: str = 'heel-goed-wachtwoord'
def to_dict(self) -> dict:
"""Convert to dictionary"""
return asdict(self)
@dataclass
class ProcessingConfig:
"""Processing behavior configuration"""
batch_size: int = 1000
commit_interval: int = 100
parallel_workers: int = 4
chunk_size: int = 8192
hash_algorithm: str = 'sha256'
verify_operations: bool = True
preserve_timestamps: bool = True
def to_dict(self) -> dict:
"""Convert to dictionary"""
return asdict(self)
@dataclass
class LoggingConfig:
"""Logging configuration"""
level: str = 'INFO'
log_file: str = 'disk_reorganizer.log'
console_output: bool = True
file_output: bool = True
def to_dict(self) -> dict:
"""Convert to dictionary"""
return asdict(self)
@dataclass
class Config:
"""Main configuration container"""
database: Optional[DatabaseConfig] = None
processing: Optional[ProcessingConfig] = None
logging: Optional[LoggingConfig] = None
def __post_init__(self):
"""Initialize nested configs with defaults if not provided"""
if self.database is None:
self.database = DatabaseConfig()
if self.processing is None:
self.processing = ProcessingConfig()
if self.logging is None:
self.logging = LoggingConfig()
@classmethod
def from_file(cls, config_path: Path) -> 'Config':
"""Load configuration from JSON file"""
if not config_path.exists():
return cls()
with open(config_path, 'r') as f:
data = json.load(f)
return cls(
database=DatabaseConfig(**data.get('database', {})),
processing=ProcessingConfig(**data.get('processing', {})),
logging=LoggingConfig(**data.get('logging', {}))
)
def to_file(self, config_path: Path) -> None:
"""Save configuration to JSON file"""
data = {
'database': self.database.to_dict(),
'processing': self.processing.to_dict(),
'logging': self.logging.to_dict()
}
with open(config_path, 'w') as f:
json.dump(data, f, indent=2)
def to_dict(self) -> dict:
"""Convert to dictionary"""
return {
'database': self.database.to_dict(),
'processing': self.processing.to_dict(),
'logging': self.logging.to_dict()
}
def load_config(config_path: Optional[Path] = None) -> Config:
"""Load configuration from file or return default"""
if config_path is None:
config_path = Path('config.json')
if config_path.exists():
return Config.from_file(config_path)
return Config()
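
A sketch of the round trip above: write a `config.json` once, then load it on later runs:

```python
# Sketch only: persist overrides to config.json and read them back with load_config().
from pathlib import Path

from app.shared.config import Config, DatabaseConfig, load_config

cfg = Config(database=DatabaseConfig(host='localhost', password='change-me'))
cfg.to_file(Path('config.json'))        # nested defaults are filled in by __post_init__

cfg = load_config(Path('config.json'))  # falls back to defaults if the file is missing
print(cfg.database.host, cfg.processing.batch_size)
```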

217
app/shared/logger.py Normal file
View File

@@ -0,0 +1,217 @@
"""Dynamic progress logger with formatting utilities"""
import sys
import logging
from typing import Optional
from datetime import datetime
from pathlib import Path
def format_size(bytes_size: int) -> str:
"""Format bytes to human-readable size string
Args:
bytes_size: Size in bytes
Returns:
Human-readable size string (e.g., "1.5 GB", "234.5 MB")
"""
for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']:
if bytes_size < 1024.0:
return f"{bytes_size:.1f} {unit}"
bytes_size /= 1024.0
return f"{bytes_size:.1f} EB"
def format_rate(bytes_per_second: float) -> str:
"""Format transfer rate to human-readable string
Args:
bytes_per_second: Transfer rate in bytes per second
Returns:
Human-readable rate string (e.g., "125.3 MB/s")
"""
return f"{format_size(int(bytes_per_second))}/s"
def format_time(seconds: float) -> str:
"""Format seconds to human-readable time string
Args:
seconds: Time in seconds
Returns:
Human-readable time string (e.g., "2h 34m 12s", "45m 23s", "12s")
"""
if seconds < 60:
return f"{int(seconds)}s"
elif seconds < 3600:
minutes = int(seconds // 60)
secs = int(seconds % 60)
return f"{minutes}m {secs}s"
else:
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = int(seconds % 60)
return f"{hours}h {minutes}m {secs}s"
class ProgressLogger:
"""Dynamic progress logger with real-time statistics"""
def __init__(
self,
name: str = "defrag",
level: int = logging.INFO,
log_file: Optional[Path] = None,
console_output: bool = True
):
"""Initialize progress logger
Args:
name: Logger name
level: Logging level
log_file: Optional log file path
console_output: Whether to output to console
"""
self.logger = logging.getLogger(name)
self.logger.setLevel(level)
self.logger.handlers.clear()
# Create formatter
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
# Add console handler
if console_output:
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(level)
console_handler.setFormatter(formatter)
self.logger.addHandler(console_handler)
# Add file handler
if log_file:
log_file.parent.mkdir(parents=True, exist_ok=True)
file_handler = logging.FileHandler(log_file)
file_handler.setLevel(level)
file_handler.setFormatter(formatter)
self.logger.addHandler(file_handler)
self._last_progress_line = ""
def info(self, message: str) -> None:
"""Log info message"""
self.logger.info(message)
def warning(self, message: str) -> None:
"""Log warning message"""
self.logger.warning(message)
def error(self, message: str) -> None:
"""Log error message"""
self.logger.error(message)
def debug(self, message: str) -> None:
"""Log debug message"""
self.logger.debug(message)
def critical(self, message: str) -> None:
"""Log critical message"""
self.logger.critical(message)
def progress(
self,
current: int,
total: int,
prefix: str = "",
suffix: str = "",
bytes_processed: Optional[int] = None,
elapsed_seconds: Optional[float] = None
) -> None:
"""Log progress with dynamic statistics
Args:
current: Current progress count
total: Total count
prefix: Prefix message
suffix: Suffix message
bytes_processed: Optional bytes processed for rate calculation
elapsed_seconds: Optional elapsed time for rate calculation
"""
if total == 0:
percent = 0.0
else:
percent = (current / total) * 100
progress_msg = f"{prefix} [{current}/{total}] {percent:.1f}%"
if bytes_processed is not None and elapsed_seconds is not None and elapsed_seconds > 0:
rate = bytes_processed / elapsed_seconds
progress_msg += f" | {format_size(bytes_processed)} @ {format_rate(rate)}"
# Estimate time remaining
if current > 0:
estimated_total_seconds = (elapsed_seconds / current) * total
remaining_seconds = estimated_total_seconds - elapsed_seconds
progress_msg += f" | ETA: {format_time(remaining_seconds)}"
if suffix:
progress_msg += f" | {suffix}"
self.info(progress_msg)
def section(self, title: str) -> None:
"""Log section header
Args:
title: Section title
"""
separator = "=" * 60
self.info(separator)
self.info(f" {title}")
self.info(separator)
def subsection(self, title: str) -> None:
"""Log subsection header
Args:
title: Subsection title
"""
self.info(f"\n--- {title} ---")
def create_logger(
name: str = "defrag",
level: str = "INFO",
log_file: Optional[Path] = None,
console_output: bool = True
) -> ProgressLogger:
"""Create and configure a progress logger
Args:
name: Logger name
level: Logging level as string
log_file: Optional log file path
console_output: Whether to output to console
Returns:
Configured ProgressLogger instance
"""
level_map = {
'DEBUG': logging.DEBUG,
'INFO': logging.INFO,
'WARNING': logging.WARNING,
'ERROR': logging.ERROR,
'CRITICAL': logging.CRITICAL
}
log_level = level_map.get(level.upper(), logging.INFO)
return ProgressLogger(
name=name,
level=log_level,
log_file=log_file,
console_output=console_output
)
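
A sketch of the progress helper above; the loop and byte counts are synthetic:

```python
# Sketch only: periodic progress lines with throughput and ETA.
import time
from pathlib import Path

from app.shared.logger import create_logger

logger = create_logger(level='INFO', log_file=Path('logs/demo.log'))
logger.section('Demo run')

start = time.time()
total, processed_bytes = 1000, 0
for done in range(1, total + 1):
    processed_bytes += 4096          # pretend each file is 4 KiB
    if done % 250 == 0:
        logger.progress(done, total, prefix='Files',
                        bytes_processed=processed_bytes,
                        elapsed_seconds=time.time() - start)
```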

127
app/shared/models.py Normal file
View File

@@ -0,0 +1,127 @@
"""Data models for the disk reorganizer"""
from dataclasses import dataclass, field
from pathlib import Path
from datetime import datetime
from typing import Optional
@dataclass
class FileRecord:
"""Core file record with all metadata"""
path: Path
size: int
modified_time: float
created_time: float
disk_label: str
checksum: Optional[str] = None
status: str = 'indexed' # indexed, planned, moved, verified
category: Optional[str] = None
duplicate_of: Optional[str] = None
def to_dict(self) -> dict:
"""Convert to dictionary for serialization"""
return {
'path': str(self.path),
'size': self.size,
'modified_time': self.modified_time,
'created_time': self.created_time,
'disk_label': self.disk_label,
'checksum': self.checksum,
'status': self.status,
'category': self.category,
'duplicate_of': self.duplicate_of
}
@dataclass
class OperationRecord:
"""Record of a migration operation"""
source_path: Path
target_path: Path
operation_type: str # move, copy, hardlink, symlink
size: int = 0
status: str = 'pending' # pending, in_progress, completed, failed
error: Optional[str] = None
executed_at: Optional[datetime] = None
verified: bool = False
def to_dict(self) -> dict:
"""Convert to dictionary for serialization"""
return {
'source_path': str(self.source_path),
'target_path': str(self.target_path),
'operation_type': self.operation_type,
'size': self.size,
'status': self.status,
'error': self.error,
'executed_at': self.executed_at.isoformat() if self.executed_at else None,
'verified': self.verified
}
@dataclass
class DiskInfo:
"""Information about a disk/volume"""
name: str
device: str
mount_point: Path
total_size: int
used_size: int
free_size: int
fs_type: str
@property
def usage_percent(self) -> float:
"""Calculate usage percentage"""
if self.total_size == 0:
return 0.0
return (self.used_size / self.total_size) * 100
@dataclass
class MigrationPlan:
"""Complete migration plan"""
target_disk: str
destination_disks: list[str]
operations: list[OperationRecord]
total_size: int
file_count: int
created_at: datetime = field(default_factory=datetime.now)
def to_dict(self) -> dict:
"""Convert to dictionary for serialization"""
return {
'target_disk': self.target_disk,
'destination_disks': self.destination_disks,
'operations': [op.to_dict() for op in self.operations],
'total_size': self.total_size,
'file_count': self.file_count,
'created_at': self.created_at.isoformat()
}
@dataclass
class ProcessingStats:
"""Statistics for processing operations"""
files_processed: int = 0
bytes_processed: int = 0
files_succeeded: int = 0
files_failed: int = 0
start_time: datetime = field(default_factory=datetime.now)
@property
def elapsed_seconds(self) -> float:
"""Calculate elapsed time in seconds"""
return (datetime.now() - self.start_time).total_seconds()
@property
def files_per_second(self) -> float:
"""Calculate processing rate"""
elapsed = self.elapsed_seconds
return self.files_processed / elapsed if elapsed > 0 else 0.0
@property
def bytes_per_second(self) -> float:
"""Calculate throughput"""
elapsed = self.elapsed_seconds
return self.bytes_processed / elapsed if elapsed > 0 else 0.0
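
A sketch tying the models above together; the paths and sizes are illustrative:

```python
# Sketch only: build a one-operation plan and track execution statistics.
from pathlib import Path

from app.shared.models import MigrationPlan, OperationRecord, ProcessingStats

ops = [OperationRecord(source_path=Path('/mnt/d/a.bin'),
                       target_path=Path('/mnt/e/a.bin'),
                       operation_type='copy',
                       size=1_048_576)]
plan = MigrationPlan(target_disk='D', destination_disks=['E'], operations=ops,
                     total_size=sum(o.size for o in ops), file_count=len(ops))

stats = ProcessingStats()
stats.files_processed += 1
stats.bytes_processed += ops[0].size
print(plan.to_dict()['file_count'], f'{stats.bytes_per_second:.0f} B/s')
```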

0
app/tests/__init__.py Normal file
View File

9
defrag.iml Normal file
View File

@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@@ -0,0 +1,20 @@
services:
app:
environment:
- LOG_LEVEL=DEBUG
- PYTHONPATH=/app
volumes:
- .:/app
- /var/run/docker.sock:/var/run/docker.sock
ports:
- "8000:8000"
command: uvicorn main:app --host 0.0.0.0 --port 8000 --reload
postgres:
environment:
- POSTGRES_LOG_STATEMENT=all
ports:
- "5433:5432" # Different port to avoid conflict with host PostgreSQL
redis:
command: redis-server --appendonly yes --loglevel verbose

276
docker-compose.yml Normal file
View File

@@ -0,0 +1,276 @@
services:
# PostgreSQL Database
postgres:
image: postgres:15-alpine
container_name: project_defrag_db
environment:
POSTGRES_USER: disk_reorg_user
POSTGRES_PASSWORD: heel-goed-wachtwoord
POSTGRES_DB: disk_reorganizer_db
POSTGRES_INITDB_ARGS: "--encoding=UTF8 --locale=C"
volumes:
- postgres_data:/var/lib/postgresql/data
- ./sql/init.sql:/docker-entrypoint-initdb.d/init.sql
- ./sql/migrations:/docker-entrypoint-initdb.d/migrations
ports:
- "5432:5432"
healthcheck:
test: ["CMD-SHELL", "pg_isready -U disk_reorg_user -d disk_reorganizer_db"]
interval: 10s
timeout: 5s
retries: 5
networks:
- defrag-network
# Redis for deduplication hash store (optional)
redis:
image: redis:7-alpine
container_name: project_defrag_redis
command: redis-server --appendonly yes
volumes:
- redis_data:/data
ports:
- "6379:6379"
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 10s
timeout: 5s
retries: 5
networks:
- defrag-network
# Application Service
app:
build: .
container_name: project_defrag_app
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy
environment:
# Database Configuration
DB_HOST: postgres
DB_PORT: 5432
DB_NAME: disk_reorganizer_db
DB_USER: disk_reorg_user
DB_PASSWORD: heel-goed-wachtwoord
# Redis Configuration
REDIS_HOST: redis
REDIS_PORT: 6379
# Application Configuration
LOG_LEVEL: INFO
MAX_WORKERS: 4
CHUNK_SIZE_KB: 64
# Mount points (set these when running specific commands)
SOURCE_MOUNT: /mnt/source
TARGET_MOUNT: /mnt/target
volumes:
# Mount host directories for file operations
- ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source:ro
- ${HOST_TARGET_PATH:-/mnt/target}:/mnt/target
# Mount for configuration and plans
- ./config:/app/config
- ./plans:/app/plans
- ./logs:/app/logs
# Bind mount for development (optional)
- .:/app
networks:
- defrag-network
profiles:
- full-cycle
- development
# Uncomment for development with hot reload
# command: watchmedo auto-restart --pattern="*.py" --recursive -- python app/main.py
# Single command services for specific operations
index:
build: .
container_name: defrag_index
depends_on:
postgres:
condition: service_healthy
environment:
DB_HOST: postgres
DB_PORT: 5432
DB_NAME: disk_reorganizer_db
DB_USER: disk_reorg_user
DB_PASSWORD: heel-goed-wachtwoord
volumes:
- ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source:ro
- ./config:/app/config
- ./logs:/app/logs
command: ["python", "app/main.py", "index", "/media/mike/SMT", "SMT"]
profiles:
- index-only
networks:
- defrag-network
plan:
build: .
container_name: defrag_plan
depends_on:
postgres:
condition: service_healthy
environment:
DB_HOST: postgres
DB_PORT: 5432
DB_NAME: disk_reorganizer_db
DB_USER: disk_reorg_user
DB_PASSWORD: heel-goed-wachtwoord
volumes:
- ./config:/app/config
- ./plans:/app/plans
- ./logs:/app/logs
command: ["python", "app/main.py", "plan", "/media/mike/SMT", "SMT"]
profiles:
- plan-only
networks:
- defrag-network
execute:
build: .
container_name: defrag_execute
depends_on:
postgres:
condition: service_healthy
environment:
DB_HOST: postgres
DB_PORT: 5432
DB_NAME: disk_reorganizer_db
DB_USER: disk_reorg_user
DB_PASSWORD: heel-goed-wachtwoord
volumes:
- ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source
- ${HOST_TARGET_PATH:-/mnt/target}:/mnt/target
- ./plans:/app/plans
- ./config:/app/config
- ./logs:/app/logs
command: ["python", "app/main.py", "execute", "/app/plans/plan.json"]
profiles:
- execute-only
networks:
- defrag-network
dry-run:
build: .
container_name: defrag_dry_run
depends_on:
postgres:
condition: service_healthy
environment:
DB_HOST: postgres
DB_PORT: 5432
DB_NAME: disk_reorganizer_db
DB_USER: disk_reorg_user
DB_PASSWORD: heel-goed-wachtwoord
volumes:
- ./plans:/app/plans
- ./config:/app/config
- ./logs:/app/logs
command: ["python", "app/main.py", "execute", "/app/plans/plan.json", "--dry-run"]
profiles:
- dry-run-only
networks:
- defrag-network
report:
build: .
container_name: defrag_report
depends_on:
postgres:
condition: service_healthy
environment:
DB_HOST: postgres
DB_PORT: 5432
DB_NAME: disk_reorganizer_db
DB_USER: disk_reorg_user
DB_PASSWORD: heel-goed-wachtwoord
volumes:
- ./reports:/app/reports
- ./logs:/app/logs
command: ["python", "app/main.py", "report"]
profiles:
- report-only
networks:
- defrag-network
# Monitoring and Admin Services
pgadmin:
image: dpage/pgadmin4:latest
container_name: defrag_pgadmin
environment:
PGADMIN_DEFAULT_EMAIL: admin@defrag.local
PGADMIN_DEFAULT_PASSWORD: admin123
volumes:
- pgadmin_data:/var/lib/pgadmin
ports:
- "5050:80"
depends_on:
- postgres
profiles:
- monitoring
networks:
- defrag-network
redis-commander:
image: rediscommander/redis-commander:latest
container_name: defrag_redis_commander
environment:
REDIS_HOSTS: local:redis:6379
ports:
- "8081:8081"
depends_on:
- redis
profiles:
- monitoring
networks:
- defrag-network
flyway:
image: flyway/flyway:latest
container_name: flyway
volumes:
- ./sql/migration:/flyway/sql:ro
environment:
FLYWAY_URL: jdbc:postgresql://192.168.1.159:5432/disk_reorganizer_db
FLYWAY_USER: disk_reorg_user
FLYWAY_PASSWORD: heel-goed-wachtwoord
FLYWAY_SCHEMAS: public
FLYWAY_LOCATIONS: filesystem:./sql
FLYWAY_CONNECT_RETRIES: "60"
command: migrate
restart: "no"
pg_backup:
image: postgres:16
container_name: pg_backup
environment:
PGPASSWORD: heel-goed-wachtwoord
volumes:
- ./:/backup
command:
- bash
- -lc
- >
pg_dump -h 192.168.1.159 -p 5432 -U disk_reorg_user -d disk_reorganizer_db
--format=custom --no-owner --no-privileges
-f /backup/backup_$(date +%F_%H%M)_disk_reorganizer_db.dump
restart: "no"
networks:
defrag-network:
driver: bridge
volumes:
postgres_data:
driver: local
redis_data:
driver: local
pgadmin_data:
driver: local

7
flyway.conf Normal file
View File

@@ -0,0 +1,7 @@
flyway.url=jdbc:postgresql://192.168.1.159:5432/disk_reorganizer_db
flyway.user=disk_reorg_user
flyway.password=heel-goed-wachtwoord
flyway.locations=filesystem:sql/migration
flyway.schemas=public

74
pyproject.toml Normal file
View File

@@ -0,0 +1,74 @@
[build-system]
requires = ["setuptools>=65.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "defrag"
version = "1.0.0"
description = "Intelligent disk reorganization system for 20TB+ data"
readme = "README.md"
requires-python = ">=3.9"
license = {text = "MIT"}
authors = [
{name = "Project Defrag"}
]
keywords = ["disk", "storage", "deduplication", "classification", "migration"]
classifiers = [
"Development Status :: 4 - Beta",
"Intended Audience :: System Administrators",
"Topic :: System :: Filesystems",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
]
dependencies = [
"psycopg2-binary>=2.9.0",
"psutil>=5.9.0",
"pandas>=1.5.0",
"pyarrow>=10.0.0",
"python-magic>=0.4.27",
]
[project.optional-dependencies]
redis = ["redis>=4.5.0"]
ml = ["scikit-learn>=1.2.0", "numpy>=1.24.0"]
dev = [
"pytest>=7.2.0",
"pytest-cov>=4.0.0",
"black>=23.0.0",
"mypy>=1.0.0",
"flake8>=6.0.0",
]
all = [
"redis>=4.5.0",
"scikit-learn>=1.2.0",
"numpy>=1.24.0",
]
[project.scripts]
defrag = "main:main"
[tool.black]
line-length = 100
target-version = ['py39', 'py310', 'py311', 'py312']
include = '\.pyi?$'
[tool.mypy]
python_version = "3.9"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = false
disallow_incomplete_defs = false
check_untyped_defs = true
no_implicit_optional = true
[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
addopts = "-v --cov=. --cov-report=html --cov-report=term"
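
The optional-dependencies table above maps directly to pip extras; a minimal sketch of installing them in editable mode, run from the repository root:

```bash
# Dev tooling plus the optional Redis and ML features:
pip install -e ".[dev,redis,ml]"

# Or the bundled 'all' extra:
pip install -e ".[all]"
```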

41
requirements.txt Normal file
View File

@@ -0,0 +1,41 @@
# Core dependencies
psycopg2-binary>=2.9.9    # PostgreSQL database adapter for Python
# Alternative: psycopg2>=2.9.9 (requires PostgreSQL development libraries);
# use psycopg2-binary for easier installation without compilation
psutil>=5.9.0
# Data processing
pandas>=1.5.0
pyarrow>=10.0.0
# File type detection
python-magic>=0.4.27
chardet
# Optional/feature dependencies
redis>=4.5.0              # For RedisHashStore (optional)
scikit-learn>=1.2.0       # For MLClassifier (optional)
numpy>=1.24.0             # For MLClassifier (optional)
# Development dependencies
pytest>=7.2.0
pytest-cov>=4.0.0
black>=23.0.0
mypy>=1.0.0
flake8>=6.0.0
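
Alternatively, a plain install from the pinned list, sketched here with a throwaway virtual environment:

```bash
python -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
```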

51
setup.sh Normal file
View File

@@ -0,0 +1,51 @@
#!/bin/bash
# setup.sh - Complete Docker setup for Project Defrag
set -e
echo "🚀 Setting up Project Defrag with Docker..."
# 1. Create necessary directories
echo "📁 Creating directories..."
mkdir -p {config,plans,logs,reports,sql/migration}
# 2. Copy environment file
if [ ! -f .env ]; then
echo "⚙️ Creating .env file from template..."
cp .env.example .env
echo "⚠️ Please edit .env file with your configuration!"
fi
# 3. Build the Docker image
echo "🐳 Building Docker image..."
docker compose build app
# 4. Start the database
#echo "🗄️ Starting PostgreSQL database..."
#docker-compose up -d postgres
# 5. Wait for database to be ready
#echo "⏳ Waiting for database to be ready..."
#sleep 10
# 6. Run database initialization
#echo "📊 Initializing database..."
#docker-compose exec -T postgres psql -U disk_reorg_user -d disk_reorganizer_db -f /docker-entrypoint-initdb.d/init.sql
# 7. Start optional services
echo "🔧 Starting monitoring services..."
docker compose --profile monitoring up -d
echo "✅ Setup complete!"
echo ""
echo "📋 Available commands:"
echo " docker compose up -d # Start all services"
echo " docker compose --profile index-only up index # Run index only"
echo " docker compose --profile plan-only up plan # Generate plan"
echo " docker compose --profile dry-run-only up dry-run # Dry run"
echo " docker compose --profile execute-only up execute # Execute migration"
echo " docker compose --profile report-only up report # Generate report"
echo ""
echo "🌐 Access monitoring:"
echo " - PostgreSQL Admin: http://localhost:5050"
echo " - Redis Commander: http://localhost:8081"

61
sql/legacy_setup.sql Normal file
View File

@@ -0,0 +1,61 @@
-- PostgreSQL Database Setup Script for Disk Reorganizer
-- Database: disk_reorganizer_db
-- User: disk_reorg_user
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS "pgcrypto";
-- Create the database (run as superuser: auction)
CREATE DATABASE disk_reorganizer_db
WITH
ENCODING = 'UTF8'
LC_COLLATE = 'en_US.UTF-8'
LC_CTYPE = 'en_US.UTF-8'
TEMPLATE = template0;
-- Connect to the new database
\c disk_reorganizer_db
-- Create the user
CREATE USER disk_reorg_user WITH PASSWORD 'heel-goed-wachtwoord';
-- Create files table
-- Create index on disk column for faster queries
-- Grant privileges to disk_reorg_user
GRANT CONNECT ON DATABASE disk_reorganizer_db TO disk_reorg_user;
GRANT USAGE ON SCHEMA public TO disk_reorg_user;
GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO disk_reorg_user;
GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO disk_reorg_user;
-- future tables/sequences created by your owner role (pick the role that creates them)
ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public
GRANT ALL PRIVILEGES ON TABLES TO disk_reorg_user;
ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public
GRANT ALL PRIVILEGES ON SEQUENCES TO disk_reorg_user;
-- Create function to update updated_at timestamp
CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER AS
$$
BEGIN
NEW.updated_at = CURRENT_TIMESTAMP;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Create trigger for files table
CREATE TRIGGER update_files_updated_at
BEFORE UPDATE
ON files
FOR EACH ROW
EXECUTE FUNCTION update_updated_at_column();
-- Display success message
\echo 'Database setup completed successfully!'
\echo 'Database: disk_reorganizer_db'
\echo 'User: disk_reorg_user'
\echo 'Tables created: files, operations'
\echo 'Indexes and triggers created'
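
The script's own comments say it must be run by the existing superuser role (`auction`) against the default database, since it creates `disk_reorganizer_db` and then reconnects to it; a minimal invocation sketch (host and port are assumptions):

```bash
psql -h localhost -p 5432 -U auction -d postgres -f sql/legacy_setup.sql
```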

188
sql/init.sql Normal file
View File

@@ -0,0 +1,188 @@
-- sql/init.sql
-- Initialize PostgreSQL database for Project Defrag
-- Enable useful extensions
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS "pgcrypto";
-- future tables/sequences created by your owner role (pick the role that creates them)
ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public
GRANT ALL PRIVILEGES ON TABLES TO disk_reorg_user;
ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public
GRANT ALL PRIVILEGES ON SEQUENCES TO disk_reorg_user;
ALTER DATABASE disk_reorganizer_db OWNER TO disk_reorg_user;
-- Files table
CREATE TABLE IF NOT EXISTS files
(
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
path TEXT NOT NULL,
size BIGINT NOT NULL,
modified_time TIMESTAMP WITH TIME ZONE,
created_time TIMESTAMP WITH TIME ZONE,
file_hash VARCHAR(64), -- SHA-256 hash
checksum VARCHAR(64), -- Alias for file_hash (legacy compatibility)
category VARCHAR(50),
disk_label VARCHAR(50),
last_verified TIMESTAMP WITH TIME ZONE,
status VARCHAR(20) DEFAULT 'indexed',
duplicate_of TEXT, -- Path to canonical file if this is a duplicate
-- Metadata
metadata JSONB DEFAULT '{}',
-- Audit fields
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
-- Constraints
CONSTRAINT unique_file_path UNIQUE (path)
);
-- Operations table (audit log)
CREATE TABLE IF NOT EXISTS operations
(
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
operation_type VARCHAR(50) NOT NULL,
source_path TEXT,
target_path TEXT,
status VARCHAR(20) NOT NULL,
-- Legacy compatibility fields
executed INTEGER DEFAULT 0,
verified INTEGER DEFAULT 0,
error TEXT,
-- File reference
file_id UUID REFERENCES files (id) ON DELETE SET NULL,
-- Performance metrics
duration_ms INTEGER,
bytes_processed BIGINT,
-- Error information
error_message TEXT,
error_details JSONB,
-- Context
session_id VARCHAR(100),
user_agent TEXT,
-- Audit fields
started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
completed_at TIMESTAMP WITH TIME ZONE,
executed_at TIMESTAMP WITH TIME ZONE,
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);
-- Deduplication hash store
CREATE TABLE IF NOT EXISTS deduplication_store
(
hash VARCHAR(64) PRIMARY KEY,
canonical_path TEXT NOT NULL,
reference_count INTEGER DEFAULT 1,
first_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
last_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);
-- Migration plan table
CREATE TABLE IF NOT EXISTS migration_plans
(
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
name VARCHAR(100) NOT NULL,
source_disk VARCHAR(50) NOT NULL,
target_disk VARCHAR(50) NOT NULL,
plan_json JSONB NOT NULL,
-- Statistics
total_files INTEGER DEFAULT 0,
total_size BIGINT DEFAULT 0,
estimated_duration INTEGER, -- in seconds
-- Status
status VARCHAR(20) DEFAULT 'draft',
-- Audit
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
executed_at TIMESTAMP WITH TIME ZONE,
completed_at TIMESTAMP WITH TIME ZONE
);
-- Indexes for performance
CREATE INDEX IF NOT EXISTS idx_files_path ON files (path);
CREATE INDEX IF NOT EXISTS idx_files_hash ON files (file_hash);
CREATE INDEX IF NOT EXISTS idx_files_disk ON files (disk_label);
CREATE INDEX IF NOT EXISTS idx_files_category ON files (category);
CREATE INDEX IF NOT EXISTS idx_files_status ON files (status);
CREATE INDEX IF NOT EXISTS idx_files_checksum ON files (checksum);
CREATE INDEX IF NOT EXISTS idx_files_checksum_path ON files (checksum, path);
CREATE INDEX IF NOT EXISTS idx_operations_status ON operations (status);
CREATE INDEX IF NOT EXISTS idx_operations_created ON operations (created_at);
CREATE INDEX IF NOT EXISTS idx_operations_file_id ON operations (file_id);
CREATE INDEX IF NOT EXISTS idx_dedup_canonical ON deduplication_store (canonical_path);
-- Functions for updating timestamps
CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER AS
$$
BEGIN
NEW.updated_at = CURRENT_TIMESTAMP;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Triggers for automatic updated_at
CREATE TRIGGER update_files_updated_at
BEFORE UPDATE
ON files
FOR EACH ROW
EXECUTE FUNCTION update_updated_at_column();
-- View for operational dashboard
CREATE OR REPLACE VIEW operational_dashboard AS
SELECT o.status,
COUNT(*) as operation_count,
SUM(o.bytes_processed) as total_bytes,
AVG(o.duration_ms) as avg_duration_ms,
MIN(o.started_at) as earliest_operation,
MAX(o.completed_at) as latest_operation
FROM operations o
WHERE o.started_at > CURRENT_TIMESTAMP - INTERVAL '24 hours'
GROUP BY o.status;
-- View for disk usage statistics
CREATE OR REPLACE VIEW disk_usage_stats AS
SELECT disk_label,
COUNT(*) as file_count,
SUM(size) as total_size,
AVG(size) as avg_file_size,
MIN(created_time) as oldest_file,
MAX(modified_time) as newest_file
FROM files
GROUP BY disk_label;
-- Insert default configuration
INSERT INTO migration_plans (name, source_disk, target_disk, plan_json, status)
VALUES ('Default Migration Plan',
'disk_d',
'disk_e',
'{"strategy": "hardlink", "verify_copies": true, "preserve_timestamps": true}'::jsonb,
'draft')
ON CONFLICT DO NOTHING;
-- Create read-only user for monitoring
DO
$$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'monitor_user') THEN
CREATE USER monitor_user WITH PASSWORD 'monitor_password';
END IF;
END
$$;
GRANT CONNECT ON DATABASE disk_reorganizer_db TO monitor_user;
GRANT USAGE ON SCHEMA public TO monitor_user;
GRANT SELECT ON ALL TABLES IN SCHEMA public TO monitor_user;
GRANT SELECT ON operational_dashboard TO monitor_user;
GRANT SELECT ON disk_usage_stats TO monitor_user;
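
The read-only `monitor_user` created above is enough to exercise the two dashboard views; a minimal sketch, assuming the compose `postgres` service is published on `localhost:5432`:

```bash
psql "host=localhost port=5432 dbname=disk_reorganizer_db user=monitor_user password=monitor_password" \
  -c "SELECT * FROM disk_usage_stats ORDER BY total_size DESC;" \
  -c "SELECT * FROM operational_dashboard;"
```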

View File

@@ -0,0 +1,11 @@
-- Add extracted text and enrichment columns
ALTER TABLE files ADD COLUMN IF NOT EXISTS extracted_text TEXT;
ALTER TABLE files ADD COLUMN IF NOT EXISTS text_quality VARCHAR(20);
ALTER TABLE files ADD COLUMN IF NOT EXISTS enrichment JSONB;
-- Add indexes for text search
CREATE INDEX IF NOT EXISTS idx_files_extracted_text ON files USING gin(to_tsvector('english', extracted_text));
CREATE INDEX IF NOT EXISTS idx_files_enrichment ON files USING gin(enrichment);
-- Add full text search capability
CREATE INDEX IF NOT EXISTS idx_files_fts ON files USING gin(to_tsvector('english', COALESCE(extracted_text, '')));
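
Queries only hit the `idx_files_fts` GIN index when they repeat the indexed expression verbatim; a sketch of such a search (the term `invoice` and the connection details are illustrative):

```bash
psql -h localhost -U disk_reorg_user -d disk_reorganizer_db -c "
  SELECT path, text_quality
  FROM files
  WHERE to_tsvector('english', COALESCE(extracted_text, '')) @@ plainto_tsquery('english', 'invoice')
  LIMIT 20;"
```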

View File

@@ -0,0 +1,41 @@
CREATE TABLE IF NOT EXISTS folders
(
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
path TEXT NOT NULL UNIQUE,
parent_path TEXT,
disk_label VARCHAR(50),
file_count INT DEFAULT 0,
total_size BIGINT DEFAULT 0,
project_type VARCHAR(50),
intent TEXT,
summary TEXT,
has_readme BOOLEAN DEFAULT FALSE,
has_git BOOLEAN DEFAULT FALSE,
has_manifest BOOLEAN DEFAULT FALSE,
manifest_types TEXT[],
dominant_file_types JSONB,
structure JSONB,
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX IF NOT EXISTS idx_folders_path ON folders (path);
CREATE INDEX IF NOT EXISTS idx_folders_parent ON folders (parent_path);
CREATE INDEX IF NOT EXISTS idx_folders_disk ON folders (disk_label);
CREATE INDEX IF NOT EXISTS idx_folders_project_type ON folders (project_type);
CREATE TABLE IF NOT EXISTS processing_checkpoints
(
task_name VARCHAR(100) PRIMARY KEY,
last_processed_id TEXT,
last_processed_path TEXT,
processed_count INT DEFAULT 0,
total_count INT,
started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);
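
`processing_checkpoints` keys on `task_name`, so long-running tasks can record progress with an upsert and resume from `last_processed_path` after a restart; a minimal sketch (the task name and path are illustrative, not defined by the project):

```bash
psql -h localhost -U disk_reorg_user -d disk_reorganizer_db -c "
  INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, total_count)
  VALUES ('enrich_text', '/mnt/source/docs/example.pdf', 1500, 420000)
  ON CONFLICT (task_name) DO UPDATE
    SET last_processed_path = EXCLUDED.last_processed_path,
        processed_count     = EXCLUDED.processed_count,
        updated_at          = CURRENT_TIMESTAMP;"
```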