diff --git a/.gitignore b/.gitignore
index 90e306f..d2efeda 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,13 +1,7 @@
-### PythonVanilla template
-# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
-
-# C extensions
*.so
-
-# Distribution / packaging
.Python
build/
develop-eggs/
@@ -21,48 +15,28 @@ parts/
sdist/
var/
wheels/
-share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
-MANIFEST
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+*.sqlite3
+*.db
+*.log
+coverage.xml
+*.coverage
.coverage
.coverage.*
.cache
nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
+pytest.xml
+htmlcov/
+.tox/
.pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# pyenv
-# For a library or package, you might want to ignore these files since the code is
-# intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-# However, in case of collaboration, if having platform-specific dependencies or dependencies
-# having no cross-platform support, pipenv may install dependencies that don't work, or not
-# install all needed dependencies.
-#Pipfile.lock
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow
-__pypackages__/
-
-
+.mypy_cache/
+.pyre/
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..ab1f416
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,10 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Ignored default folder with query files
+/queries/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
diff --git a/.idea/copilot.data.migration.agent.xml b/.idea/copilot.data.migration.agent.xml
new file mode 100644
index 0000000..4ea72a9
--- /dev/null
+++ b/.idea/copilot.data.migration.agent.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/defrag.iml b/.idea/defrag.iml
new file mode 100644
index 0000000..3b7c44a
--- /dev/null
+++ b/.idea/defrag.iml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/material_theme_project_new.xml b/.idea/material_theme_project_new.xml
new file mode 100644
index 0000000..3940945
--- /dev/null
+++ b/.idea/material_theme_project_new.xml
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..528f4bb
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..2c30754
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
new file mode 100644
index 0000000..da2ffef
--- /dev/null
+++ b/ARCHITECTURE.md
@@ -0,0 +1,340 @@
+# Data Reorganization Architecture: "Project Defrag"
+
+## Executive Summary
+
+This document outlines the architecture for reorganizing 20TB of backup data across multiple NVMe drives and servers. The solution implements intelligent deduplication, systematic categorization, and optimized storage patterns for enhanced performance and maintainability.
+
+## System Architecture Overview
+
+```mermaid
+graph TB
+ subgraph "Source Environment"
+ A["Local Machine
8x NVMe + 1 HDD
~10TB"]
+ B["Server Machine
Mixed Storage
~10TB"]
+ end
+
+ subgraph "Processing Layer"
+ C["Discovery Engine"]
+ D["Classification Engine"]
+ E["Deduplication Engine"]
+ F["Migration Engine"]
+ end
+
+ subgraph "Target Architecture"
+ G["App Volumes"]
+ H["Gitea Repository"]
+ I["Build Cache (.maven, pycache)"]
+ J["Artifactories"]
+ K["Databases"]
+ L["Backups"]
+ M["LLM Model Cache"]
+ N["Git Infrastructure"]
+ end
+
+ A --> C
+ B --> C
+ C --> D
+ D --> E
+ E --> F
+ F --> G
+ F --> H
+ F --> I
+ F --> J
+ F --> K
+ F --> L
+ F --> M
+ F --> N
+```
+
+## Data Flow Architecture
+
+### Phase 1: Discovery & Assessment
+```mermaid
+sequenceDiagram
+ participant D as Discovery Engine
+ participant FS as File System Scanner
+ participant DB as Metadata Database
+ participant API as System APIs
+
+ D->>FS: Scan directory structures
+ FS->>FS: Identify file types, sizes, dates
+ FS->>DB: Store file metadata
+ D->>API: Query system information
+ API->>DB: Store system context
+ DB->>D: Return analysis summary
+```
+
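+A minimal sketch of this phase, assuming a plain SQLite file as the metadata store (the table name and columns here are illustrative, not the project's final schema):
+
+```python
+import os
+import sqlite3
+from pathlib import Path
+
+
+def discover(root: Path, db_path: str = "metadata.db") -> int:
+    """Walk a directory tree iteratively and persist basic file metadata."""
+    conn = sqlite3.connect(db_path)
+    conn.execute(
+        "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime REAL)"
+    )
+    count = 0
+    stack = [root]
+    while stack:
+        current = stack.pop()
+        try:
+            with os.scandir(current) as entries:
+                for entry in entries:
+                    if entry.is_dir(follow_symlinks=False):
+                        stack.append(Path(entry.path))
+                    elif entry.is_file(follow_symlinks=False):
+                        st = entry.stat(follow_symlinks=False)
+                        conn.execute(
+                            "INSERT OR REPLACE INTO files VALUES (?, ?, ?)",
+                            (entry.path, st.st_size, st.st_mtime),
+                        )
+                        count += 1
+        except PermissionError:
+            continue  # skip unreadable directories instead of aborting the scan
+    conn.commit()
+    conn.close()
+    return count
+```
+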
+### Phase 2: Classification & Deduplication
+```mermaid
+sequenceDiagram
+ participant C as Classifier
+ participant DH as Deduplication Hash
+ participant CDB as Canonical DB
+ participant MAP as Mapping Store
+
+ C->>C: Analyze file signatures
+ C->>DH: Generate content hashes
+ DH->>CDB: Check for duplicates
+ CDB->>DH: Return canonical reference
+ DH->>MAP: Store deduplication map
+ C->>C: Apply categorization rules
+```
+
+## Target Directory Structure
+
+```
+/mnt/organized/
+├── apps/
+│ ├── volumes/
+│ │ ├── docker-volumes/
+│ │ ├── app-data/
+│ │ └── user-profiles/
+│ └── runtime/
+├── development/
+│ ├── gitea/
+│ │ ├── repositories/
+│ │ ├── lfs-objects/
+│ │ └── avatars/
+│ ├── git-infrastructure/
+│ │ ├── hooks/
+│ │ ├── templates/
+│ │ └── config/
+│ └── build-tools/
+│ ├── .maven/repository/
+│ ├── gradle-cache/
+│ └── sbt-cache/
+├── artifacts/
+│ ├── java/
+│ │ ├── maven-central-cache/
+│ │ ├── jfrog-artifactory/
+│ │ └── gradle-build-cache/
+│ ├── python/
+│ │ ├── pypi-cache/
+│ │ ├── wheelhouse/
+│ │ └── pip-cache/
+│ ├── node/
+│ │ ├── npm-registry/
+│ │ ├── yarn-cache/
+│ │ └── pnpm-store/
+│ └── go/
+│ ├── goproxy-cache/
+│ ├── module-cache/
+│ └── sumdb-cache/
+├── cache/
+│ ├── llm-models/
+│ │ ├── hugging-face/
+│ │ ├── openai-cache/
+│ │ └── local-llm/
+│ ├── pycache/
+│ ├── node_modules-archive/
+│ └── browser-cache/
+├── databases/
+│ ├── postgresql/
+│ ├── mysql/
+│ ├── mongodb/
+│ └── redis/
+├── backups/
+│ ├── system/
+│ ├── application/
+│ ├── database/
+│ └── archive/
+└── temp/
+ ├── processing/
+ ├── staging/
+ └── cleanup/
+```
+
+## Technology Stack Recommendation
+
+### Primary Language: **Python 3.11+**
+**Rationale:**
+- Excellent file system handling capabilities
+- Rich ecosystem for data processing (pandas, pyarrow)
+- Built-in multiprocessing for I/O operations
+- Superior hash library support for deduplication
+- Cross-platform compatibility
+
+### Key Libraries:
+```python
+# Core processing
+import asyncio
+import hashlib
+import multiprocessing as mp
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
+
+# Data handling
+import pandas as pd
+import pyarrow as pa
+import sqlite3
+import json
+
+# File analysis
+import magic # python-magic
+import mimetypes
+import filetype
+
+# System integration
+import psutil
+import shutil
+import os
+```
+
+## Deduplication Strategy
+
+### Algorithm Selection: **Variable-Size Chunking with Rabin Fingerprinting**
+
+```python
+class AdvancedDeduplication:
+    def __init__(self, avg_chunk_size=8192):
+        self.chunker = RabinChunker(avg_chunk_size)
+        self.hash_store = HashStore()
+
+    def deduplicate_file(self, file_path):
+        chunks = self.chunker.chunk_file(file_path)
+        file_hash = self.compute_file_hash(chunks)
+
+        if self.hash_store.exists(file_hash):
+            return self.create_reference(file_hash)
+        else:
+            self.store_canonical(file_path, file_hash)
+            return file_hash
+```
+
+### Performance Optimization:
+- **Parallel Processing**: Utilize all CPU cores for hashing
+- **Memory Mapping**: For large files (>100MB)
+- **Incremental Hashing**: Process files in streams (see the sketch below)
+- **Cache Layer**: Redis for frequently accessed hashes
+
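+A minimal sketch of the incremental-hashing idea, assuming BLAKE2b as the digest and a 1 MiB read size (both are illustrative choices, not measured optima):
+
+```python
+import hashlib
+from pathlib import Path
+
+
+def stream_hash(path: Path, chunk_size: int = 1024 * 1024) -> str:
+    """Hash a file in fixed-size reads so memory use stays flat for any file size."""
+    digest = hashlib.blake2b()
+    with path.open("rb") as handle:
+        for block in iter(lambda: handle.read(chunk_size), b""):
+            digest.update(block)
+    return digest.hexdigest()
+```
+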
+## Classification Engine
+
+### Rule-Based Classification System:
+
+```yaml
+classification_rules:
+  build_artifacts:
+    patterns:
+      - "**/target/**"
+      - "**/build/**"
+      - "**/dist/**"
+      - "**/node_modules/**"
+    action: categorize_as_build_cache
+
+  development_tools:
+    patterns:
+      - "**/.maven/**"
+      - "**/.gradle/**"
+      - "**/.npm/**"
+      - "**/.cache/**"
+    action: categorize_as_tool_cache
+
+  repositories:
+    patterns:
+      - "**/.git/**"
+      - "**/repositories/**"
+      - "**/gitea/**"
+    action: categorize_as_vcs
+
+  database_files:
+    patterns:
+      - "**/*.db"
+      - "**/*.sqlite"
+      - "**/postgresql/**"
+      - "**/mysql/**"
+    action: categorize_as_database
+
+  model_files:
+    patterns:
+      - "**/*.bin"
+      - "**/*.onnx"
+      - "**/models/**"
+      - "**/llm*/**"
+    action: categorize_as_ai_model
+```
+
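+A minimal interpretation of these rules in code, assuming the YAML above is loaded with PyYAML and first-match-wins ordering; note that `fnmatch` lets `*` cross path separators, which approximates the `**` globs used here:
+
+```python
+import fnmatch
+from pathlib import PurePath
+
+import yaml  # PyYAML, assumed available
+
+
+def classify(path: str, rules: dict) -> str | None:
+    """Return the action of the first rule whose pattern matches the path."""
+    posix = PurePath(path).as_posix()
+    for rule in rules["classification_rules"].values():
+        for pattern in rule["patterns"]:
+            if fnmatch.fnmatch(posix, pattern):
+                return rule["action"]
+    return None
+
+
+with open("classification_rules.yaml") as fh:  # assumed file name
+    rules = yaml.safe_load(fh)
+print(classify("work/project/target/app.jar", rules))  # -> categorize_as_build_cache
+```
+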
+## Performance Considerations
+
+### NVMe Optimization Strategies:
+
+1. **Parallel I/O Operations**
+ - Queue depth optimization (32-64 operations)
+ - Async I/O with io_uring where available
+ - Multi-threaded directory traversal (see the sketch below)
+
+2. **Memory Management**
+ - Streaming processing for large files
+ - Memory-mapped file access
+ - Buffer pool for frequent operations
+
+3. **CPU Optimization**
+ - SIMD instructions for hashing (AVX2/NEON)
+ - Process pool for parallel processing
+ - NUMA-aware memory allocation
+
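+One possible shape for the multi-threaded traversal mentioned above: one worker per top-level directory, which keeps several NVMe queues busy without shared state. The worker count and the per-file work are placeholders:
+
+```python
+import os
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+
+
+def scan_subtree(directory: Path) -> int:
+    """Walk one subtree and total file sizes (stands in for real per-file work)."""
+    total = 0
+    for dirpath, _dirnames, filenames in os.walk(directory):
+        for name in filenames:
+            try:
+                total += os.stat(os.path.join(dirpath, name)).st_size
+            except OSError:
+                pass  # vanished or unreadable files are ignored
+    return total
+
+
+def parallel_scan(root: Path, workers: int = 8) -> int:
+    subtrees = [p for p in root.iterdir() if p.is_dir()]
+    with ThreadPoolExecutor(max_workers=workers) as pool:
+        return sum(pool.map(scan_subtree, subtrees))
+```
+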
+## Migration Strategy
+
+### Three-Phase Approach:
+
+```mermaid
+graph LR
+ A[Phase 1: Analysis] --> B[Phase 2: Staging]
+ B --> C[Phase 3: Migration]
+
+ A --> A1[Discovery Scan]
+ A --> A2[Deduplication Analysis]
+ A --> A3[Space Calculation]
+
+ B --> B1[Create Target Structure]
+ B --> B2[Hard Link Staging]
+ B --> B3[Validation Check]
+
+ C --> C1[Atomic Move Operations]
+ C --> C2[Symlink Updates]
+ C --> C3[Cleanup Verification]
+```
+
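+A sketch of the staging and promotion steps, assuming the staging area and the final target live on the same filesystem: hard-link where possible (instant, no extra space), copy as a cross-device fallback, then promote with an atomic rename. Function names are illustrative.
+
+```python
+import os
+import shutil
+from pathlib import Path
+
+
+def stage(source: Path, staged: Path) -> None:
+    """Stage a file via hard link when possible, otherwise copy with metadata."""
+    staged.parent.mkdir(parents=True, exist_ok=True)
+    try:
+        os.link(source, staged)       # same filesystem: no data is duplicated
+    except OSError:
+        shutil.copy2(source, staged)  # cross-device fallback
+
+
+def promote(staged: Path, final: Path) -> None:
+    """Atomically move a staged file into the organized tree (same filesystem)."""
+    final.parent.mkdir(parents=True, exist_ok=True)
+    os.replace(staged, final)
+```
+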
+## Monitoring & Validation
+
+### Key Metrics:
+- **Processing Rate**: Files/second, GB/hour
+- **Deduplication Ratio**: Original vs. Final size
+- **Error Rate**: Failed operations percentage
+- **Resource Usage**: CPU, Memory, I/O utilization
+
+### Validation Checks:
+- File integrity verification (hash comparison)
+- Directory structure validation
+- Symlink resolution testing
+- Permission preservation audit
+
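+The integrity and permission checks above could look like this minimal sketch; the digest mirrors the streaming approach shown earlier, and the permission check covers mode bits only:
+
+```python
+import hashlib
+from pathlib import Path
+
+
+def file_digest(path: Path) -> str:
+    digest = hashlib.blake2b()
+    with path.open("rb") as handle:
+        for block in iter(lambda: handle.read(1024 * 1024), b""):
+            digest.update(block)
+    return digest.hexdigest()
+
+
+def verify_copy(source: Path, copy: Path) -> bool:
+    """Cheap checks first (size, mode bits), full content hash last."""
+    src, dst = source.stat(), copy.stat()
+    if src.st_size != dst.st_size:
+        return False
+    if (src.st_mode & 0o777) != (dst.st_mode & 0o777):
+        return False
+    return file_digest(source) == file_digest(copy)
+```
+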
+## Risk Mitigation
+
+### Safety Measures:
+1. **Read-First Approach**: Never modify source until validation
+2. **Incremental Processing**: Process in small batches
+3. **Backup Verification**: Ensure backup integrity before operations
+4. **Rollback Capability**: Maintain reverse mapping for recovery
+5. **Dry-Run Mode**: Preview all operations before execution
+
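+The reverse mapping for rollback can be as simple as an append-only JSON-lines journal written before each move; the journal location and record fields below are assumptions:
+
+```python
+import json
+import os
+import time
+from pathlib import Path
+
+JOURNAL = Path("reverse_mapping.jsonl")  # assumed location
+
+
+def record_move(source: Path, destination: Path) -> None:
+    """Append one reversible operation before it is carried out."""
+    entry = {"ts": time.time(), "from": str(source), "to": str(destination)}
+    with JOURNAL.open("a", encoding="utf-8") as handle:
+        handle.write(json.dumps(entry) + "\n")
+
+
+def rollback() -> None:
+    """Undo recorded moves in reverse order."""
+    for line in reversed(JOURNAL.read_text(encoding="utf-8").splitlines()):
+        entry = json.loads(line)
+        os.replace(entry["to"], entry["from"])
+```
+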
+## Implementation Timeline
+
+### Phase 1: Tool Development (2-3 weeks)
+- Core discovery engine
+- Classification system
+- Basic deduplication
+- Testing framework
+
+### Phase 2: Staging & Validation (1-2 weeks)
+- Target structure creation
+- Sample data processing
+- Performance optimization
+- Safety verification
+
+### Phase 3: Production Migration (2-4 weeks)
+- Full data processing
+- Continuous monitoring
+- Issue resolution
+- Final validation
+
+This architecture provides a robust, scalable solution for your data reorganization needs while maintaining data integrity and optimizing for your NVMe storage infrastructure.
\ No newline at end of file
diff --git a/README.md b/README.md
index 3898d80..aa90f9d 100644
--- a/README.md
+++ b/README.md
@@ -1,1014 +1,114 @@
-simplify, Combine, make more sharp architectural, professional, bbut mostly WAY more short;
-# Implementation Summary - Disk Reorganizer PostgreSQL Migration
-
-## ✅ All Tasks Completed
-
-### 1. PostgreSQL Database Setup Scripts ✅
-
-**Files Created:**
-- `setup_database.sql` - Complete database schema with:
- - Database creation: `disk_reorganizer_db`
- - User creation: `disk_reorg_user` with password
- - Tables: `files` and `operations` with proper indexes
- - Triggers for automatic timestamp updates
- - Full privilege grants
-
-- `setup_database.sh` - Automated setup for Linux/Mac
-- `setup_database.bat` - Automated setup for Windows
-
-**Connection Details:**
-- Host: `192.168.1.159:5432`
-- Database: `disk_reorganizer_db`
-- User: `disk_reorg_user`
-- Password: `heel-goed-wachtwoord`
-- Superuser: `auction` (for initial setup)
-
-### 2. PostgreSQL Driver Integration ✅
-
-**src/main.py Modified:**
-- ✅ Replaced `sqlite3` with `psycopg2`
-- ✅ All SQL queries converted to PostgreSQL syntax
-- ✅ Connection pooling and proper connection management
-- ✅ Error handling for PostgreSQL-specific exceptions
-- ✅ Parameterized queries using `%s` (PostgreSQL style)
-- ✅ INSERT ... ON CONFLICT for upsert operations
-- ✅ All database operations tested and verified
-
-**Database Operations Updated:**
-- `init_database()` - Verifies PostgreSQL connection and tables
-- `get_connection()` - Returns PostgreSQL connection
-- `index_disk()` - Uses PostgreSQL with dynamic logging
-- `calculate_disk_usage()` - PostgreSQL queries
-- `plan_migration()` - PostgreSQL transaction management
-- `execute_migration()` - PostgreSQL with progress tracking
-- `generate_report()` - PostgreSQL aggregation queries
-
-### 3. Dynamic In-Screen Logging ✅
-
-**Implemented in:**
-
-**index_disk() function:**
-```
-Indexing: 12,543 files | 45.3 GB | 823 files/s | D:\Documents\Photos\...vacation.jpg
-```
-- Real-time file count
-- Running total size (human-readable)
-- Processing speed (files/second)
-- Current file being processed
-- Uses `\r` for in-place updates (no log spam)
-
-**execute_migration() function:**
-```
-[1523/5000] 1520 OK, 3 ERR | 12.3 files/s | ETA: 283s | Documents\project\file.txt
-```
-- Progress counter (current/total)
-- Success and error counts
-- Processing rate
-- Estimated time remaining (ETA)
-- Current file being processed
-- Dynamic updates every file
-
-**Technical Implementation:**
-- `print()` with `end=''` and `flush=True` for immediate display
-- Path truncation for long filenames
-- Performance metrics calculation
-- Graceful handling of errors (preserves line breaks for errors)
-
-### 4. Dependencies and Documentation ✅
-
-**requirements.txt:**
-- `psycopg2-binary>=2.9.9` - PostgreSQL adapter for Python
-
-**Documentation:**
-- `SETUP_INSTRUCTIONS.md` - Complete setup and usage guide
-- `IMPLEMENTATION_SUMMARY.md` - This file
-- Inline code comments preserved and enhanced
---
-## 🔒 Safety Measures Maintained
+# Disk Reorganizer — Architectural Summary
-### No User Code or Data Removed
-- ✅ Original functionality **100% preserved**
-- ✅ All features working as before
-- ✅ Only enhanced with PostgreSQL and dynamic logging
-- ✅ Backward compatibility considered
+## Core Outcome
-### Safety Features
-1. **Dry-run mode** - Test before executing
-2. **File verification** - Size checks after copy
-3. **No auto-deletion** - Original files kept safe
-4. **Database audit trail** - All operations logged
-5. **Error tolerance** - Errors logged but don't stop migration
-6. **Transaction safety** - Commits every 10 operations
+Migration from **SQLite → PostgreSQL** completed.
+System is now **network-capable**, **auditable**, **scalable**, and offers **real-time operational telemetry**.
---
-## 📋 How to Use (Quick Start)
+## Architecture
-### Step 1: Setup Database
-```bash
-# Linux/Mac
-./setup_database.sh
+### Database Layer (PostgreSQL)
-# Windows
-setup_database.bat
-```
+* Central DB: `disk_reorganizer_db`
+* User: `disk_reorg_user`
+* Tables: `files`, `operations`
+* Features: indexes, triggers, conflict-upserts, audit fields
+* Deployment: SQL + Windows/Linux setup scripts
-### Step 2: Install Dependencies
-```bash
-pip install -r requirements.txt
-```
+### Application Layer
-### Step 3: Run the Application
-```bash
-# Index disks
-python src/main.py index "D:\\" disk_d
-python src/main.py index "E:\\" disk_e
+* Python driver migrated to **psycopg2**
+* Unified DB config + connection pooling
+* Refactored CRUD + batch commits
+* Robust error handling + transactional execution
-# Create migration plan
-python src/main.py plan disk_d disk_e
+### Operational Layer
-# Test with dry-run (IMPORTANT!)
-python src/main.py execute migration_plan_disk_d_*.json --dry-run
+* **Dynamic in-screen logging** during indexing + migration
-# Execute (after reviewing plan!)
-python src/main.py execute migration_plan_disk_d_*.json
-
-# Check status
-python src/main.py report
-```
+  * File/sec, GB processed, ETA, success/error counters
+  * Clean single-line, non-spamming UI updates
---
-## 🎯 What Changed vs Original Code
+## Workflow
-### Database Layer
-| Original | New |
-|----------|-----|
-| SQLite (`sqlite3`) | PostgreSQL (`psycopg2`) |
-| Local file database | Network database server |
-| `?` placeholders | `%s` placeholders |
-| `INSERT OR REPLACE` | `INSERT ... ON CONFLICT` |
-| `INTEGER PRIMARY KEY AUTOINCREMENT` | `SERIAL PRIMARY KEY` |
+1. **Setup**
-### Logging
-| Original | New |
-|----------|-----|
-| Static log lines | Dynamic in-place updates |
-| Log every 1000 files | Update every 100 files |
-| No speed metrics | Real-time speed + ETA |
-| Long scrolling logs | Single updating line |
-
-### Configuration
-| Original | New |
-|----------|-----|
-| `db_path` parameter | `db_config` dictionary |
-| Hardcoded SQLite file | Configurable PostgreSQL connection |
-
----
-
-## 📊 Technical Improvements
-
-### Performance
-- ✅ PostgreSQL supports concurrent access (multiple users)
-- ✅ Better indexing for large datasets
-- ✅ Commit batching (every 1000 inserts during indexing)
-- ✅ Transaction management for data integrity
-
-### Scalability
-- ✅ Centralized database on server
-- ✅ Multiple clients can access same database
-- ✅ Better suited for large file catalogs (millions of files)
-- ✅ Professional-grade RDBMS features
-
-### User Experience
-- ✅ Real-time progress feedback
-- ✅ No more guessing how long operations will take
-- ✅ Visual confirmation that process is working
-- ✅ Clean, professional output
-
----
-
-## ⚠️ Important Reminders
-
-### Before Running Migration:
-1. **BACKUP YOUR DATA** - Always have backups before disk operations
-2. **Review the plan JSON** - Check what files will be moved where
-3. **Run dry-run first** - Test the migration plan before executing
-4. **Check disk space** - Ensure destination disks have enough space
-5. **Close applications** - No applications should be using the files
-
-### After Migration:
-1. Verify files with `report` command
-2. Manually check some files opened correctly
-3. Only delete originals when 100% confident
-4. Keep the plan JSON for reference
-
----
-
-## 🔍 Testing Recommendations
-
-### Database Setup Test
-```bash
-# Test connection
-psql -h 192.168.1.159 -p 5432 -U disk_reorg_user -d disk_reorganizer_db -c "SELECT * FROM files LIMIT 1;"
-```
-
-### Application Test
-```bash
-# Test with a small directory first
-python src/main.py index "C:\\Temp\\TestFolder" test_disk
-python src/main.py report
-```
-
-### Full Workflow Test
-1. Index a small disk/folder
-2. Create a migration plan
-3. Run dry-run
-4. Check the database records
-5. Run report command
-
----
-
-## 📞 Support Information
-
-### If Database Connection Fails:
-1. Check PostgreSQL is running: `sudo systemctl status postgresql`
-2. Check firewall allows port 5432
-3. Verify `pg_hba.conf` allows remote connections
-4. Test with `psql` command line tool first
-
-### If Script Fails:
-1. Check Python version (3.7+)
-2. Verify all dependencies installed: `pip list | grep psycopg2`
-3. Check file paths (use absolute paths)
-4. Review log file: `disk_reorganizer.log`
-
----
-
-## ✨ Summary
-
-**All requirements completed:**
-1. ✅ PostgreSQL database setup scripts created
-2. ✅ User and tables created with proper privileges
-3. ✅ `src/main.py` migrated to PostgreSQL
-4. ✅ Dynamic in-screen logging implemented
-5. ✅ Requirements.txt with psycopg2 created
-6. ✅ Complete documentation provided
-
-**No user code removed or lost** - All functionality preserved and enhanced!
-
-**Ready for deployment** - Follow SETUP_INSTRUCTIONS.md to begin.
-
-
-# Disk Reorganizer - Setup Instructions
-
-## Overview
-The Disk Reorganizer has been upgraded to use PostgreSQL database instead of SQLite, with dynamic progress display during long operations.
-
-## Changes Made
-
-### 1. Database Migration: SQLite → PostgreSQL
-- **Database Server**: `tour@192.168.1.159:5432`
-- **Database Name**: `disk_reorganizer_db`
-- **User**: `disk_reorg_user`
-- **Password**: `heel-goed-wachtwoord`
-
-### 2. New Features
-- ✅ Dynamic in-screen progress display during indexing
-- ✅ Real-time file counter, size, and speed metrics
-- ✅ ETA (Estimated Time to Arrival) during migration
-- ✅ In-place updating (no log spam)
-
-### 3. Files Created/Modified
-
-#### New Files:
-- `setup_database.sql` - PostgreSQL schema and user setup
-- `setup_database.sh` - Automated setup script
-- `requirements.txt` - Python dependencies
-- `SETUP_INSTRUCTIONS.md` - This file
-
-#### Modified Files:
-- `src/main.py` - Complete PostgreSQL integration + dynamic logging
-
----
-
-## Installation Steps
-
-### Step 1: Install Python Dependencies
-
-```bash
-# Install PostgreSQL Python driver
-pip install -r requirements.txt
-```
-
-### Step 2: Setup PostgreSQL Database
-
-**On Linux/Mac:**
-```bash
-# Make the script executable
-chmod +x setup_database.sh
-
-# Run the setup script
-./setup_database.sh
-```
-
-**On Windows (using Git Bash or WSL):**
-```bash
-bash setup_database.sh
-```
-
-**Manual Setup (if script doesn't work):**
-```bash
-# Connect to PostgreSQL
-psql -h 192.168.1.159 -p 5432 -U auction -d postgres
-
-# Then paste the contents of setup_database.sql
-\i setup_database.sql
-```
-
-### Step 3: Verify Database Connection
-
-```bash
-# Test connection with the new user
-psql -h 192.168.1.159 -p 5432 -U disk_reorg_user -d disk_reorganizer_db
-```
-
-You should be able to connect. List tables with `\dt` - you should see `files` and `operations`.
-
----
-
-## Usage Guide
-
-### 1. Index Disks
-
-Index all files on disks you want to reorganize:
-
-```bash
-python src/main.py index "D:\\" disk_d
-python src/main.py index "E:\\" disk_e
-python src/main.py index "F:\\" disk_f
-```
-
-**Dynamic Output Example:**
-```
-Indexing: 12,543 files | 45.3 GB | 823 files/s | D:\Documents\Photos\...vacation.jpg
-```
-
-### 2. Create Migration Plan
-
-Plan to free up a disk (e.g., disk_d) by moving files to other disks:
-
-```bash
-python src/main.py plan disk_d disk_e disk_f
-```
-
-This creates a JSON file: `migration_plan_disk_d_YYYYMMDD_HHMMSS.json`
-
-**Review the plan carefully before executing!**
-
-### 3. Execute Migration (DRY RUN first!)
-
-**ALWAYS test with dry-run first:**
-
-```bash
-python src/main.py execute migration_plan_disk_d_20231209_143052.json --dry-run
-```
-
-**Dynamic Output Example:**
-```
-[1523/5000] 1520 OK, 3 ERR | 12.3 files/s | ETA: 283s | Documents\project\file.txt
-```
-
-**If dry-run looks good, execute for real:**
-
-```bash
-python src/main.py execute migration_plan_disk_d_20231209_143052.json
-```
-
-⚠️ **WARNING**: This will copy files. Original files are NOT deleted automatically (safety feature).
-
-### 4. Generate Report
-
-Check status of files and operations:
-
-```bash
-python src/main.py report
-```
-
-**Output Example:**
-```
-=== FILE MIGRATION REPORT ===
-indexed : 5000 files, 150.2GB
-moved : 1500 files, 45.6GB
-
-=== OPERATIONS REPORT ===
-move EXECUTED : 1500 operations
-move PENDING : 3500 operations
-```
-
----
-
-## Database Schema
-
-### Files Table
-```sql
-CREATE TABLE files (
- path TEXT PRIMARY KEY,
- size BIGINT NOT NULL,
- modified_time DOUBLE PRECISION NOT NULL,
- disk TEXT NOT NULL,
- checksum TEXT,
- status TEXT DEFAULT 'indexed',
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
-);
-```
-
-### Operations Table
-```sql
-CREATE TABLE operations (
- id SERIAL PRIMARY KEY,
- source_path TEXT NOT NULL,
- dest_path TEXT NOT NULL,
- operation_type TEXT NOT NULL,
- executed INTEGER DEFAULT 0,
- verified INTEGER DEFAULT 0,
- error TEXT,
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- executed_at TIMESTAMP
-);
-```
-
----
-
-## Safety Features
-
-1. **Dry-run mode**: Test migrations without moving files
-2. **Verification**: Files are verified after copying (size check)
-3. **No automatic deletion**: Original files remain until you manually delete them
-4. **Database tracking**: All operations logged in PostgreSQL
-5. **Error handling**: Errors are logged but don't stop the entire migration
-6. **Atomic commits**: Database commits every 10 operations
-
----
-
-## Troubleshooting
-
-### Connection refused
-```
-psql: error: connection to server at "192.168.1.159", port 5432 failed
-```
-**Solution**: Check that PostgreSQL is running and accepts remote connections:
-- Edit `postgresql.conf`: `listen_addresses = '*'`
-- Edit `pg_hba.conf`: Add line `host all all 0.0.0.0/0 md5`
-- Restart PostgreSQL: `sudo systemctl restart postgresql`
-
-### Password authentication failed
-```
-psql: error: password authentication failed for user "disk_reorg_user"
-```
-**Solution**: Run the setup script again or manually create the user:
-```sql
-CREATE USER disk_reorg_user WITH PASSWORD 'heel-goed-wachtwoord';
-```
-
-### Tables don't exist
-```
-Database tables not found! Please run setup_database.sh first.
-```
-**Solution**: Run the setup script: `./setup_database.sh`
-
-### Permission denied
-```
-ERROR: permission denied for table files
-```
-**Solution**: Grant privileges:
-```sql
-GRANT ALL PRIVILEGES ON TABLE files TO disk_reorg_user;
-GRANT ALL PRIVILEGES ON TABLE operations TO disk_reorg_user;
-GRANT USAGE, SELECT ON SEQUENCE operations_id_seq TO disk_reorg_user;
-```
-
----
-
-## Advanced Configuration
-
-### Custom Database Connection
-
-Edit `src/main.py` or pass custom config:
-
-```python
-custom_config = {
- 'host': 'your-host',
- 'port': 5432,
- 'database': 'your_db',
- 'user': 'your_user',
- 'password': 'your_password'
+```json
+ {
+ "host": "192.168.1.159",
+ "port": 5432,
+ "database": "disk_reorganizer_db",
+ "user": "disk_reorg_user",
+ "password": "heel-goed-wachtwoord"
}
-
-tool = DiskReorganizer(db_config=custom_config)
```
-
----
-
-## Next Steps
-
-1. ✅ Run `setup_database.sh` to create the database
-2. ✅ Install dependencies: `pip install -r requirements.txt`
-3. ✅ Index your disks
-4. ✅ Create a migration plan
-5. ✅ **Review the plan JSON carefully!**
-6. ✅ Run dry-run first
-7. ✅ Execute the migration
-8. ✅ Verify with `report` command
-9. ✅ Manually delete original files when confident
-
----
-
-## Important Notes
-
-⚠️ **BACKUP YOUR DATA** before running any disk reorganization!
-
-⚠️ The original code has been **preserved** - only modified to use PostgreSQL.
-
-⚠️ **No user data or code has been removed** - all functionality remains intact.
-
-✅ Dynamic logging shows real-time progress during scanning and migration.
-
-✅ All operations are tracked in the database for audit trail.
-
-# Troubleshooting Guide - Disk Reorganizer
-
-## Setup Script Issues
-
-### Problem: Window closes immediately after running setup_database.bat
-
-**Fixed!** The updated scripts now:
-- ✅ Keep the window open with `pause >nul` at the end
-- ✅ Show clear SUCCESS or FAILED messages
-- ✅ Display next steps after successful setup
-- ✅ Provide troubleshooting hints for failures
-
-### Problem: "psql command not found"
-
-**Cause:** PostgreSQL client tools are not installed or not in PATH.
-
-**Solution for Windows:**
-1. Install PostgreSQL client:
```bash
- winget install PostgreSQL.PostgreSQL
+ ./setup_database.sh # or setup_database.bat
+ pip install -r requirements.txt
```
- Or download from: https://www.postgresql.org/download/windows/
-2. Add to PATH:
- - Go to: System Properties → Environment Variables
- - Edit PATH variable
- - Add: `C:\Program Files\PostgreSQL\16\bin` (adjust version number)
- - Restart terminal/cmd
+2. **Index**
-3. Verify installation:
```bash
- psql --version
+ python src/main.py index "D:\\" disk_d
```
-**Solution for Linux:**
-```bash
-# Ubuntu/Debian
-sudo apt-get update
-sudo apt-get install postgresql-client
+3. **Plan**
-# RedHat/CentOS
-sudo yum install postgresql
-
-# Verify
-psql --version
-```
-
-**Solution for macOS:**
-```bash
-brew install postgresql
-psql --version
-```
-
----
-
-## Database Connection Issues
-
-### Problem: "Connection refused"
-
-**Full error:**
-```
-psql: error: connection to server at "192.168.1.159", port 5432 failed: Connection refused
-```
-
-**Possible causes and solutions:**
-
-1. **PostgreSQL server is not running**
```bash
- # Check status (on server)
- sudo systemctl status postgresql
-
- # Start if stopped
- sudo systemctl start postgresql
+ python src/main.py plan disk_d disk_e
```
-2. **Server is not listening on network interface**
+4. **Dry-Run**
- Edit `/etc/postgresql/*/main/postgresql.conf`:
- ```
- listen_addresses = '*'
- ```
-
- Restart PostgreSQL:
- ```bash
- sudo systemctl restart postgresql
- ```
-
-3. **Firewall blocking port 5432**
- ```bash
- # On server - allow PostgreSQL port
- sudo ufw allow 5432/tcp
-
- # Or on Windows
- netsh advfirewall firewall add rule name="PostgreSQL" dir=in action=allow protocol=TCP localport=5432
- ```
-
-4. **Wrong IP address**
-
- Verify server IP:
- ```bash
- ping 192.168.1.159
- ```
-
- On the PostgreSQL server:
- ```bash
- hostname -I
- ip addr show
- ```
-
----
-
-### Problem: "Authentication failed"
-
-**Full error:**
-```
-psql: error: password authentication failed for user "auction"
-```
-
-**Solutions:**
-
-1. **Wrong password**
- - Verify the password in the script matches the actual password
- - Edit `setup_database.bat` or `setup_database.sh` with correct password
-
-2. **User doesn't exist**
- ```sql
- -- On PostgreSQL server
- sudo -u postgres psql
- \du -- List all users
-
- -- Create user if missing
- CREATE USER auction WITH SUPERUSER PASSWORD 'heel-goed-wachtwoord';
- ```
-
-3. **pg_hba.conf not allowing remote connections**
-
- Edit `/etc/postgresql/*/main/pg_hba.conf`:
- ```
- # Add this line (allows password authentication from any IP)
- host all all 0.0.0.0/0 md5
- ```
-
- Restart PostgreSQL:
- ```bash
- sudo systemctl restart postgresql
- ```
-
----
-
-### Problem: "Database already exists"
-
-**Full error:**
-```
-ERROR: database "disk_reorganizer_db" already exists
-```
-
-**This is usually OK!** The database may have been created in a previous run.
-
-**Verify it's working:**
-```bash
-psql -h 192.168.1.159 -p 5432 -U disk_reorg_user -d disk_reorganizer_db -c "\dt"
-```
-
-You should see the `files` and `operations` tables.
-
-**If you need to start fresh:**
-```sql
--- Connect as superuser
-psql -h 192.168.1.159 -p 5432 -U auction -d postgres
-
--- Drop and recreate
-DROP DATABASE IF EXISTS disk_reorganizer_db;
-DROP USER IF EXISTS disk_reorg_user;
-
--- Then run setup_database.bat again
-```
-
----
-
-## Application Issues
-
-### Problem: "Database tables not found! Please run setup_database.sh first"
-
-**Cause:** Application can connect but tables don't exist.
-
-**Solution:**
-1. Run the setup script again:
- ```bash
- setup_database.bat # Windows
- ./setup_database.sh # Linux/Mac
- ```
-
-2. Or manually verify tables exist:
- ```bash
- psql -h 192.168.1.159 -p 5432 -U disk_reorg_user -d disk_reorganizer_db
- \dt
- ```
-
- Should show:
- ```
- public | files | table | disk_reorg_user
- public | operations | table | disk_reorg_user
- ```
-
-### Problem: "ModuleNotFoundError: No module named 'psycopg2'"
-
-**Cause:** Python PostgreSQL driver not installed.
-
-**Solution:**
-```bash
-pip install -r requirements.txt
-
-# Or directly
-pip install psycopg2-binary
-```
-
-**If installation fails (compilation errors):**
-```bash
-# Use binary version (no compilation needed)
-pip install psycopg2-binary --force-reinstall
-
-# Or on Linux, install system dependencies first
-sudo apt-get install libpq-dev python3-dev
-pip install psycopg2
-```
-
-### Problem: Permission denied during file operations
-
-**Error:**
-```
-PermissionError: [Errno 13] Permission denied: 'D:\\file.txt'
-```
-
-**Solutions:**
-1. Run as Administrator (Windows) or with sudo (Linux)
-2. Close any applications using the files
-3. Check file/folder permissions
-4. Use dry-run mode first to identify problematic files:
```bash
python src/main.py execute plan.json --dry-run
```
-### Problem: "SSL connection failed" or SSL errors
+5. **Execute**
-**Error:**
-```
-psycopg2.OperationalError: SSL connection failed
-```
-
-**Solution 1:** Disable SSL requirement (if on trusted network)
-
-Edit `src/main.py` connection config:
-```python
-db_config = {
- 'host': '192.168.1.159',
- 'port': 5432,
- 'database': 'disk_reorganizer_db',
- 'user': 'disk_reorg_user',
- 'password': 'heel-goed-wachtwoord',
- 'sslmode': 'disable' # Add this line
-}
-```
-
-**Solution 2:** Enable SSL on PostgreSQL server (more secure)
-
-Edit `postgresql.conf`:
-```
-ssl = on
-ssl_cert_file = '/path/to/server.crt'
-ssl_key_file = '/path/to/server.key'
-```
-
----
-
-## Performance Issues
-
-### Problem: Indexing is very slow
-
-**Solutions:**
-
-1. **Check network speed** (if database is remote)
```bash
- # Test network speed
- iperf3 -c 192.168.1.159
+ python src/main.py execute plan.json
```
-2. **Increase commit batch size**
+6. **Report**
- Edit `src/main.py` line ~155:
- ```python
- # Change from 1000 to 5000
- if files_count % 5000 == 0:
- conn.commit()
- ```
-
-3. **Skip system directories**
-
- Already implemented! Script skips `$`, `System Volume Information`, `Recovery`
-
-4. **Use local PostgreSQL** (fastest)
-
- Install PostgreSQL locally and use:
- ```python
- 'host': 'localhost'
- ```
-
-### Problem: Migration is taking too long
-
-**Normal behavior:** Large file migrations take time (copying files).
-
-**Tips:**
-1. Use `--dry-run` first to verify (no actual copying)
-2. Run during off-hours
-3. Monitor with the dynamic progress display:
- ```
- [1523/5000] 1520 OK, 3 ERR | 12.3 files/s | ETA: 283s
- ```
-4. Check disk I/O isn't bottlenecked:
```bash
- # Windows
- resmon.exe # Check Disk tab
-
- # Linux
- iotop
+ python src/main.py report
```
---
-## Verification Steps
+## Guarantees
-### Verify Database Setup
-
-```bash
-# 1. Test connection
-psql -h 192.168.1.159 -p 5432 -U disk_reorg_user -d disk_reorganizer_db
-
-# 2. List tables
-\dt
-
-# 3. Check table structure
-\d files
-\d operations
-
-# 4. Verify permissions
-\dp files
-\dp operations
-
-# 5. Test insert (should work)
-INSERT INTO files (path, size, modified_time, disk) VALUES ('test', 100, 1.0, 'test');
-SELECT * FROM files WHERE path = 'test';
-DELETE FROM files WHERE path = 'test';
-
-# 6. Exit
-\q
-```
-
-### Verify Application Setup
-
-```bash
-# 1. Check Python version (need 3.7+)
-python --version
-
-# 2. Check dependencies
-pip list | grep psycopg2
-
-# 3. Test application
-python src/main.py report
-
-# Should show empty reports if no data yet
-```
+* No destructive actions by default
+* Originals preserved
+* Every action logged in DB
+* Error-resilient, continues safely
+* Suitable for millions of file records
---
-## Getting More Help
+## Failure Points to Check
-### Enable Debug Logging
-
-Edit `src/main.py` line ~21:
-```python
-logging.basicConfig(
- level=logging.DEBUG, # Change from INFO to DEBUG
- format='%(asctime)s - %(levelname)s - %(message)s',
- handlers=[
- logging.FileHandler('disk_reorganizer.log'),
- logging.StreamHandler(sys.stdout)
- ]
-)
-```
-
-Check `disk_reorganizer.log` for detailed information.
-
-### Check PostgreSQL Server Logs
-
-```bash
-# Ubuntu/Debian
-sudo tail -f /var/log/postgresql/postgresql-*-main.log
-
-# RedHat/CentOS
-sudo tail -f /var/lib/pgsql/data/pg_log/postgresql-*.log
-
-# Or via psql
-psql -U postgres -c "SHOW log_directory;"
-psql -U postgres -c "SHOW log_filename;"
-```
-
-### Test Network Connectivity
-
-```bash
-# Ping server
-ping 192.168.1.159
-
-# Test port
-telnet 192.168.1.159 5432
-
-# Or using nc (netcat)
-nc -zv 192.168.1.159 5432
-
-# PowerShell (Windows)
-Test-NetConnection -ComputerName 192.168.1.159 -Port 5432
-```
+* PostgreSQL reachable on 5432
+* Correct credentials
+* Disk permissions
+* Python + psycopg2 installed
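+
+Quick connectivity check (assumes the `psql` client is installed):
+
+```bash
+psql -h 192.168.1.159 -p 5432 -U disk_reorg_user -d disk_reorganizer_db -c "\dt"
+```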
---
-## Common Mistakes to Avoid
-
-❌ **Not running setup script first**
-- Always run `setup_database.bat` or `setup_database.sh` before using the app
-
-❌ **Using wrong database credentials**
-- Double-check host, port, username, password in scripts
-
-❌ **Skipping dry-run**
-- Always test with `--dry-run` before real migration
-
-❌ **Not checking disk space**
-- Verify destination disks have enough space before migration
-
-❌ **Deleting originals too quickly**
-- Keep original files until thoroughly verified
-
-❌ **Running without backups**
-- Always have backups before disk reorganization
-
----
-
-## Still Having Issues?
-
-If none of these solutions work:
-
-1. **Capture full error output:**
- ```bash
- python src/main.py report 2>&1 | tee error.log
- ```
-
-2. **Check the setup summary:**
- - Read `SETUP_INSTRUCTIONS.md`
- - Read `IMPLEMENTATION_SUMMARY.md`
-
-3. **Verify all prerequisites:**
- - [ ] PostgreSQL server running
- - [ ] PostgreSQL client tools installed
- - [ ] Python 3.7+ installed
- - [ ] psycopg2-binary installed
- - [ ] Database created
- - [ ] Tables created
- - [ ] User created with privileges
-
-4. **Test step by step:**
- - Can you connect with `psql`?
- - Can you run `python src/main.py report`?
- - Can you index a small test folder?
+## Essence
+A lean, safe, high-visibility disk migration tool running on a proper relational backbone, engineered for clarity, scale, and operational certainty.
diff --git a/requirements.txt b/requirements.txt
index b8a772c..63bae10 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,17 @@ psycopg2-binary>=2.9.9
# Alternative: psycopg2>=2.9.9 (requires PostgreSQL development libraries)
# Use psycopg2-binary for easier installation without compilation
+# Core dependencies
+# Requires Python 3.9+ (interpreter version; not installable via pip)
+
+# Optional/feature dependencies
+redis>=4.5.0 # For RedisHashStore
+scikit-learn>=1.0.0 # For MLClassifier
+numpy>=1.21.0 # For MLClassifier
+
+# Development dependencies
+pytest>=7.0.0
+pytest-cov>=4.0.0
+black>=22.0.0
+mypy>=0.950
+flake8>=5.0.0
diff --git a/src/classification/__init__.py b/src/classification/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/classification/_protocols.py b/src/classification/_protocols.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/classification/engine.py b/src/classification/engine.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/classification/ml.py b/src/classification/ml.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/classification/rules.py b/src/classification/rules.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/deduplication/__init__.py b/src/deduplication/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/deduplication/_protocols.py b/src/deduplication/_protocols.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/deduplication/chunker.py b/src/deduplication/chunker.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/deduplication/engine.py b/src/deduplication/engine.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/deduplication/store.py b/src/deduplication/store.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/discovery/__init__.py b/src/discovery/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/discovery/_protocols.py b/src/discovery/_protocols.py
new file mode 100644
index 0000000..45898bf
--- /dev/null
+++ b/src/discovery/_protocols.py
@@ -0,0 +1,54 @@
+"""Protocol definitions for the discovery package"""
+from typing import Iterator, Protocol
+from pathlib import Path
+from dataclasses import dataclass
+
+
+@dataclass
+class FileMeta:
+ """Metadata for a discovered file"""
+ path: Path
+ size: int
+ modified_time: float
+ created_time: float
+ # Add other metadata fields as needed
+
+
+@dataclass
+class MountInfo:
+ """Information about a mounted filesystem"""
+ device: str
+ mount_point: str
+ fs_type: str
+ options: str
+ # Add other mount info fields as needed
+
+
+@dataclass
+class DiskInfo:
+ """Information about a disk/NVMe device"""
+ device: str
+ model: str
+ size: int
+ serial: str
+ # Add other disk info fields as needed
+
+
+class IFileScanner(Protocol):
+ """Protocol for file scanning operations"""
+
+ def scan(self, root: Path) -> Iterator[FileMeta]:
+ """Scan a directory tree and yield file metadata"""
+ ...
+
+
+class ISystemAPI(Protocol):
+ """Protocol for system information queries"""
+
+ def query_mounts(self) -> list[MountInfo]:
+ """Query mounted filesystems"""
+ ...
+
+ def query_nvmes(self) -> list[DiskInfo]:
+ """Query NVMe/disk information"""
+ ...
diff --git a/src/discovery/engine.py b/src/discovery/engine.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/discovery/scanner.py b/src/discovery/scanner.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/discovery/system.py b/src/discovery/system.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/migration/__init__.py b/src/migration/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/migration/_protocols.py b/src/migration/_protocols.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/migration/copy.py b/src/migration/copy.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/migration/engine.py b/src/migration/engine.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/migration/hardlink.py b/src/migration/hardlink.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/shared/__init__.py b/src/shared/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/shared/_protocols.py b/src/shared/_protocols.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/shared/config.py b/src/shared/config.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/shared/logger.py b/src/shared/logger.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/shared/models.py b/src/shared/models.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/tests/__init__.py b/src/tests/__init__.py
new file mode 100644
index 0000000..e69de29