diff --git a/.aiignore b/.aiignore
new file mode 100644
index 0000000..772cc80
--- /dev/null
+++ b/.aiignore
@@ -0,0 +1,18 @@
+.DS_Store
+*.log
+*.tmp
+dist/
+build/
+out/
+.idea
+node_modules/
+.vscode/
+.git
+.github
+scripts
+.pytest_cache/
+__pycache__
+.aiignore
+*.iml
+.env
+.bundle.md
\ No newline at end of file
diff --git a/.idea/dataSources.xml b/.idea/dataSources.xml
new file mode 100644
index 0000000..9118540
--- /dev/null
+++ b/.idea/dataSources.xml
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
+    <data-source source="LOCAL" name="disk_reorganizer_db">
+      <driver-ref>postgresql</driver-ref>
+      <synchronize>true</synchronize>
+      <jdbc-driver>org.postgresql.Driver</jdbc-driver>
+      <jdbc-url>jdbc:postgresql://192.168.1.159:5432/disk_reorganizer_db</jdbc-url>
+      <working-dir>$ProjectFileDir$</working-dir>
+    </data-source>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/data_source_mapping.xml b/.idea/data_source_mapping.xml
new file mode 100644
index 0000000..90e17e1
--- /dev/null
+++ b/.idea/data_source_mapping.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/defrag.iml b/.idea/defrag.iml
index 3b7c44a..651d017 100644
--- a/.idea/defrag.iml
+++ b/.idea/defrag.iml
@@ -3,6 +3,9 @@
+
+
+
diff --git a/.idea/sqldialects.xml b/.idea/sqldialects.xml
new file mode 100644
index 0000000..d0fd9f9
--- /dev/null
+++ b/.idea/sqldialects.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..55016b6
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,38 @@
+# Dockerfile for Project Defrag with PostgreSQL integration
+FROM python:3.11-slim
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+ gcc \
+ g++ \
+ libpq-dev \
+ postgresql-client \
+ && rm -rf /var/lib/apt/lists/*
+
+# Set working directory
+WORKDIR /app
+
+# Set environment variables
+ENV PYTHONDONTWRITEBYTECODE=1 \
+ PYTHONUNBUFFERED=1 \
+ PYTHONPATH=/app
+
+# Install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip && \
+ pip install --no-cache-dir -r requirements.txt
+
+# Copy application code
+COPY . .
+
+# Create non-root user
+RUN useradd -m -u 1000 appuser && \
+ chown -R appuser:appuser /app
+USER appuser
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
+  CMD python -c "import psycopg2; psycopg2.connect(dbname='${DB_NAME:-disk_reorganizer_db}', user='${DB_USER:-disk_reorg_user}', password='${DB_PASSWORD}', host='${DB_HOST:-postgres}', port='${DB_PORT:-5432}')" || exit 1
+
+# Default command (can be overridden in docker-compose)
+CMD ["python", "main.py", "--help"]
\ No newline at end of file
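
The HEALTHCHECK above packs the whole probe into one `python -c` string. Unrolled, it amounts to roughly the sketch below, a minimal sketch assuming the DB_* variable names that docker-compose.yml injects; the standalone script and its `probe` helper are illustrative, not files in this diff.

```python
# Connectivity probe equivalent to the Dockerfile HEALTHCHECK one-liner (illustrative).
import os
import sys

import psycopg2


def probe() -> int:
    """Return 0 if PostgreSQL accepts a connection, 1 otherwise."""
    try:
        conn = psycopg2.connect(
            dbname=os.environ.get("DB_NAME", "disk_reorganizer_db"),
            user=os.environ.get("DB_USER", "disk_reorg_user"),
            password=os.environ.get("DB_PASSWORD", ""),
            host=os.environ.get("DB_HOST", "postgres"),
            port=os.environ.get("DB_PORT", "5432"),
            connect_timeout=5,
        )
        conn.close()
        return 0
    except psycopg2.OperationalError:
        return 1


if __name__ == "__main__":
    sys.exit(probe())
```
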
diff --git a/docker-compose.override.yml b/docker-compose.override.yml
new file mode 100644
index 0000000..c0a3559
--- /dev/null
+++ b/docker-compose.override.yml
@@ -0,0 +1,20 @@
+services:
+ app:
+ environment:
+ - LOG_LEVEL=DEBUG
+ - PYTHONPATH=/app
+ volumes:
+ - .:/app
+ - /var/run/docker.sock:/var/run/docker.sock
+ ports:
+ - "8000:8000"
+ command: uvicorn main:app --host 0.0.0.0 --port 8000 --reload
+
+ postgres:
+ environment:
+ - POSTGRES_LOG_STATEMENT=all
+ ports:
+ - "5433:5432" # Different port to avoid conflict with host PostgreSQL
+
+ redis:
+ command: redis-server --appendonly yes --loglevel verbose
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..4a40bbc
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,245 @@
+services:
+ # PostgreSQL Database
+ postgres:
+ image: postgres:15-alpine
+ container_name: project_defrag_db
+ environment:
+ POSTGRES_USER: disk_reorg_user
+ POSTGRES_PASSWORD: heel-goed-wachtwoord
+ POSTGRES_DB: disk_reorganizer_db
+ POSTGRES_INITDB_ARGS: "--encoding=UTF8 --locale=C"
+ volumes:
+ - postgres_data:/var/lib/postgresql/data
+ - ./sql/init.sql:/docker-entrypoint-initdb.d/init.sql
+ - ./sql/migrations:/docker-entrypoint-initdb.d/migrations
+ ports:
+ - "5432:5432"
+ healthcheck:
+ test: ["CMD-SHELL", "pg_isready -U disk_reorg_user -d disk_reorganizer_db"]
+ interval: 10s
+ timeout: 5s
+ retries: 5
+ networks:
+ - defrag-network
+
+ # Redis for deduplication hash store (optional)
+ redis:
+ image: redis:7-alpine
+ container_name: project_defrag_redis
+ command: redis-server --appendonly yes
+ volumes:
+ - redis_data:/data
+ ports:
+ - "6379:6379"
+ healthcheck:
+ test: ["CMD", "redis-cli", "ping"]
+ interval: 10s
+ timeout: 5s
+ retries: 5
+ networks:
+ - defrag-network
+
+ # Application Service
+ app:
+ build: .
+ container_name: project_defrag_app
+ depends_on:
+ postgres:
+ condition: service_healthy
+ redis:
+ condition: service_healthy
+ environment:
+ # Database Configuration
+ DB_HOST: postgres
+ DB_PORT: 5432
+ DB_NAME: disk_reorganizer_db
+ DB_USER: disk_reorg_user
+ DB_PASSWORD: heel-goed-wachtwoord
+
+ # Redis Configuration
+ REDIS_HOST: redis
+ REDIS_PORT: 6379
+
+ # Application Configuration
+ LOG_LEVEL: INFO
+ MAX_WORKERS: 4
+ CHUNK_SIZE_KB: 64
+
+ # Mount points (set these when running specific commands)
+ SOURCE_MOUNT: /mnt/source
+ TARGET_MOUNT: /mnt/target
+ volumes:
+ # Mount host directories for file operations
+ - ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source:ro
+ - ${HOST_TARGET_PATH:-/mnt/target}:/mnt/target
+
+ # Mount for configuration and plans
+ - ./config:/app/config
+ - ./plans:/app/plans
+ - ./logs:/app/logs
+
+ # Bind mount for development (optional)
+ - .:/app
+ networks:
+ - defrag-network
+ profiles:
+ - full-cycle
+ - development
+ # Uncomment for development with hot reload
+ # command: watchmedo auto-restart --pattern="*.py" --recursive -- python main.py
+
+ # Single command services for specific operations
+ index:
+ build: .
+ container_name: defrag_index
+ depends_on:
+ postgres:
+ condition: service_healthy
+ environment:
+ DB_HOST: postgres
+ DB_PORT: 5432
+ DB_NAME: disk_reorganizer_db
+ DB_USER: disk_reorg_user
+ DB_PASSWORD: heel-goed-wachtwoord
+ volumes:
+ - ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source:ro
+ - ./config:/app/config
+ - ./logs:/app/logs
+ command: ["python", "main.py", "index", "/mnt/source", "disk_d"]
+ profiles:
+ - index-only
+ networks:
+ - defrag-network
+
+ plan:
+ build: .
+ container_name: defrag_plan
+ depends_on:
+ postgres:
+ condition: service_healthy
+ environment:
+ DB_HOST: postgres
+ DB_PORT: 5432
+ DB_NAME: disk_reorganizer_db
+ DB_USER: disk_reorg_user
+ DB_PASSWORD: heel-goed-wachtwoord
+ volumes:
+ - ./config:/app/config
+ - ./plans:/app/plans
+ - ./logs:/app/logs
+ command: ["python", "main.py", "plan", "disk_d", "disk_e"]
+ profiles:
+ - plan-only
+ networks:
+ - defrag-network
+
+ execute:
+ build: .
+ container_name: defrag_execute
+ depends_on:
+ postgres:
+ condition: service_healthy
+ environment:
+ DB_HOST: postgres
+ DB_PORT: 5432
+ DB_NAME: disk_reorganizer_db
+ DB_USER: disk_reorg_user
+ DB_PASSWORD: heel-goed-wachtwoord
+ volumes:
+ - ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source
+ - ${HOST_TARGET_PATH:-/mnt/target}:/mnt/target
+ - ./plans:/app/plans
+ - ./config:/app/config
+ - ./logs:/app/logs
+ command: ["python", "main.py", "execute", "/app/plans/plan.json"]
+ profiles:
+ - execute-only
+ networks:
+ - defrag-network
+
+ dry-run:
+ build: .
+ container_name: defrag_dry_run
+ depends_on:
+ postgres:
+ condition: service_healthy
+ environment:
+ DB_HOST: postgres
+ DB_PORT: 5432
+ DB_NAME: disk_reorganizer_db
+ DB_USER: disk_reorg_user
+ DB_PASSWORD: heel-goed-wachtwoord
+ volumes:
+ - ./plans:/app/plans
+ - ./config:/app/config
+ - ./logs:/app/logs
+ command: ["python", "main.py", "execute", "/app/plans/plan.json", "--dry-run"]
+ profiles:
+ - dry-run-only
+ networks:
+ - defrag-network
+
+ report:
+ build: .
+ container_name: defrag_report
+ depends_on:
+ postgres:
+ condition: service_healthy
+ environment:
+ DB_HOST: postgres
+ DB_PORT: 5432
+ DB_NAME: disk_reorganizer_db
+ DB_USER: disk_reorg_user
+ DB_PASSWORD: heel-goed-wachtwoord
+ volumes:
+ - ./reports:/app/reports
+ - ./logs:/app/logs
+ command: ["python", "main.py", "report", "--format", "html"]
+ profiles:
+ - report-only
+ networks:
+ - defrag-network
+
+ # Monitoring and Admin Services
+ pgadmin:
+ image: dpage/pgadmin4:latest
+ container_name: defrag_pgadmin
+ environment:
+ PGADMIN_DEFAULT_EMAIL: admin@defrag.local
+ PGADMIN_DEFAULT_PASSWORD: admin123
+ volumes:
+ - pgadmin_data:/var/lib/pgadmin
+ ports:
+ - "5050:80"
+ depends_on:
+ - postgres
+ profiles:
+ - monitoring
+ networks:
+ - defrag-network
+
+ redis-commander:
+ image: rediscommander/redis-commander:latest
+ container_name: defrag_redis_commander
+ environment:
+ REDIS_HOSTS: local:redis:6379
+ ports:
+ - "8081:8081"
+ depends_on:
+ - redis
+ profiles:
+ - monitoring
+ networks:
+ - defrag-network
+
+networks:
+ defrag-network:
+ driver: bridge
+
+volumes:
+ postgres_data:
+ driver: local
+ redis_data:
+ driver: local
+ pgadmin_data:
+ driver: local
\ No newline at end of file
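
Every service in this compose file hands the application the same DB_*/REDIS_* variables, and the one-off services (index, plan, execute, dry-run, report) are gated behind profiles, so each can be run on its own, e.g. `docker compose --profile index-only up index`. Below is a minimal sketch of how the application side might gather that environment into one settings object; the `Settings` class is hypothetical, since the project's actual config loader is not part of this diff.

```python
# Hypothetical settings loader mirroring the environment docker-compose.yml injects.
import os
from dataclasses import dataclass


@dataclass(frozen=True)
class Settings:
    """Connection and tuning values supplied by docker-compose.yml."""
    db_host: str
    db_port: int
    db_name: str
    db_user: str
    db_password: str
    redis_host: str
    redis_port: int
    max_workers: int
    chunk_size_kb: int

    @classmethod
    def from_env(cls) -> "Settings":
        env = os.environ.get
        return cls(
            db_host=env("DB_HOST", "postgres"),
            db_port=int(env("DB_PORT", "5432")),
            db_name=env("DB_NAME", "disk_reorganizer_db"),
            db_user=env("DB_USER", "disk_reorg_user"),
            db_password=env("DB_PASSWORD", ""),
            redis_host=env("REDIS_HOST", "redis"),
            redis_port=int(env("REDIS_PORT", "6379")),
            max_workers=int(env("MAX_WORKERS", "4")),
            chunk_size_kb=int(env("CHUNK_SIZE_KB", "64")),
        )
```
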
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..27d2803
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,74 @@
+[build-system]
+requires = ["setuptools>=65.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "defrag"
+version = "1.0.0"
+description = "Intelligent disk reorganization system for 20TB+ data"
+readme = "README.md"
+requires-python = ">=3.9"
+license = {text = "MIT"}
+authors = [
+ {name = "Project Defrag"}
+]
+keywords = ["disk", "storage", "deduplication", "classification", "migration"]
+classifiers = [
+ "Development Status :: 4 - Beta",
+ "Intended Audience :: System Administrators",
+ "Topic :: System :: Filesystems",
+ "License :: OSI Approved :: MIT License",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+]
+
+dependencies = [
+ "psycopg2-binary>=2.9.0",
+ "psutil>=5.9.0",
+ "pandas>=1.5.0",
+ "pyarrow>=10.0.0",
+ "python-magic>=0.4.27",
+]
+
+[project.optional-dependencies]
+redis = ["redis>=4.5.0"]
+ml = ["scikit-learn>=1.2.0", "numpy>=1.24.0"]
+dev = [
+ "pytest>=7.2.0",
+ "pytest-cov>=4.0.0",
+ "black>=23.0.0",
+ "mypy>=1.0.0",
+ "flake8>=6.0.0",
+]
+all = [
+ "redis>=4.5.0",
+ "scikit-learn>=1.2.0",
+ "numpy>=1.24.0",
+]
+
+[project.scripts]
+defrag = "main:main"
+
+[tool.black]
+line-length = 100
+target-version = ['py39', 'py310', 'py311', 'py312']
+include = '\.pyi?$'
+
+[tool.mypy]
+python_version = "3.9"
+warn_return_any = true
+warn_unused_configs = true
+disallow_untyped_defs = false
+disallow_incomplete_defs = false
+check_untyped_defs = true
+no_implicit_optional = true
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
+addopts = "-v --cov=. --cov-report=html --cov-report=term"
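
The `[project.scripts]` entry `defrag = "main:main"` assumes a top-level main.py exposing a `main()` callable; main.py itself is not in this diff. Judging from the commands the compose services run (index, plan, execute, report), that entry point is presumably a subcommand-style CLI along these lines. This is a hypothetical sketch, not the project's actual implementation.

```python
# Hypothetical shape of main.py's entry point, inferred from the compose service commands.
import argparse


def main() -> None:
    parser = argparse.ArgumentParser(prog="defrag", description="Disk reorganization CLI")
    sub = parser.add_subparsers(dest="command", required=True)

    index = sub.add_parser("index", help="Index a mounted disk into PostgreSQL")
    index.add_argument("path")
    index.add_argument("disk_label")

    plan = sub.add_parser("plan", help="Plan a migration between two indexed disks")
    plan.add_argument("source_disk")
    plan.add_argument("target_disk")

    execute = sub.add_parser("execute", help="Execute a previously generated plan")
    execute.add_argument("plan_file")
    execute.add_argument("--dry-run", action="store_true")

    report = sub.add_parser("report", help="Generate a report")
    report.add_argument("--format", default="html")

    args = parser.parse_args()
    print(f"dispatching {args.command!r}")  # real dispatch lives in the project's own code


if __name__ == "__main__":
    main()
```
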
diff --git a/requirements.txt b/requirements.txt
index e0ab347..f838a8a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,3 +15,25 @@ pytest-cov>=4.0.0
black>=22.0.0
mypy>=0.950
flake8>=5.0.0
+# Core dependencies
+psycopg2-binary>=2.9.0
+psutil>=5.9.0
+
+# Data processing
+pandas>=1.5.0
+pyarrow>=10.0.0
+
+# File type detection
+python-magic>=0.4.27
+
+# Optional dependencies
+redis>=4.5.0 # For RedisHashStore (optional)
+scikit-learn>=1.2.0 # For MLClassifier (optional)
+numpy>=1.24.0 # For MLClassifier (optional)
+
+# Development dependencies
+pytest>=7.2.0
+pytest-cov>=4.0.0
+black>=23.0.0
+mypy>=1.0.0
+flake8>=6.0.0
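
The `redis` pin is commented "For RedisHashStore (optional)"; that class is not part of this diff. Purely as an illustration, a Redis-backed hash store mirroring the `deduplication_store` table defined in sql/init.sql (hash, canonical path, reference count) could look roughly like this; the class name and key layout are assumptions.

```python
# Illustrative Redis-backed dedup store; mirrors deduplication_store (hash -> canonical path).
from typing import Optional

import redis


class RedisHashStore:
    """Keeps one hash -> canonical-path mapping plus a reference counter per hash."""

    def __init__(self, host: str = "redis", port: int = 6379) -> None:
        self._r = redis.Redis(host=host, port=port, decode_responses=True)

    def register(self, file_hash: str, path: str) -> str:
        """Record a file hash and return its canonical path (the first path seen wins)."""
        # SET with nx=True only stores the path if the hash is new.
        if self._r.set(f"dedup:{file_hash}", path, nx=True):
            self._r.set(f"dedup:refs:{file_hash}", 1)
            return path
        self._r.incr(f"dedup:refs:{file_hash}")
        return self._r.get(f"dedup:{file_hash}")

    def canonical_path(self, file_hash: str) -> Optional[str]:
        return self._r.get(f"dedup:{file_hash}")
```
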
diff --git a/sql/init.sql b/sql/init.sql
new file mode 100644
index 0000000..c175132
--- /dev/null
+++ b/sql/init.sql
@@ -0,0 +1,164 @@
+-- sql/init.sql
+-- Initialize PostgreSQL database for Project Defrag
+
+-- Enable useful extensions
+CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
+CREATE EXTENSION IF NOT EXISTS "pgcrypto";
+
+-- Files table
+CREATE TABLE IF NOT EXISTS files (
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+ path TEXT NOT NULL,
+ size BIGINT NOT NULL,
+ modified_time TIMESTAMP WITH TIME ZONE,
+ created_time TIMESTAMP WITH TIME ZONE,
+ file_hash VARCHAR(64), -- SHA-256 hash
+ category VARCHAR(50),
+ disk_label VARCHAR(50),
+ last_verified TIMESTAMP WITH TIME ZONE,
+
+ -- Metadata
+ metadata JSONB DEFAULT '{}',
+
+ -- Audit fields
+ created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+ updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+
+ -- Constraints
+ CONSTRAINT unique_file_path UNIQUE(path)
+);
+
+-- Operations table (audit log)
+CREATE TABLE IF NOT EXISTS operations (
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+ operation_type VARCHAR(50) NOT NULL,
+ source_path TEXT,
+ target_path TEXT,
+ status VARCHAR(20) NOT NULL,
+
+ -- File reference
+ file_id UUID REFERENCES files(id) ON DELETE SET NULL,
+
+ -- Performance metrics
+ duration_ms INTEGER,
+ bytes_processed BIGINT,
+
+ -- Error information
+ error_message TEXT,
+ error_details JSONB,
+
+ -- Context
+ session_id VARCHAR(100),
+ user_agent TEXT,
+
+ -- Audit fields
+ started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+ completed_at TIMESTAMP WITH TIME ZONE,
+ created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+);
+
+-- Deduplication hash store
+CREATE TABLE IF NOT EXISTS deduplication_store (
+ hash VARCHAR(64) PRIMARY KEY,
+ canonical_path TEXT NOT NULL,
+ reference_count INTEGER DEFAULT 1,
+ first_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+ last_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+);
+
+-- Migration plan table
+CREATE TABLE IF NOT EXISTS migration_plans (
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+ name VARCHAR(100) NOT NULL,
+ source_disk VARCHAR(50) NOT NULL,
+ target_disk VARCHAR(50) NOT NULL,
+ plan_json JSONB NOT NULL,
+
+ -- Statistics
+ total_files INTEGER DEFAULT 0,
+ total_size BIGINT DEFAULT 0,
+ estimated_duration INTEGER, -- in seconds
+
+ -- Status
+ status VARCHAR(20) DEFAULT 'draft',
+
+ -- Audit
+ created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+ executed_at TIMESTAMP WITH TIME ZONE,
+ completed_at TIMESTAMP WITH TIME ZONE
+);
+
+-- Indexes for performance
+CREATE INDEX IF NOT EXISTS idx_files_path ON files(path);
+CREATE INDEX IF NOT EXISTS idx_files_hash ON files(file_hash);
+CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label);
+CREATE INDEX IF NOT EXISTS idx_files_category ON files(category);
+
+CREATE INDEX IF NOT EXISTS idx_operations_status ON operations(status);
+CREATE INDEX IF NOT EXISTS idx_operations_created ON operations(created_at);
+CREATE INDEX IF NOT EXISTS idx_operations_file_id ON operations(file_id);
+
+CREATE INDEX IF NOT EXISTS idx_dedup_canonical ON deduplication_store(canonical_path);
+
+-- Functions for updating timestamps
+CREATE OR REPLACE FUNCTION update_updated_at_column()
+RETURNS TRIGGER AS $$
+BEGIN
+ NEW.updated_at = CURRENT_TIMESTAMP;
+ RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Triggers for automatic updated_at
+CREATE TRIGGER update_files_updated_at BEFORE UPDATE ON files
+ FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
+
+-- View for operational dashboard
+CREATE OR REPLACE VIEW operational_dashboard AS
+SELECT
+ o.status,
+ COUNT(*) as operation_count,
+ SUM(o.bytes_processed) as total_bytes,
+ AVG(o.duration_ms) as avg_duration_ms,
+ MIN(o.started_at) as earliest_operation,
+ MAX(o.completed_at) as latest_operation
+FROM operations o
+WHERE o.started_at > CURRENT_TIMESTAMP - INTERVAL '24 hours'
+GROUP BY o.status;
+
+-- View for disk usage statistics
+CREATE OR REPLACE VIEW disk_usage_stats AS
+SELECT
+ disk_label,
+ COUNT(*) as file_count,
+ SUM(size) as total_size,
+ AVG(size) as avg_file_size,
+ MIN(created_time) as oldest_file,
+ MAX(modified_time) as newest_file
+FROM files
+GROUP BY disk_label;
+
+-- Insert default configuration
+INSERT INTO migration_plans (name, source_disk, target_disk, plan_json, status)
+VALUES (
+ 'Default Migration Plan',
+ 'disk_d',
+ 'disk_e',
+ '{"strategy": "hardlink", "verify_copies": true, "preserve_timestamps": true}'::jsonb,
+ 'draft'
+) ON CONFLICT DO NOTHING;
+
+-- Create read-only user for monitoring
+DO $$
+BEGIN
+ IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'monitor_user') THEN
+ CREATE USER monitor_user WITH PASSWORD 'monitor_password';
+ END IF;
+END
+$$;
+
+GRANT CONNECT ON DATABASE disk_reorganizer_db TO monitor_user;
+GRANT USAGE ON SCHEMA public TO monitor_user;
+GRANT SELECT ON ALL TABLES IN SCHEMA public TO monitor_user;
+GRANT SELECT ON operational_dashboard TO monitor_user;
+GRANT SELECT ON disk_usage_stats TO monitor_user;
\ No newline at end of file
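
The `unique_file_path` constraint is what makes re-indexing idempotent: an indexer can upsert on `path` instead of checking for an existing row first. Below is a minimal psycopg2 sketch of that upsert plus a read from the `disk_usage_stats` view; connection values are the ones docker-compose.yml sets, and the function names and sample values are illustrative.

```python
# Minimal sketch: upsert one indexed file row, then read the disk_usage_stats view.
import psycopg2


def upsert_file(conn, path: str, size: int, file_hash: str, disk_label: str) -> None:
    with conn.cursor() as cur:
        cur.execute(
            """
            INSERT INTO files (path, size, file_hash, disk_label)
            VALUES (%s, %s, %s, %s)
            ON CONFLICT ON CONSTRAINT unique_file_path
            DO UPDATE SET size = EXCLUDED.size,
                          file_hash = EXCLUDED.file_hash,
                          disk_label = EXCLUDED.disk_label
            """,  # updated_at is refreshed by the update_files_updated_at trigger
            (path, size, file_hash, disk_label),
        )
    conn.commit()


def disk_usage(conn):
    with conn.cursor() as cur:
        cur.execute("SELECT disk_label, file_count, total_size FROM disk_usage_stats")
        return cur.fetchall()


if __name__ == "__main__":
    connection = psycopg2.connect(
        host="localhost", port=5432, dbname="disk_reorganizer_db",
        user="disk_reorg_user", password="heel-goed-wachtwoord",
    )
    upsert_file(connection, "/mnt/source/example.bin", 1024, "0" * 64, "disk_d")
    print(disk_usage(connection))
    connection.close()
```
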
diff --git a/src/setup.py b/src/setup.py
new file mode 100644
index 0000000..1556dfd
--- /dev/null
+++ b/src/setup.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""Setup script for defrag disk reorganizer"""
+from setuptools import setup, find_packages
+from pathlib import Path
+
+# Read requirements from the repository root (this file lives in src/)
+requirements_path = Path(__file__).resolve().parent.parent / 'requirements.txt'
+with open(requirements_path) as f:
+ requirements = [
+ line.strip()
+ for line in f
+ if line.strip() and not line.startswith('#')
+ ]
+
+# Read long description from the README at the repository root
+readme_path = Path(__file__).resolve().parent.parent / 'README.md'
+long_description = ""
+if readme_path.exists():
+ with open(readme_path) as f:
+ long_description = f.read()
+
+setup(
+ name='defrag',
+ version='1.0.0',
+ description='Intelligent disk reorganization system for 20TB+ data with deduplication and classification',
+ long_description=long_description,
+ long_description_content_type='text/markdown',
+ author='Project Defrag',
+ author_email='defrag@example.com',
+ url='https://github.com/yourusername/defrag',
+ packages=find_packages(),
+ install_requires=requirements,
+ python_requires='>=3.9',
+ entry_points={
+ 'console_scripts': [
+ 'defrag=main:main',
+ ],
+ },
+ classifiers=[
+ 'Development Status :: 4 - Beta',
+ 'Intended Audience :: System Administrators',
+ 'Topic :: System :: Filesystems',
+ 'License :: OSI Approved :: MIT License',
+ 'Programming Language :: Python :: 3',
+ 'Programming Language :: Python :: 3.9',
+ 'Programming Language :: Python :: 3.10',
+ 'Programming Language :: Python :: 3.11',
+ 'Programming Language :: Python :: 3.12',
+ ],
+ keywords='disk management storage deduplication classification migration',
+)