diff --git a/.aiignore b/.aiignore
new file mode 100644
index 0000000..772cc80
--- /dev/null
+++ b/.aiignore
@@ -0,0 +1,18 @@
+.DS_Store
+*.log
+*.tmp
+dist/
+build/
+out/
+.idea
+node_modules/
+.vscode/
+.git
+.github
+scripts
+.pytest_cache/
+__pycache__
+.aiignore
+*.iml
+.env
+.bundle.md
\ No newline at end of file
diff --git a/.idea/dataSources.xml b/.idea/dataSources.xml
new file mode 100644
index 0000000..9118540
--- /dev/null
+++ b/.idea/dataSources.xml
@@ -0,0 +1,12 @@
+
+
+
+
+      postgresql
+      true
+      org.postgresql.Driver
+      jdbc:postgresql://192.168.1.159:5432/disk_reorganizer_db
+      $ProjectFileDir$
+
+
+
\ No newline at end of file
diff --git a/.idea/data_source_mapping.xml b/.idea/data_source_mapping.xml
new file mode 100644
index 0000000..90e17e1
--- /dev/null
+++ b/.idea/data_source_mapping.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/defrag.iml b/.idea/defrag.iml
index 3b7c44a..651d017 100644
--- a/.idea/defrag.iml
+++ b/.idea/defrag.iml
@@ -3,6 +3,9 @@
+
+
+
diff --git a/.idea/sqldialects.xml b/.idea/sqldialects.xml
new file mode 100644
index 0000000..d0fd9f9
--- /dev/null
+++ b/.idea/sqldialects.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..55016b6
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,38 @@
+# Dockerfile for Project Defrag with PostgreSQL integration
+FROM python:3.11-slim
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    gcc \
+    g++ \
+    libpq-dev \
+    postgresql-client \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set working directory
+WORKDIR /app
+
+# Set environment variables
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PYTHONPATH=/app
+
+# Install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
+
+# Copy application code
+COPY . .
+
+# Create non-root user
+RUN useradd -m -u 1000 appuser && \
+    chown -R appuser:appuser /app
+USER appuser
+
+# Health check (uses the DB_* variables provided by docker-compose)
+HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
+    CMD python -c "import psycopg2; psycopg2.connect(dbname='${DB_NAME:-disk_reorganizer_db}', user='${DB_USER:-disk_reorg_user}', password='${DB_PASSWORD}', host='${DB_HOST:-db}', port='${DB_PORT:-5432}')" || exit 1
+
+# Default command (can be overridden in docker-compose)
+CMD ["python", "main.py", "--help"]
\ No newline at end of file
diff --git a/docker-compose.override.yml b/docker-compose.override.yml
new file mode 100644
index 0000000..c0a3559
--- /dev/null
+++ b/docker-compose.override.yml
@@ -0,0 +1,20 @@
+services:
+  app:
+    environment:
+      - LOG_LEVEL=DEBUG
+      - PYTHONPATH=/app
+    volumes:
+      - .:/app
+      - /var/run/docker.sock:/var/run/docker.sock
+    ports:
+      - "8000:8000"
+    command: uvicorn main:app --host 0.0.0.0 --port 8000 --reload
+
+  postgres:
+    environment:
+      - POSTGRES_LOG_STATEMENT=all
+    ports:
+      - "5433:5432" # Different port to avoid conflict with host PostgreSQL
+
+  redis:
+    command: redis-server --appendonly yes --loglevel verbose
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..4a40bbc
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,245 @@
+services:
+  # PostgreSQL Database
+  postgres:
+    image: postgres:15-alpine
+    container_name: project_defrag_db
+    environment:
+      POSTGRES_USER: disk_reorg_user
+      POSTGRES_PASSWORD: heel-goed-wachtwoord
+      POSTGRES_DB: disk_reorganizer_db
+      POSTGRES_INITDB_ARGS: "--encoding=UTF8 --locale=C"
+    volumes:
+      - postgres_data:/var/lib/postgresql/data
+      - ./sql/init.sql:/docker-entrypoint-initdb.d/init.sql
+      - ./sql/migrations:/docker-entrypoint-initdb.d/migrations
+    ports:
+      - "5432:5432"
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U disk_reorg_user -d disk_reorganizer_db"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+    networks:
+      - defrag-network
+
+  # Redis for deduplication hash store (optional)
+  redis:
+    image: redis:7-alpine
+    container_name: project_defrag_redis
+    command: redis-server --appendonly yes
+    volumes:
+      - redis_data:/data
+    ports:
+      - "6379:6379"
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+    networks:
+      - defrag-network
+
+  # Application Service
+  app:
+    build: .
+    container_name: project_defrag_app
+    depends_on:
+      postgres:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+    environment:
+      # Database Configuration
+      DB_HOST: postgres
+      DB_PORT: 5432
+      DB_NAME: disk_reorganizer_db
+      DB_USER: disk_reorg_user
+      DB_PASSWORD: heel-goed-wachtwoord
+
+      # Redis Configuration
+      REDIS_HOST: redis
+      REDIS_PORT: 6379
+
+      # Application Configuration
+      LOG_LEVEL: INFO
+      MAX_WORKERS: 4
+      CHUNK_SIZE_KB: 64
+
+      # Mount points (set these when running specific commands)
+      SOURCE_MOUNT: /mnt/source
+      TARGET_MOUNT: /mnt/target
+    volumes:
+      # Mount host directories for file operations
+      - ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source:ro
+      - ${HOST_TARGET_PATH:-/mnt/target}:/mnt/target
+
+      # Mount for configuration and plans
+      - ./config:/app/config
+      - ./plans:/app/plans
+      - ./logs:/app/logs
+
+      # Bind mount for development (optional)
+      - .:/app
+    networks:
+      - defrag-network
+    profiles:
+      - full-cycle
+      - development
+    # Uncomment for development with hot reload
+    # command: watchmedo auto-restart --pattern="*.py" --recursive -- python main.py
+
+  # Single command services for specific operations
+  index:
+    build: .
+    container_name: defrag_index
+    depends_on:
+      postgres:
+        condition: service_healthy
+    environment:
+      DB_HOST: postgres
+      DB_PORT: 5432
+      DB_NAME: disk_reorganizer_db
+      DB_USER: disk_reorg_user
+      DB_PASSWORD: heel-goed-wachtwoord
+    volumes:
+      - ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source:ro
+      - ./config:/app/config
+      - ./logs:/app/logs
+    command: ["python", "main.py", "index", "/mnt/source", "disk_d"]
+    profiles:
+      - index-only
+    networks:
+      - defrag-network
+
+  plan:
+    build: .
+    container_name: defrag_plan
+    depends_on:
+      postgres:
+        condition: service_healthy
+    environment:
+      DB_HOST: postgres
+      DB_PORT: 5432
+      DB_NAME: disk_reorganizer_db
+      DB_USER: disk_reorg_user
+      DB_PASSWORD: heel-goed-wachtwoord
+    volumes:
+      - ./config:/app/config
+      - ./plans:/app/plans
+      - ./logs:/app/logs
+    command: ["python", "main.py", "plan", "disk_d", "disk_e"]
+    profiles:
+      - plan-only
+    networks:
+      - defrag-network
+
+  execute:
+    build: .
+    container_name: defrag_execute
+    depends_on:
+      postgres:
+        condition: service_healthy
+    environment:
+      DB_HOST: postgres
+      DB_PORT: 5432
+      DB_NAME: disk_reorganizer_db
+      DB_USER: disk_reorg_user
+      DB_PASSWORD: heel-goed-wachtwoord
+    volumes:
+      - ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source
+      - ${HOST_TARGET_PATH:-/mnt/target}:/mnt/target
+      - ./plans:/app/plans
+      - ./config:/app/config
+      - ./logs:/app/logs
+    command: ["python", "main.py", "execute", "/app/plans/plan.json"]
+    profiles:
+      - execute-only
+    networks:
+      - defrag-network
+
+  dry-run:
+    build: .
+    container_name: defrag_dry_run
+    depends_on:
+      postgres:
+        condition: service_healthy
+    environment:
+      DB_HOST: postgres
+      DB_PORT: 5432
+      DB_NAME: disk_reorganizer_db
+      DB_USER: disk_reorg_user
+      DB_PASSWORD: heel-goed-wachtwoord
+    volumes:
+      - ./plans:/app/plans
+      - ./config:/app/config
+      - ./logs:/app/logs
+    command: ["python", "main.py", "execute", "/app/plans/plan.json", "--dry-run"]
+    profiles:
+      - dry-run-only
+    networks:
+      - defrag-network
+
+  report:
+    build: .
+    container_name: defrag_report
+    depends_on:
+      postgres:
+        condition: service_healthy
+    environment:
+      DB_HOST: postgres
+      DB_PORT: 5432
+      DB_NAME: disk_reorganizer_db
+      DB_USER: disk_reorg_user
+      DB_PASSWORD: heel-goed-wachtwoord
+    volumes:
+      - ./reports:/app/reports
+      - ./logs:/app/logs
+    command: ["python", "main.py", "report", "--format", "html"]
+    profiles:
+      - report-only
+    networks:
+      - defrag-network
+
+  # Monitoring and Admin Services
+  pgadmin:
+    image: dpage/pgadmin4:latest
+    container_name: defrag_pgadmin
+    environment:
+      PGADMIN_DEFAULT_EMAIL: admin@defrag.local
+      PGADMIN_DEFAULT_PASSWORD: admin123
+    volumes:
+      - pgadmin_data:/var/lib/pgadmin
+    ports:
+      - "5050:80"
+    depends_on:
+      - postgres
+    profiles:
+      - monitoring
+    networks:
+      - defrag-network
+
+  redis-commander:
+    image: rediscommander/redis-commander:latest
+    container_name: defrag_redis_commander
+    environment:
+      REDIS_HOSTS: local:redis:6379
+    ports:
+      - "8081:8081"
+    depends_on:
+      - redis
+    profiles:
+      - monitoring
+    networks:
+      - defrag-network
+
+networks:
+  defrag-network:
+    driver: bridge
+
+volumes:
+  postgres_data:
+    driver: local
+  redis_data:
+    driver: local
+  pgadmin_data:
+    driver: local
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..27d2803
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,74 @@
+[build-system]
+requires = ["setuptools>=65.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "defrag"
+version = "1.0.0"
+description = "Intelligent disk reorganization system for 20TB+ data"
+readme = "README.md"
+requires-python = ">=3.9"
+license = {text = "MIT"}
+authors = [
+    {name = "Project Defrag"}
+]
+keywords = ["disk", "storage", "deduplication", "classification", "migration"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: System Administrators",
+    "Topic :: System :: Filesystems",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+]
+
+dependencies = [
+    "psycopg2-binary>=2.9.0",
+    "psutil>=5.9.0",
+    "pandas>=1.5.0",
+    "pyarrow>=10.0.0",
+    "python-magic>=0.4.27",
+]
+
+[project.optional-dependencies]
+redis = ["redis>=4.5.0"]
+ml = ["scikit-learn>=1.2.0", "numpy>=1.24.0"]
+dev = [
+    "pytest>=7.2.0",
+    "pytest-cov>=4.0.0",
+    "black>=23.0.0",
+    "mypy>=1.0.0",
+    "flake8>=6.0.0",
+]
+all = [
+    "redis>=4.5.0",
+    "scikit-learn>=1.2.0",
+    "numpy>=1.24.0",
+]
+
+[project.scripts]
+defrag = "main:main"
+
+[tool.black]
+line-length = 100
+target-version = ['py39', 'py310', 'py311', 'py312']
+include = '\.pyi?$'
+
+[tool.mypy]
+python_version = "3.9"
+warn_return_any = true
+warn_unused_configs = true
+disallow_untyped_defs = false
+disallow_incomplete_defs = false
+check_untyped_defs = true
+no_implicit_optional = true
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
+addopts = "-v --cov=. --cov-report=html --cov-report=term"
diff --git a/requirements.txt b/requirements.txt
index e0ab347..f838a8a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,3 +15,25 @@ pytest-cov>=4.0.0
 black>=22.0.0
 mypy>=0.950
 flake8>=5.0.0
+# Core dependencies
+psycopg2-binary>=2.9.0
+psutil>=5.9.0
+
+# Data processing
+pandas>=1.5.0
+pyarrow>=10.0.0
+
+# File type detection
+python-magic>=0.4.27
+
+# Optional dependencies
+redis>=4.5.0 # For RedisHashStore (optional)
+scikit-learn>=1.2.0 # For MLClassifier (optional)
+numpy>=1.24.0 # For MLClassifier (optional)
+
+# Development dependencies
+pytest>=7.2.0
+pytest-cov>=4.0.0
+black>=23.0.0
+mypy>=1.0.0
+flake8>=6.0.0
diff --git a/sql/init.sql b/sql/init.sql
new file mode 100644
index 0000000..c175132
--- /dev/null
+++ b/sql/init.sql
@@ -0,0 +1,164 @@
+-- sql/init.sql
+-- Initialize PostgreSQL database for Project Defrag
+
+-- Enable useful extensions
+CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
+CREATE EXTENSION IF NOT EXISTS "pgcrypto";
+
+-- Files table
+CREATE TABLE IF NOT EXISTS files (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    path TEXT NOT NULL,
+    size BIGINT NOT NULL,
+    modified_time TIMESTAMP WITH TIME ZONE,
+    created_time TIMESTAMP WITH TIME ZONE,
+    file_hash VARCHAR(64), -- SHA-256 hash
+    category VARCHAR(50),
+    disk_label VARCHAR(50),
+    last_verified TIMESTAMP WITH TIME ZONE,
+
+    -- Metadata
+    metadata JSONB DEFAULT '{}',
+
+    -- Audit fields
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+
+    -- Constraints
+    CONSTRAINT unique_file_path UNIQUE(path)
+);
+
+-- Operations table (audit log)
+CREATE TABLE IF NOT EXISTS operations (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    operation_type VARCHAR(50) NOT NULL,
+    source_path TEXT,
+    target_path TEXT,
+    status VARCHAR(20) NOT NULL,
+
+    -- File reference
+    file_id UUID REFERENCES files(id) ON DELETE SET NULL,
+
+    -- Performance metrics
+    duration_ms INTEGER,
+    bytes_processed BIGINT,
+
+    -- Error information
+    error_message TEXT,
+    error_details JSONB,
+
+    -- Context
+    session_id VARCHAR(100),
+    user_agent TEXT,
+
+    -- Audit fields
+    started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    completed_at TIMESTAMP WITH TIME ZONE,
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+);
+
+-- Deduplication hash store
+CREATE TABLE IF NOT EXISTS deduplication_store (
+    hash VARCHAR(64) PRIMARY KEY,
+    canonical_path TEXT NOT NULL,
+    reference_count INTEGER DEFAULT 1,
+    first_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    last_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+);
+
+-- Migration plan table
+CREATE TABLE IF NOT EXISTS migration_plans (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    name VARCHAR(100) NOT NULL,
+    source_disk VARCHAR(50) NOT NULL,
+    target_disk VARCHAR(50) NOT NULL,
+    plan_json JSONB NOT NULL,
+
+    -- Statistics
+    total_files INTEGER DEFAULT 0,
+    total_size BIGINT DEFAULT 0,
+    estimated_duration INTEGER, -- in seconds
+
+    -- Status
+    status VARCHAR(20) DEFAULT 'draft',
+
+    -- Audit
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    executed_at TIMESTAMP WITH TIME ZONE,
+    completed_at TIMESTAMP WITH TIME ZONE
+);
+
+-- Indexes for performance
+CREATE INDEX IF NOT EXISTS idx_files_path ON files(path);
+CREATE INDEX IF NOT EXISTS idx_files_hash ON files(file_hash);
+CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label);
+CREATE INDEX IF NOT EXISTS idx_files_category ON files(category);
+
+CREATE INDEX IF NOT EXISTS idx_operations_status ON operations(status);
+CREATE INDEX IF NOT EXISTS idx_operations_created ON operations(created_at);
+CREATE INDEX IF NOT EXISTS idx_operations_file_id ON operations(file_id);
+
+CREATE INDEX IF NOT EXISTS idx_dedup_canonical ON deduplication_store(canonical_path);
+
+-- Functions for updating timestamps
+CREATE OR REPLACE FUNCTION update_updated_at_column()
+RETURNS TRIGGER AS $$
+BEGIN
+    NEW.updated_at = CURRENT_TIMESTAMP;
+    RETURN NEW;
+END;
+$$ language 'plpgsql';
+
+-- Triggers for automatic updated_at
+CREATE TRIGGER update_files_updated_at BEFORE UPDATE ON files
+    FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
+
+-- View for operational dashboard
+CREATE OR REPLACE VIEW operational_dashboard AS
+SELECT
+    o.status,
+    COUNT(*) as operation_count,
+    SUM(o.bytes_processed) as total_bytes,
+    AVG(o.duration_ms) as avg_duration_ms,
+    MIN(o.started_at) as earliest_operation,
+    MAX(o.completed_at) as latest_operation
+FROM operations o
+WHERE o.started_at > CURRENT_TIMESTAMP - INTERVAL '24 hours'
+GROUP BY o.status;
+
+-- View for disk usage statistics
+CREATE OR REPLACE VIEW disk_usage_stats AS
+SELECT
+    disk_label,
+    COUNT(*) as file_count,
+    SUM(size) as total_size,
+    AVG(size) as avg_file_size,
+    MIN(created_time) as oldest_file,
+    MAX(modified_time) as newest_file
+FROM files
+GROUP BY disk_label;
+
+-- Insert default configuration
+INSERT INTO migration_plans (name, source_disk, target_disk, plan_json, status)
+VALUES (
+    'Default Migration Plan',
+    'disk_d',
+    'disk_e',
+    '{"strategy": "hardlink", "verify_copies": true, "preserve_timestamps": true}'::jsonb,
+    'draft'
+) ON CONFLICT DO NOTHING;
+
+-- Create read-only user for monitoring
+DO $$
+BEGIN
+    IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'monitor_user') THEN
+        CREATE USER monitor_user WITH PASSWORD 'monitor_password';
+    END IF;
+END
+$$;
+
+GRANT CONNECT ON DATABASE disk_reorganizer_db TO monitor_user;
+GRANT USAGE ON SCHEMA public TO monitor_user;
+GRANT SELECT ON ALL TABLES IN SCHEMA public TO monitor_user;
+GRANT SELECT ON operational_dashboard TO monitor_user;
+GRANT SELECT ON disk_usage_stats TO monitor_user;
\ No newline at end of file
diff --git a/src/setup.py b/src/setup.py
new file mode 100644
index 0000000..1556dfd
--- /dev/null
+++ b/src/setup.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""Setup script for defrag disk reorganizer"""
+from setuptools import setup, find_packages
+from pathlib import Path
+
+# Read requirements
+requirements_path = Path(__file__).parent / 'requirements.txt'
+with open(requirements_path) as f:
+    requirements = [
+        line.strip()
+        for line in f
+        if line.strip() and not line.startswith('#')
+    ]
+
+# Read long description from README
+readme_path = Path(__file__).parent / 'README.md'
+long_description = ""
+if readme_path.exists():
+    with open(readme_path) as f:
+        long_description = f.read()
+
+setup(
+    name='defrag',
+    version='1.0.0',
+    description='Intelligent disk reorganization system for 20TB+ data with deduplication and classification',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    author='Project Defrag',
+    author_email='defrag@example.com',
+    url='https://github.com/yourusername/defrag',
+    packages=find_packages(),
+    install_requires=requirements,
+    python_requires='>=3.9',
+    entry_points={
+        'console_scripts': [
+            'defrag=main:main',
+        ],
+    },
+    classifiers=[
+        'Development Status :: 4 - Beta',
+        'Intended Audience :: System Administrators',
+        'Topic :: System :: Filesystems',
+        'License :: OSI Approved :: MIT License',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3.10',
+        'Programming Language :: Python :: 3.11',
+        'Programming Language :: Python :: 3.12',
+    ],
+    keywords='disk management storage deduplication classification migration',
+)