mike
2025-12-12 05:40:33 +01:00
parent 0eba1619cf
commit fb675746ee
12 changed files with 661 additions and 0 deletions

18
.aiignore Normal file

@@ -0,0 +1,18 @@
.DS_Store
*.log
*.tmp
dist/
build/
out/
.idea
node_modules/
.vscode/
.git
.github
scripts
.pytest_cache/
__pycache__
.aiignore
*.iml
.env
.bundle.md

12
.idea/dataSources.xml generated Normal file

@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
    <data-source source="LOCAL" name="disk_reorganizer_db@192.168.1.159" uuid="40177905-314d-45ca-b0d0-ae9d40009a0c">
      <driver-ref>postgresql</driver-ref>
      <synchronize>true</synchronize>
      <jdbc-driver>org.postgresql.Driver</jdbc-driver>
      <jdbc-url>jdbc:postgresql://192.168.1.159:5432/disk_reorganizer_db</jdbc-url>
      <working-dir>$ProjectFileDir$</working-dir>
    </data-source>
  </component>
</project>

7
.idea/data_source_mapping.xml generated Normal file

@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="DataSourcePerFileMappings">
    <file url="file://$PROJECT_DIR$/.idea/queries/Query.sql" value="40177905-314d-45ca-b0d0-ae9d40009a0c" />
    <file url="file://$PROJECT_DIR$/setup_database.sql" value="40177905-314d-45ca-b0d0-ae9d40009a0c" />
  </component>
</project>

3
.idea/defrag.iml generated

@@ -3,6 +3,9 @@
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
      <excludeFolder url="file://$MODULE_DIR$/.git" />
      <excludeFolder url="file://$MODULE_DIR$/.idea/dataSources" />
      <excludeFolder url="file://$MODULE_DIR$/.idea/queries" />
    </content>
    <orderEntry type="jdk" jdkName="Python 3.13 (.venv)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />

7
.idea/sqldialects.xml generated Normal file

@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="SqlDialectMappings">
    <file url="file://$PROJECT_DIR$/setup_database.sql" dialect="PostgreSQL" />
    <file url="PROJECT" dialect="PostgreSQL" />
  </component>
</project>

38
Dockerfile Normal file

@@ -0,0 +1,38 @@
# Dockerfile for Project Defrag with PostgreSQL integration
FROM python:3.11-slim

# Install system dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    libpq-dev \
    postgresql-client \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONPATH=/app

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create non-root user
RUN useradd -m -u 1000 appuser && \
    chown -R appuser:appuser /app
USER appuser

# Health check: attempt a connection with the same DB_* variables the compose services set
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD python -c "import psycopg2; psycopg2.connect(dbname='${DB_NAME:-disk_reorganizer_db}', user='${DB_USER:-disk_reorg_user}', password='${DB_PASSWORD}', host='${DB_HOST:-postgres}', port='${DB_PORT:-5432}')" || exit 1

# Default command (can be overridden in docker-compose)
CMD ["python", "main.py", "--help"]

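The health check above compresses a full connection attempt into one shell line. For reference, here is the same logic as a minimal standalone sketch, assuming the DB_* variables from docker-compose.yml; this script is illustrative and not part of the commit:

import os
import sys

import psycopg2  # same driver the Dockerfile installs via requirements.txt

def check_db() -> bool:
    """One connection attempt using the DB_* settings from docker-compose.yml."""
    try:
        conn = psycopg2.connect(
            dbname=os.getenv("DB_NAME", "disk_reorganizer_db"),
            user=os.getenv("DB_USER", "disk_reorg_user"),
            password=os.getenv("DB_PASSWORD", ""),
            host=os.getenv("DB_HOST", "postgres"),
            port=os.getenv("DB_PORT", "5432"),
            connect_timeout=5,
        )
        conn.close()
        return True
    except psycopg2.OperationalError:
        return False

if __name__ == "__main__":
    sys.exit(0 if check_db() else 1)
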
20
docker-compose.override.yml Normal file

@@ -0,0 +1,20 @@
services:
  app:
    environment:
      - LOG_LEVEL=DEBUG
      - PYTHONPATH=/app
    volumes:
      - .:/app
      - /var/run/docker.sock:/var/run/docker.sock
    ports:
      - "8000:8000"
    command: uvicorn main:app --host 0.0.0.0 --port 8000 --reload

  postgres:
    environment:
      - POSTGRES_LOG_STATEMENT=all
    ports:
      - "5433:5432"  # Different port to avoid conflict with host PostgreSQL

  redis:
    command: redis-server --appendonly yes --loglevel verbose

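Because Compose merges docker-compose.override.yml over docker-compose.yml automatically when both sit in the project root, a plain docker compose up picks up the debug logging, the code bind mount, and the alternate 5433 host port without extra flags.
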
245
docker-compose.yml Normal file

@@ -0,0 +1,245 @@
services:
  # PostgreSQL Database
  postgres:
    image: postgres:15-alpine
    container_name: project_defrag_db
    environment:
      POSTGRES_USER: disk_reorg_user
      POSTGRES_PASSWORD: heel-goed-wachtwoord
      POSTGRES_DB: disk_reorganizer_db
      POSTGRES_INITDB_ARGS: "--encoding=UTF8 --locale=C"
    volumes:
      - postgres_data:/var/lib/postgresql/data
      - ./sql/init.sql:/docker-entrypoint-initdb.d/init.sql
      - ./sql/migrations:/docker-entrypoint-initdb.d/migrations
    ports:
      - "5432:5432"
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U disk_reorg_user -d disk_reorganizer_db"]
      interval: 10s
      timeout: 5s
      retries: 5
    networks:
      - defrag-network

  # Redis for deduplication hash store (optional)
  redis:
    image: redis:7-alpine
    container_name: project_defrag_redis
    command: redis-server --appendonly yes
    volumes:
      - redis_data:/data
    ports:
      - "6379:6379"
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5
    networks:
      - defrag-network

  # Application Service
  app:
    build: .
    container_name: project_defrag_app
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
    environment:
      # Database Configuration
      DB_HOST: postgres
      DB_PORT: 5432
      DB_NAME: disk_reorganizer_db
      DB_USER: disk_reorg_user
      DB_PASSWORD: heel-goed-wachtwoord
      # Redis Configuration
      REDIS_HOST: redis
      REDIS_PORT: 6379
      # Application Configuration
      LOG_LEVEL: INFO
      MAX_WORKERS: 4
      CHUNK_SIZE_KB: 64
      # Mount points (set these when running specific commands)
      SOURCE_MOUNT: /mnt/source
      TARGET_MOUNT: /mnt/target
    volumes:
      # Mount host directories for file operations
      - ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source:ro
      - ${HOST_TARGET_PATH:-/mnt/target}:/mnt/target
      # Mount for configuration and plans
      - ./config:/app/config
      - ./plans:/app/plans
      - ./logs:/app/logs
      # Bind mount for development (optional)
      - .:/app
    networks:
      - defrag-network
    profiles:
      - full-cycle
      - development
    # Uncomment for development with hot reload
    # command: watchmedo auto-restart --pattern="*.py" --recursive -- python main.py

  # Single command services for specific operations
  index:
    build: .
    container_name: defrag_index
    depends_on:
      postgres:
        condition: service_healthy
    environment:
      DB_HOST: postgres
      DB_PORT: 5432
      DB_NAME: disk_reorganizer_db
      DB_USER: disk_reorg_user
      DB_PASSWORD: heel-goed-wachtwoord
    volumes:
      - ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source:ro
      - ./config:/app/config
      - ./logs:/app/logs
    command: ["python", "main.py", "index", "/mnt/source", "disk_d"]
    profiles:
      - index-only
    networks:
      - defrag-network

  plan:
    build: .
    container_name: defrag_plan
    depends_on:
      postgres:
        condition: service_healthy
    environment:
      DB_HOST: postgres
      DB_PORT: 5432
      DB_NAME: disk_reorganizer_db
      DB_USER: disk_reorg_user
      DB_PASSWORD: heel-goed-wachtwoord
    volumes:
      - ./config:/app/config
      - ./plans:/app/plans
      - ./logs:/app/logs
    command: ["python", "main.py", "plan", "disk_d", "disk_e"]
    profiles:
      - plan-only
    networks:
      - defrag-network

  execute:
    build: .
    container_name: defrag_execute
    depends_on:
      postgres:
        condition: service_healthy
    environment:
      DB_HOST: postgres
      DB_PORT: 5432
      DB_NAME: disk_reorganizer_db
      DB_USER: disk_reorg_user
      DB_PASSWORD: heel-goed-wachtwoord
    volumes:
      - ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source
      - ${HOST_TARGET_PATH:-/mnt/target}:/mnt/target
      - ./plans:/app/plans
      - ./config:/app/config
      - ./logs:/app/logs
    command: ["python", "main.py", "execute", "/app/plans/plan.json"]
    profiles:
      - execute-only
    networks:
      - defrag-network

  dry-run:
    build: .
    container_name: defrag_dry_run
    depends_on:
      postgres:
        condition: service_healthy
    environment:
      DB_HOST: postgres
      DB_PORT: 5432
      DB_NAME: disk_reorganizer_db
      DB_USER: disk_reorg_user
      DB_PASSWORD: heel-goed-wachtwoord
    volumes:
      - ./plans:/app/plans
      - ./config:/app/config
      - ./logs:/app/logs
    command: ["python", "main.py", "execute", "/app/plans/plan.json", "--dry-run"]
    profiles:
      - dry-run-only
    networks:
      - defrag-network

  report:
    build: .
    container_name: defrag_report
    depends_on:
      postgres:
        condition: service_healthy
    environment:
      DB_HOST: postgres
      DB_PORT: 5432
      DB_NAME: disk_reorganizer_db
      DB_USER: disk_reorg_user
      DB_PASSWORD: heel-goed-wachtwoord
    volumes:
      - ./reports:/app/reports
      - ./logs:/app/logs
    command: ["python", "main.py", "report", "--format", "html"]
    profiles:
      - report-only
    networks:
      - defrag-network

  # Monitoring and Admin Services
  pgadmin:
    image: dpage/pgadmin4:latest
    container_name: defrag_pgadmin
    environment:
      PGADMIN_DEFAULT_EMAIL: admin@defrag.local
      PGADMIN_DEFAULT_PASSWORD: admin123
    volumes:
      - pgadmin_data:/var/lib/pgadmin
    ports:
      - "5050:80"
    depends_on:
      - postgres
    profiles:
      - monitoring
    networks:
      - defrag-network

  redis-commander:
    image: rediscommander/redis-commander:latest
    container_name: defrag_redis_commander
    environment:
      REDIS_HOSTS: local:redis:6379
    ports:
      - "8081:8081"
    depends_on:
      - redis
    profiles:
      - monitoring
    networks:
      - defrag-network

networks:
  defrag-network:
    driver: bridge

volumes:
  postgres_data:
    driver: local
  redis_data:
    driver: local
  pgadmin_data:
    driver: local

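requirements.txt flags the redis dependency as the backing for an optional RedisHashStore. A minimal sketch of how such a store could sit on the redis service above; the class name comes from the requirements comment, while the method names and key scheme are illustrative:

import os
from typing import Optional

import redis  # optional dependency: redis>=4.5.0

class RedisHashStore:
    """Illustrative hash -> canonical-path store backed by the redis service above."""

    def __init__(self) -> None:
        self._r = redis.Redis(
            host=os.getenv("REDIS_HOST", "redis"),
            port=int(os.getenv("REDIS_PORT", "6379")),
            decode_responses=True,
        )

    def add(self, file_hash: str, path: str) -> bool:
        # SET ... NX succeeds only for the first writer, so False marks a duplicate
        return bool(self._r.set(f"hash:{file_hash}", path, nx=True))

    def canonical_path(self, file_hash: str) -> Optional[str]:
        return self._r.get(f"hash:{file_hash}")
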
74
pyproject.toml Normal file

@@ -0,0 +1,74 @@
[build-system]
requires = ["setuptools>=65.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "defrag"
version = "1.0.0"
description = "Intelligent disk reorganization system for 20TB+ data"
readme = "README.md"
requires-python = ">=3.9"
license = {text = "MIT"}
authors = [
    {name = "Project Defrag"}
]
keywords = ["disk", "storage", "deduplication", "classification", "migration"]
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: System Administrators",
    "Topic :: System :: Filesystems",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
]
dependencies = [
    "psycopg2-binary>=2.9.0",
    "psutil>=5.9.0",
    "pandas>=1.5.0",
    "pyarrow>=10.0.0",
    "python-magic>=0.4.27",
]

[project.optional-dependencies]
redis = ["redis>=4.5.0"]
ml = ["scikit-learn>=1.2.0", "numpy>=1.24.0"]
dev = [
    "pytest>=7.2.0",
    "pytest-cov>=4.0.0",
    "black>=23.0.0",
    "mypy>=1.0.0",
    "flake8>=6.0.0",
]
all = [
    "redis>=4.5.0",
    "scikit-learn>=1.2.0",
    "numpy>=1.24.0",
]

[project.scripts]
defrag = "main:main"

[tool.black]
line-length = 100
target-version = ['py39', 'py310', 'py311', 'py312']
include = '\.pyi?$'

[tool.mypy]
python_version = "3.9"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = false
disallow_incomplete_defs = false
check_untyped_defs = true
no_implicit_optional = true

[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
addopts = "-v --cov=. --cov-report=html --cov-report=term"

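These extras keep installs lean: pip install -e ".[redis]" pulls in just the hash-store backend, ".[ml]" the classifier stack, ".[dev]" the test and lint tools, and ".[all]" both runtime extras without the dev tooling.
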
22
requirements.txt

@@ -15,3 +15,25 @@ pytest-cov>=4.0.0
black>=22.0.0
mypy>=0.950
flake8>=5.0.0
# Core dependencies
psycopg2-binary>=2.9.0
psutil>=5.9.0

# Data processing
pandas>=1.5.0
pyarrow>=10.0.0

# File type detection
python-magic>=0.4.27

# Optional dependencies
redis>=4.5.0  # For RedisHashStore (optional)
scikit-learn>=1.2.0  # For MLClassifier (optional)
numpy>=1.24.0  # For MLClassifier (optional)

# Development dependencies
pytest>=7.2.0
pytest-cov>=4.0.0
black>=23.0.0
mypy>=1.0.0
flake8>=6.0.0

164
sql/init.sql Normal file

@@ -0,0 +1,164 @@
-- sql/init.sql
-- Initialize PostgreSQL database for Project Defrag

-- Enable useful extensions
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS "pgcrypto";

-- Files table
CREATE TABLE IF NOT EXISTS files (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    path TEXT NOT NULL,
    size BIGINT NOT NULL,
    modified_time TIMESTAMP WITH TIME ZONE,
    created_time TIMESTAMP WITH TIME ZONE,
    file_hash VARCHAR(64), -- SHA-256 hash
    category VARCHAR(50),
    disk_label VARCHAR(50),
    last_verified TIMESTAMP WITH TIME ZONE,

    -- Metadata
    metadata JSONB DEFAULT '{}',

    -- Audit fields
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,

    -- Constraints
    CONSTRAINT unique_file_path UNIQUE(path)
);

-- Operations table (audit log)
CREATE TABLE IF NOT EXISTS operations (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    operation_type VARCHAR(50) NOT NULL,
    source_path TEXT,
    target_path TEXT,
    status VARCHAR(20) NOT NULL,

    -- File reference
    file_id UUID REFERENCES files(id) ON DELETE SET NULL,

    -- Performance metrics
    duration_ms INTEGER,
    bytes_processed BIGINT,

    -- Error information
    error_message TEXT,
    error_details JSONB,

    -- Context
    session_id VARCHAR(100),
    user_agent TEXT,

    -- Audit fields
    started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    completed_at TIMESTAMP WITH TIME ZONE,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

-- Deduplication hash store
CREATE TABLE IF NOT EXISTS deduplication_store (
    hash VARCHAR(64) PRIMARY KEY,
    canonical_path TEXT NOT NULL,
    reference_count INTEGER DEFAULT 1,
    first_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

-- Migration plan table
CREATE TABLE IF NOT EXISTS migration_plans (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    name VARCHAR(100) NOT NULL,
    source_disk VARCHAR(50) NOT NULL,
    target_disk VARCHAR(50) NOT NULL,
    plan_json JSONB NOT NULL,

    -- Statistics
    total_files INTEGER DEFAULT 0,
    total_size BIGINT DEFAULT 0,
    estimated_duration INTEGER, -- in seconds

    -- Status
    status VARCHAR(20) DEFAULT 'draft',

    -- Audit
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    executed_at TIMESTAMP WITH TIME ZONE,
    completed_at TIMESTAMP WITH TIME ZONE
);

-- Indexes for performance
CREATE INDEX IF NOT EXISTS idx_files_path ON files(path);
CREATE INDEX IF NOT EXISTS idx_files_hash ON files(file_hash);
CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label);
CREATE INDEX IF NOT EXISTS idx_files_category ON files(category);
CREATE INDEX IF NOT EXISTS idx_operations_status ON operations(status);
CREATE INDEX IF NOT EXISTS idx_operations_created ON operations(created_at);
CREATE INDEX IF NOT EXISTS idx_operations_file_id ON operations(file_id);
CREATE INDEX IF NOT EXISTS idx_dedup_canonical ON deduplication_store(canonical_path);

-- Functions for updating timestamps
CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$$ language 'plpgsql';

-- Triggers for automatic updated_at
CREATE TRIGGER update_files_updated_at BEFORE UPDATE ON files
    FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();

-- View for operational dashboard
CREATE OR REPLACE VIEW operational_dashboard AS
SELECT
    o.status,
    COUNT(*) as operation_count,
    SUM(o.bytes_processed) as total_bytes,
    AVG(o.duration_ms) as avg_duration_ms,
    MIN(o.started_at) as earliest_operation,
    MAX(o.completed_at) as latest_operation
FROM operations o
WHERE o.started_at > CURRENT_TIMESTAMP - INTERVAL '24 hours'
GROUP BY o.status;

-- View for disk usage statistics
CREATE OR REPLACE VIEW disk_usage_stats AS
SELECT
    disk_label,
    COUNT(*) as file_count,
    SUM(size) as total_size,
    AVG(size) as avg_file_size,
    MIN(created_time) as oldest_file,
    MAX(modified_time) as newest_file
FROM files
GROUP BY disk_label;

-- Insert default configuration
INSERT INTO migration_plans (name, source_disk, target_disk, plan_json, status)
VALUES (
    'Default Migration Plan',
    'disk_d',
    'disk_e',
    '{"strategy": "hardlink", "verify_copies": true, "preserve_timestamps": true}'::jsonb,
    'draft'
) ON CONFLICT DO NOTHING;

-- Create read-only user for monitoring
DO $$
BEGIN
    IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'monitor_user') THEN
        CREATE USER monitor_user WITH PASSWORD 'monitor_password';
    END IF;
END
$$;

GRANT CONNECT ON DATABASE disk_reorganizer_db TO monitor_user;
GRANT USAGE ON SCHEMA public TO monitor_user;
GRANT SELECT ON ALL TABLES IN SCHEMA public TO monitor_user;
GRANT SELECT ON operational_dashboard TO monitor_user;
GRANT SELECT ON disk_usage_stats TO monitor_user;

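Given the unique_file_path constraint, re-indexing the same tree can upsert rather than insert. A short psycopg2 sketch against the files table above; record_file is an illustrative helper, not code from this commit:

import os

import psycopg2

# Upsert keyed on the unique_file_path constraint; the update_files_updated_at
# trigger refreshes updated_at automatically on the UPDATE path.
UPSERT_SQL = """
    INSERT INTO files (path, size, file_hash, category, disk_label)
    VALUES (%s, %s, %s, %s, %s)
    ON CONFLICT (path) DO UPDATE SET
        size = EXCLUDED.size,
        file_hash = EXCLUDED.file_hash,
        category = EXCLUDED.category,
        disk_label = EXCLUDED.disk_label;
"""

def record_file(path: str, size: int, file_hash: str,
                category: str, disk_label: str) -> None:
    """Insert or refresh one files row; safe to call repeatedly while re-indexing."""
    with psycopg2.connect(
        dbname=os.getenv("DB_NAME", "disk_reorganizer_db"),
        user=os.getenv("DB_USER", "disk_reorg_user"),
        password=os.getenv("DB_PASSWORD", ""),
        host=os.getenv("DB_HOST", "postgres"),
    ) as conn, conn.cursor() as cur:
        cur.execute(UPSERT_SQL, (path, size, file_hash, category, disk_label))
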
51
src/setup.py Normal file

@@ -0,0 +1,51 @@
#!/usr/bin/env python3
"""Setup script for defrag disk reorganizer"""
from setuptools import setup, find_packages
from pathlib import Path

# Read requirements from the repository root (this file lives in src/)
requirements_path = Path(__file__).resolve().parent.parent / 'requirements.txt'
with open(requirements_path) as f:
    requirements = [
        line.strip()
        for line in f
        if line.strip() and not line.startswith('#')
    ]

# Read long description from the README at the repository root
readme_path = Path(__file__).resolve().parent.parent / 'README.md'
long_description = ""
if readme_path.exists():
    with open(readme_path) as f:
        long_description = f.read()

setup(
    name='defrag',
    version='1.0.0',
    description='Intelligent disk reorganization system for 20TB+ data with deduplication and classification',
    long_description=long_description,
    long_description_content_type='text/markdown',
    author='Project Defrag',
    author_email='defrag@example.com',
    url='https://github.com/yourusername/defrag',
    packages=find_packages(),
    install_requires=requirements,
    python_requires='>=3.9',
    entry_points={
        'console_scripts': [
            'defrag=main:main',
        ],
    },
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: System Administrators',
        'Topic :: System :: Filesystems',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Programming Language :: Python :: 3.11',
        'Programming Language :: Python :: 3.12',
    ],
    keywords='disk management storage deduplication classification migration',
)
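
files.file_hash expects a 64-character SHA-256 hex digest, and the app service exposes CHUNK_SIZE_KB for streaming reads. A small helper tying the two together; illustrative only, not part of this commit:

import hashlib
import os

def sha256_of(path: str) -> str:
    """Chunked SHA-256, sized by the CHUNK_SIZE_KB setting from docker-compose.yml."""
    chunk = int(os.getenv("CHUNK_SIZE_KB", "64")) * 1024
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(chunk), b""):
            h.update(block)
    return h.hexdigest()  # 64 hex chars, matching files.file_hash VARCHAR(64)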