This commit is contained in:
mike
2025-12-12 19:25:16 +01:00
parent 5e0db89d45
commit 56b2db82fc
34 changed files with 117 additions and 6556 deletions

View File

@@ -35,4 +35,4 @@ HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
CMD python -c "import psycopg2; psycopg2.connect(dbname='${POSTGRES_DB:-disk_reorganizer_db}', user='${POSTGRES_USER:-disk_reorg_user}', password='${POSTGRES_PASSWORD}', host='${DB_HOST:-db}', port='${DB_PORT:-5432}')" || exit 1
# Default command (can be overridden in docker-compose)
CMD ["python", "main.py", "--help"]
CMD ["python", "app/main.py", "--help"]

View File

@@ -70,13 +70,13 @@ class ClassificationEngine:
if disk:
cursor.execute("""
SELECT path, checksum
FROM files
FROM files_bak
WHERE disk = %s AND category IS NULL
""", (disk,))
else:
cursor.execute("""
SELECT path, checksum
FROM files
FROM files_bak
WHERE category IS NULL
""")
@@ -149,7 +149,7 @@ class ClassificationEngine:
from psycopg2.extras import execute_batch
query = """
UPDATE files
UPDATE files_bak
SET category = %s
WHERE path = %s
"""
@@ -188,7 +188,7 @@ class ClassificationEngine:
category,
COUNT(*) as file_count,
SUM(size) as total_size
FROM files
FROM files_bak
WHERE category IS NOT NULL
GROUP BY category
ORDER BY total_size DESC
@@ -214,7 +214,7 @@ class ClassificationEngine:
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM files WHERE category IS NULL")
cursor.execute("SELECT COUNT(*) FROM files_bak WHERE category IS NULL")
count = cursor.fetchone()[0]
cursor.close()
@@ -241,7 +241,7 @@ class ClassificationEngine:
cursor = conn.cursor()
cursor.execute("""
UPDATE files
UPDATE files_bak
SET category = %s
WHERE category = %s
""", (new_category, old_category))
@@ -278,7 +278,7 @@ class ClassificationEngine:
# Get categorized files
cursor.execute("""
SELECT path, category
FROM files
FROM files_bak
WHERE category IS NOT NULL
""")
@@ -326,7 +326,7 @@ class ClassificationEngine:
cursor.execute("""
SELECT DISTINCT category
FROM files
FROM files_bak
WHERE category IS NOT NULL
ORDER BY category
""")

View File

@@ -241,7 +241,7 @@ def train_from_database(
cursor = db_connection.cursor()
cursor.execute("""
SELECT path, category
FROM files
FROM files_bak
WHERE category IS NOT NULL
""")

View File

@@ -70,14 +70,14 @@ class DeduplicationEngine:
if disk:
cursor.execute("""
SELECT path, size
FROM files
FROM files_bak
WHERE disk = %s AND checksum IS NULL
ORDER BY size DESC
""", (disk,))
else:
cursor.execute("""
SELECT path, size
FROM files
FROM files_bak
WHERE checksum IS NULL
ORDER BY size DESC
""")
@@ -108,7 +108,7 @@ class DeduplicationEngine:
if checksum:
# Update database
cursor.execute("""
UPDATE files
UPDATE files_bak
SET checksum = %s, duplicate_of = %s
WHERE path = %s
""", (checksum, duplicate_of, str(path)))
@@ -225,7 +225,7 @@ class DeduplicationEngine:
if disk:
cursor.execute("""
SELECT checksum, array_agg(path ORDER BY path) as paths
FROM files
FROM files_bak
WHERE disk = %s AND checksum IS NOT NULL
GROUP BY checksum
HAVING COUNT(*) > 1
@@ -233,7 +233,7 @@ class DeduplicationEngine:
else:
cursor.execute("""
SELECT checksum, array_agg(path ORDER BY path) as paths
FROM files
FROM files_bak
WHERE checksum IS NOT NULL
GROUP BY checksum
HAVING COUNT(*) > 1
@@ -262,18 +262,18 @@ class DeduplicationEngine:
stats = {}
# Total files
cursor.execute("SELECT COUNT(*) FROM files WHERE checksum IS NOT NULL")
cursor.execute("SELECT COUNT(*) FROM files_bak WHERE checksum IS NOT NULL")
stats['total_files'] = cursor.fetchone()[0]
# Unique files
cursor.execute("SELECT COUNT(DISTINCT checksum) FROM files WHERE checksum IS NOT NULL")
cursor.execute("SELECT COUNT(DISTINCT checksum) FROM files_bak WHERE checksum IS NOT NULL")
stats['unique_files'] = cursor.fetchone()[0]
# Duplicate files
stats['duplicate_files'] = stats['total_files'] - stats['unique_files']
# Total size
cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE checksum IS NOT NULL")
cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files_bak WHERE checksum IS NOT NULL")
stats['total_size'] = cursor.fetchone()[0]
# Unique size
@@ -281,7 +281,7 @@ class DeduplicationEngine:
SELECT COALESCE(SUM(size), 0)
FROM (
SELECT DISTINCT ON (checksum) size
FROM files
FROM files_bak
WHERE checksum IS NOT NULL
) AS unique_files
""")
@@ -321,11 +321,11 @@ class DeduplicationEngine:
cursor.execute("""
WITH canonical AS (
SELECT DISTINCT ON (checksum) path, checksum
FROM files
FROM files_bak
WHERE checksum IS NOT NULL
ORDER BY checksum, path
)
UPDATE files
UPDATE files_bak
SET duplicate_of = NULL
WHERE path IN (SELECT path FROM canonical)
""")

View File

@@ -227,7 +227,7 @@ class HashStore:
# Get all files with their hashes
cursor.execute("""
SELECT f.path, f.checksum
FROM files f
FROM files_bak f
WHERE f.checksum IS NOT NULL
""")

View File

@@ -72,17 +72,17 @@ class DiscoveryEngine:
# Create index on path
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_files_path ON files(path)
CREATE INDEX IF NOT EXISTS idx_files_path ON files_bak(path)
""")
# Create index on disk
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk)
CREATE INDEX IF NOT EXISTS idx_files_disk ON files_bak(disk)
""")
# Create index on checksum
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_files_checksum ON files(checksum)
CREATE INDEX IF NOT EXISTS idx_files_checksum ON files_bak(checksum)
""")
conn.commit()
@@ -193,7 +193,7 @@ class DiscoveryEngine:
batch: List of FileRecord objects
"""
query = """
INSERT INTO files (path, size, modified_time, created_time, disk, checksum, status, category, duplicate_of)
INSERT INTO files_bak (path, size, modified_time, created_time, disk, checksum, status, category, duplicate_of)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (path) DO UPDATE SET
size = EXCLUDED.size,
@@ -276,9 +276,9 @@ class DiscoveryEngine:
cursor = conn.cursor()
if disk:
cursor.execute("SELECT COUNT(*) FROM files WHERE disk = %s", (disk,))
cursor.execute("SELECT COUNT(*) FROM files_bak WHERE disk = %s", (disk,))
else:
cursor.execute("SELECT COUNT(*) FROM files")
cursor.execute("SELECT COUNT(*) FROM files_bak")
count = cursor.fetchone()[0]
cursor.close()
@@ -298,9 +298,9 @@ class DiscoveryEngine:
cursor = conn.cursor()
if disk:
cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE disk = %s", (disk,))
cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files_bak WHERE disk = %s", (disk,))
else:
cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files")
cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files_bak")
total = cursor.fetchone()[0]
cursor.close()

View File

@@ -127,7 +127,7 @@ class DiskReorganizer:
# PostgreSQL INSERT ... ON CONFLICT for upsert
cursor.execute("""
INSERT INTO files (path, size, modified_time, disk, checksum, status)
INSERT INTO files_bak (path, size, modified_time, disk, checksum, status)
VALUES (%s, %s, %s, %s, %s, %s)
ON CONFLICT (path) DO UPDATE SET
size = EXCLUDED.size,
@@ -175,7 +175,7 @@ class DiskReorganizer:
try:
cursor.execute("""
SELECT disk, SUM(size) as total_size, COUNT(*) as file_count
FROM files
FROM files_bak
GROUP BY disk
""")
@@ -215,7 +215,7 @@ class DiskReorganizer:
cursor = conn.cursor()
cursor.execute(
"SELECT path, size, modified_time FROM files WHERE disk = %s ORDER BY size DESC",
"SELECT path, size, modified_time FROM files_bak WHERE disk = %s ORDER BY size DESC",
(target_disk,)
)
files_to_move = cursor.fetchall()
@@ -272,7 +272,7 @@ class DiskReorganizer:
# Store in database
cursor.execute(
"INSERT INTO operations (source_path, dest_path, operation_type) VALUES (%s, %s, %s)",
"INSERT INTO operations_bak (source_path, dest_path, operation_type) VALUES (%s, %s, %s)",
(f"{target_disk}:{rel_path}", f"{dest_disk}:{rel_path}", 'move')
)
@@ -384,7 +384,7 @@ class DiskReorganizer:
if self.verify_operation(source_full, dest_full):
# Update database
cursor.execute(
"UPDATE files SET disk = %s, status = 'moved' WHERE path = %s AND disk = %s",
"UPDATE files_bak SET disk = %s, status = 'moved' WHERE path = %s AND disk = %s",
(dest_disk, source_path, source_disk)
)
@@ -393,7 +393,7 @@ class DiskReorganizer:
# Log operation as executed
cursor.execute(
"UPDATE operations SET executed = 1, executed_at = CURRENT_TIMESTAMP WHERE source_path = %s",
"UPDATE operations_bak SET executed = 1, executed_at = CURRENT_TIMESTAMP WHERE source_path = %s",
(f"{source_disk}:{source_path}",)
)
@@ -407,7 +407,7 @@ class DiskReorganizer:
except Exception as e:
logger.error(f"\n Error processing {source_path}: {e}")
cursor.execute(
"UPDATE operations SET error = %s WHERE source_path = %s",
"UPDATE operations_bak SET error = %s WHERE source_path = %s",
(str(e), f"{source_disk}:{source_path}")
)
error_count += 1
@@ -436,7 +436,7 @@ class DiskReorganizer:
try:
cursor.execute("""
SELECT status, COUNT(*), SUM(size) FROM files GROUP BY status
SELECT status, COUNT(*), SUM(size) FROM files_bak GROUP BY status
""")
print("\n=== FILE MIGRATION REPORT ===")
@@ -445,7 +445,7 @@ class DiskReorganizer:
print(f"{status:15}: {count:6} files, {self.format_size(size or 0)}")
cursor.execute("""
SELECT operation_type, executed, verified, COUNT(*) FROM operations GROUP BY operation_type, executed, verified
SELECT operation_type, executed, verified, COUNT(*) FROM operations_bak GROUP BY operation_type, executed, verified
""")
print("\n=== OPERATIONS REPORT ===")

View File

@@ -77,7 +77,7 @@ class MigrationEngine:
# Create index on status
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_operations_status
ON operations(status)
ON operations_bak(status)
""")
conn.commit()
@@ -116,7 +116,7 @@ class MigrationEngine:
query = f"""
SELECT path, size, category, duplicate_of
FROM files
FROM files_bak
WHERE {' AND '.join(conditions)}
ORDER BY category, path
"""
@@ -293,7 +293,7 @@ class MigrationEngine:
cursor = conn.cursor()
cursor.execute("""
INSERT INTO operations (
INSERT INTO operations_bak (
source_path, dest_path, operation_type, size,
status, error, executed_at, verified
)
@@ -333,7 +333,7 @@ class MigrationEngine:
cursor = conn.cursor()
cursor.execute("""
UPDATE operations
UPDATE operations_bak
SET status = 'rolled_back'
WHERE source_path = %s AND dest_path = %s
""", (str(operation.source_path), str(operation.dest_path)))
@@ -359,13 +359,13 @@ class MigrationEngine:
stats = {}
# Total operations
cursor.execute("SELECT COUNT(*) FROM operations")
cursor.execute("SELECT COUNT(*) FROM operations_bak")
stats['total_operations'] = cursor.fetchone()[0]
# Operations by status
cursor.execute("""
SELECT status, COUNT(*)
FROM operations
FROM operations_bak
GROUP BY status
""")
@@ -375,7 +375,7 @@ class MigrationEngine:
# Total size migrated
cursor.execute("""
SELECT COALESCE(SUM(size), 0)
FROM operations
FROM operations_bak
WHERE status = 'completed'
""")
stats['total_size_migrated'] = cursor.fetchone()[0]
@@ -397,7 +397,7 @@ class MigrationEngine:
cursor.execute("""
SELECT source_path, dest_path, operation_type
FROM operations
FROM operations_bak
WHERE status = 'completed' AND verified = FALSE
""")

View File

@@ -86,7 +86,7 @@ services:
- full-cycle
- development
# Uncomment for development with hot reload
# command: watchmedo auto-restart --pattern="*.py" --recursive -- python main.py
# command: watchmedo auto-restart --pattern="*.py" --recursive -- python app/main.py
# Single command services for specific operations
index:
@@ -105,7 +105,7 @@ services:
- ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source:ro
- ./config:/app/config
- ./logs:/app/logs
command: ["python", "main.py", "index", "/mnt/source", "disk_d"]
command: ["python", "app/main.py", "index", "/mnt/source", "disk_d"]
profiles:
- index-only
networks:
@@ -127,7 +127,7 @@ services:
- ./config:/app/config
- ./plans:/app/plans
- ./logs:/app/logs
command: ["python", "main.py", "plan", "disk_d", "disk_e"]
command: ["python", "app/main.py", "plan", "disk_d", "disk_e"]
profiles:
- plan-only
networks:
@@ -151,7 +151,7 @@ services:
- ./plans:/app/plans
- ./config:/app/config
- ./logs:/app/logs
command: ["python", "main.py", "execute", "/app/plans/plan.json"]
command: ["python", "app/main.py", "execute", "/app/plans/plan.json"]
profiles:
- execute-only
networks:
@@ -173,7 +173,7 @@ services:
- ./plans:/app/plans
- ./config:/app/config
- ./logs:/app/logs
command: ["python", "main.py", "execute", "/app/plans/plan.json", "--dry-run"]
command: ["python", "app/main.py", "execute", "/app/plans/plan.json", "--dry-run"]
profiles:
- dry-run-only
networks:
@@ -194,7 +194,7 @@ services:
volumes:
- ./reports:/app/reports
- ./logs:/app/logs
command: ["python", "main.py", "report", "--format", "html"]
command: ["python", "app/main.py", "report", "--format", "html"]
profiles:
- report-only
networks:

6492
output.md

File diff suppressed because it is too large Load Diff

51
setup.sh Normal file
View File

@@ -0,0 +1,51 @@
#!/bin/bash
# setup.sh - Complete Docker setup for Project Defrag
#
# What this script does, in order:
#   1. Creates the local working directories (config, plans, logs, reports,
#      sql/migrations) if they do not exist yet.
#   2. Seeds .env from .env.example on first run (an existing .env is never
#      overwritten).
#   3. Builds the "app" image with Docker Compose.
#   4-6. (disabled) Database start / wait / initialization are commented out;
#      this script does not start or initialize the database.
#   7. Starts the services in the "monitoring" Compose profile.
#
# Run it from the directory that contains .env.example and the Compose file;
# all paths below are relative to the current working directory.

# Abort on any failing command, on use of an unset variable, and on a failure
# anywhere in a pipeline.
set -euo pipefail

echo "🚀 Setting up Project Defrag with Docker..."

# 1. Create necessary directories
echo "📁 Creating directories..."
mkdir -p {config,plans,logs,reports,sql/migrations}

# 2. Copy environment file
if [ ! -f .env ]; then
    # Fail with an actionable message instead of a bare `cp` error when the
    # template is missing (typically: script started from the wrong directory).
    if [ ! -f .env.example ]; then
        echo "❌ .env.example not found in $(pwd); cannot create .env." >&2
        exit 1
    fi
    echo "⚙️ Creating .env file from template..."
    cp .env.example .env
    echo "⚠️ Please edit .env file with your configuration!"
fi

# 3. Build the Docker image
echo "🐳 Building Docker image..."
docker compose build app

# 4. Start the database
#echo "🗄️ Starting PostgreSQL database..."
#docker-compose up -d postgres

# 5. Wait for database to be ready
#echo "⏳ Waiting for database to be ready..."
#sleep 10

# 6. Run database initialization
#echo "📊 Initializing database..."
#docker-compose exec -T postgres psql -U disk_reorg_user -d disk_reorganizer_db -f /docker-entrypoint-initdb.d/init.sql

# 7. Start optional services
echo "🔧 Starting monitoring services..."
docker compose --profile monitoring up -d

echo "✅ Setup complete!"
echo ""
echo "📋 Available commands:"
# NOTE: --profile is an option of `docker compose` itself, not of the `up`
# subcommand, so it has to appear before `up` in every example below.
echo " docker compose up -d # Start all services"
echo " docker compose --profile index-only up index # Run index only"
echo " docker compose --profile plan-only up plan # Generate plan"
echo " docker compose --profile dry-run-only up dry-run # Dry run"
echo " docker compose --profile execute-only up execute # Execute migration"
echo " docker compose --profile report-only up report # Generate report"
echo ""
echo "🌐 Access monitoring:"
echo " - PostgreSQL Admin: http://localhost:5050"
echo " - Redis Commander: http://localhost:8081"

View File

@@ -37,7 +37,7 @@ CREATE TABLE IF NOT EXISTS operations (
status VARCHAR(20) NOT NULL,
-- File reference
file_id UUID REFERENCES files(id) ON DELETE SET NULL,
file_id UUID REFERENCES files_bak(id) ON DELETE SET NULL,
-- Performance metrics
duration_ms INTEGER,
@@ -89,14 +89,14 @@ CREATE TABLE IF NOT EXISTS migration_plans (
);
-- Indexes for performance
CREATE INDEX IF NOT EXISTS idx_files_path ON files(path);
CREATE INDEX IF NOT EXISTS idx_files_hash ON files(file_hash);
CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label);
CREATE INDEX IF NOT EXISTS idx_files_category ON files(category);
CREATE INDEX IF NOT EXISTS idx_files_path ON files_bak(path);
CREATE INDEX IF NOT EXISTS idx_files_hash ON files_bak(file_hash);
CREATE INDEX IF NOT EXISTS idx_files_disk ON files_bak(disk_label);
CREATE INDEX IF NOT EXISTS idx_files_category ON files_bak(category);
CREATE INDEX IF NOT EXISTS idx_operations_status ON operations(status);
CREATE INDEX IF NOT EXISTS idx_operations_created ON operations(created_at);
CREATE INDEX IF NOT EXISTS idx_operations_file_id ON operations(file_id);
CREATE INDEX IF NOT EXISTS idx_operations_status ON operations_bak(status);
CREATE INDEX IF NOT EXISTS idx_operations_created ON operations_bak(created_at);
CREATE INDEX IF NOT EXISTS idx_operations_file_id ON operations_bak(file_id);
CREATE INDEX IF NOT EXISTS idx_dedup_canonical ON deduplication_store(canonical_path);
@@ -110,7 +110,7 @@ END;
$$ language 'plpgsql';
-- Triggers for automatic updated_at
CREATE TRIGGER update_files_updated_at BEFORE UPDATE ON files
CREATE TRIGGER update_files_updated_at BEFORE UPDATE ON files_bak
FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
-- View for operational dashboard
@@ -122,7 +122,7 @@ SELECT
AVG(o.duration_ms) as avg_duration_ms,
MIN(o.started_at) as earliest_operation,
MAX(o.completed_at) as latest_operation
FROM operations o
FROM operations_bak o
WHERE o.started_at > CURRENT_TIMESTAMP - INTERVAL '24 hours'
GROUP BY o.status;
@@ -135,7 +135,7 @@ SELECT
AVG(size) as avg_file_size,
MIN(created_time) as oldest_file,
MAX(modified_time) as newest_file
FROM files
FROM files_bak
GROUP BY disk_label;
-- Insert default configuration

View File

@@ -1,6 +1,8 @@
-- PostgreSQL Database Setup Script for Disk Reorganizer
-- Database: disk_reorganizer_db
-- User: disk_reorg_user
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS "pgcrypto";
-- Create the database (run as superuser: auction)
CREATE DATABASE disk_reorganizer_db