base

2025-12-12 03:28:42 +01:00
parent e2772c19fd
commit bad0d82447
38 changed files with 555 additions and 999 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,13 +1,7 @@
-### PythonVanilla template
-# Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
-
-# C extensions
 *.so
-
-# Distribution / packaging
 .Python
 build/
 develop-eggs/
@@ -21,48 +15,28 @@ parts/
 sdist/
 var/
 wheels/
-share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
-MANIFEST
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+*.sqlite3
+*.db
+*.log
+coverage.xml
+*.coverage
 .coverage
 .coverage.*
 .cache
 nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
+pytest.xml
+htmlcov/
+.tox/
 .pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow
-__pypackages__/
-
-
+.mypy_cache/
+.pyre/
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@@ -0,0 +1,10 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Ignored default folder with query files
+/queries/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
--- a/.idea/copilot.data.migration.agent.xml
+++ b/.idea/copilot.data.migration.agent.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="AgentMigrationStateService">
+    <option name="migrationStatus" value="COMPLETED" />
+  </component>
+</project>
--- a/.idea/defrag.iml
+++ b/.idea/defrag.iml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
+    </content>
+    <orderEntry type="jdk" jdkName="Python 3.13 (.venv)" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyDocumentationSettings">
+    <option name="format" value="PLAIN" />
+    <option name="myDocStringFormat" value="Plain" />
+  </component>
+</module>
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
--- a/.idea/material_theme_project_new.xml
+++ b/.idea/material_theme_project_new.xml
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="MaterialThemeProjectNewConfig">
+    <option name="metadata">
+      <MTProjectMetadataState>
+        <option name="migrated" value="true" />
+        <option name="pristineConfig" value="false" />
+        <option name="userId" value="-d8205b3:197aab68e6f:-7ffe" />
+      </MTProjectMetadataState>
+    </option>
+    <option name="titleBarState">
+      <MTProjectTitleBarConfigState>
+        <option name="overrideColor" value="false" />
+      </MTProjectTitleBarConfigState>
+    </option>
+  </component>
+</project>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Black">
+    <option name="sdkName" value="Python 3.13 (.venv)" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.13 (.venv)" project-jdk-type="Python SDK" />
+</project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/defrag.iml" filepath="$PROJECT_DIR$/.idea/defrag.iml" />
+    </modules>
+  </component>
+</project>
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -0,0 +1,340 @@
+# Data Reorganization Architecture: "Project Defrag"
+
+## Executive Summary
+
+This document outlines the architecture for reorganizing 20TB of backup data across multiple NVMe drives and servers. The solution implements intelligent deduplication, systematic categorization, and optimized storage patterns for enhanced performance and maintainability.
+
+## System Architecture Overview
+
+```mermaid
+graph TB
+    subgraph "Source Environment"
+        A["Local Machine<br/>8x NVMe + 1 HDD<br/>~10TB"]
+        B["Server Machine<br/>Mixed Storage<br/>~10TB"]
+    end
+    
+    subgraph "Processing Layer"
+        C["Discovery Engine"]
+        D["Classification Engine"]
+        E["Deduplication Engine"]
+        F["Migration Engine"]
+    end
+    
+    subgraph "Target Architecture"
+        G["App Volumes"]
+        H["Gitea Repository"]
+        I["Build Cache (.maven, pycache)"]
+        J["Artifactories"]
+        K["Databases"]
+        L["Backups"]
+        M["LLM Model Cache"]
+        N["Git Infrastructure"]
+    end
+    
+    A --> C
+    B --> C
+    C --> D
+    D --> E
+    E --> F
+    F --> G
+    F --> H
+    F --> I
+    F --> J
+    F --> K
+    F --> L
+    F --> M
+    F --> N
+```
+
+## Data Flow Architecture
+
+### Phase 1: Discovery & Assessment
+```mermaid
+sequenceDiagram
+    participant D as Discovery Engine
+    participant FS as File System Scanner
+    participant DB as Metadata Database
+    participant API as System APIs
+    
+    D->>FS: Scan directory structures
+    FS->>FS: Identify file types, sizes, dates
+    FS->>DB: Store file metadata
+    D->>API: Query system information
+    API->>DB: Store system context
+    DB->>D: Return analysis summary
+```
+
+### Phase 2: Classification & Deduplication
+```mermaid
+sequenceDiagram
+    participant C as Classifier
+    participant DH as Deduplication Hash
+    participant CDB as Canonical DB
+    participant MAP as Mapping Store
+    
+    C->>C: Analyze file signatures
+    C->>DH: Generate content hashes
+    DH->>CDB: Check for duplicates
+    CDB->>DH: Return canonical reference
+    DH->>MAP: Store deduplication map
+    C->>C: Apply categorization rules
+```
+
+## Target Directory Structure
+
+```
+/mnt/organized/
+├── apps/
+│   ├── volumes/
+│   │   ├── docker-volumes/
+│   │   ├── app-data/
+│   │   └── user-profiles/
+│   └── runtime/
+├── development/
+│   ├── gitea/
+│   │   ├── repositories/
+│   │   ├── lfs-objects/
+│   │   └── avatars/
+│   ├── git-infrastructure/
+│   │   ├── hooks/
+│   │   ├── templates/
+│   │   └── config/
+│   └── build-tools/
+│       ├── .maven/repository/
+│       ├── gradle-cache/
+│       └── sbt-cache/
+├── artifacts/
+│   ├── java/
+│   │   ├── maven-central-cache/
+│   │   ├── jfrog-artifactory/
+│   │   └── gradle-build-cache/
+│   ├── python/
+│   │   ├── pypi-cache/
+│   │   ├── wheelhouse/
+│   │   └── pip-cache/
+│   ├── node/
+│   │   ├── npm-registry/
+│   │   ├── yarn-cache/
+│   │   └── pnpm-store/
+│   └── go/
+│       ├── goproxy-cache/
+│       ├── module-cache/
+│       └── sumdb-cache/
+├── cache/
+│   ├── llm-models/
+│   │   ├── hugging-face/
+│   │   ├── openai-cache/
+│   │   └── local-llm/
+│   ├── pycache/
+│   ├── node_modules-archive/
+│   └── browser-cache/
+├── databases/
+│   ├── postgresql/
+│   ├── mysql/
+│   ├── mongodb/
+│   └── redis/
+├── backups/
+│   ├── system/
+│   ├── application/
+│   ├── database/
+│   └── archive/
+└── temp/
+    ├── processing/
+    ├── staging/
+    └── cleanup/
+```
+
+## Technology Stack Recommendation
+
+### Primary Language: **Python 3.11+**
+**Rationale:**
+- Excellent file system handling capabilities
+- Rich ecosystem for data processing (pandas, pyarrow)
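+- Built-in concurrency primitives: `multiprocessing` for CPU-bound hashing, threads/`asyncio` for I/O-bound scanning
+- Standard-library `hashlib` for the content hashing that deduplication relies on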
+- Cross-platform compatibility
+
+### Key Libraries:
+```python
+# Core processing
+import asyncio
+import hashlib
+import multiprocessing as mp
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
+
+# Data handling
+import pandas as pd
+import pyarrow as pa
+import sqlite3
+import json
+
+# File analysis
+import magic  # python-magic
+import mimetypes
+import filetype
+
+# System integration
+import psutil
+import shutil
+import os
+```
+
+## Deduplication Strategy
+
+### Algorithm Selection: **Variable-Size Chunking with Rabin Fingerprinting**
+
+```python
+class AdvancedDeduplication:  # illustrative sketch: RabinChunker, HashStore and the helper methods called below are not defined in this snippet
+    def __init__(self, avg_chunk_size=8192):  # avg_chunk_size is passed straight to RabinChunker; unit presumably bytes — confirm
+        self.chunker = RabinChunker(avg_chunk_size)
+        self.hash_store = HashStore()
+        
+    def deduplicate_file(self, file_path):
+        chunks = self.chunker.chunk_file(file_path)
+        file_hash = self.compute_file_hash(chunks)  # a single hash for the whole file, derived from its chunks
+        # Lookup is keyed on the whole-file hash, so duplicates are detected per file in this sketch.
+        if self.hash_store.exists(file_hash):
+            return self.create_reference(file_hash)  # hash already known: return whatever create_reference yields
+        else:
+            self.store_canonical(file_path, file_hash)  # hash not yet known: record this file as the canonical copy
+            return file_hash
+```
+
+### Performance Optimization:
+- **Parallel Processing**: Utilize all CPU cores for hashing
+- **Memory Mapping**: For large files (>100MB)
+- **Incremental Hashing**: Process files in streams
+- **Cache Layer**: Redis for frequently accessed hashes
+
+## Classification Engine
+
+### Rule-Based Classification System:
+
+```yaml
+classification_rules:
+  build_artifacts:
+    patterns:
+      - "**/target/**"
+      - "**/build/**"
+      - "**/dist/**"
+      - "**/node_modules/**"
+    action: categorize_as_build_cache
+  
+  development_tools:
+    patterns:
+      - "**/.maven/**"
+      - "**/.gradle/**"
+      - "**/.npm/**"
+      - "**/.cache/**"
+    action: categorize_as_tool_cache
+  
+  repositories:
+    patterns:
+      - "**/.git/**"
+      - "**/repositories/**"
+      - "**/gitea/**"
+    action: categorize_as_vcs
+  
+  database_files:
+    patterns:
+      - "**/*.db"
+      - "**/*.sqlite"
+      - "**/postgresql/**"
+      - "**/mysql/**"
+    action: categorize_as_database
+  
+  model_files:
+    patterns:
+      - "**/*.bin"
+      - "**/*.onnx"
+      - "**/models/**"
+      - "**/llm*/**"
+    action: categorize_as_ai_model
+```
+
+## Performance Considerations
+
+### NVMe Optimization Strategies:
+
+1. **Parallel I/O Operations**
+    - Queue depth optimization (32-64 operations)
+    - Async I/O with io_uring where available
+    - Multi-threaded directory traversal
+
+2. **Memory Management**
+    - Streaming processing for large files
+    - Memory-mapped file access
+    - Buffer pool for frequent operations
+
+3. **CPU Optimization**
+    - SIMD instructions for hashing (AVX2/NEON)
+    - Process pool for parallel processing
+    - NUMA-aware memory allocation
+
+## Migration Strategy
+
+### Three-Phase Approach:
+
+```mermaid
+graph LR
+    A[Phase 1: Analysis] --> B[Phase 2: Staging]
+    B --> C[Phase 3: Migration]
+    
+    A --> A1[Discovery Scan]
+    A --> A2[Deduplication Analysis]
+    A --> A3[Space Calculation]
+    
+    B --> B1[Create Target Structure]
+    B --> B2[Hard Link Staging]
+    B --> B3[Validation Check]
+    
+    C --> C1[Atomic Move Operations]
+    C --> C2[Symlink Updates]
+    C --> C3[Cleanup Verification]
+```
+
+## Monitoring & Validation
+
+### Key Metrics:
+- **Processing Rate**: Files/second, GB/hour
+- **Deduplication Ratio**: Original vs. Final size
+- **Error Rate**: Failed operations percentage
+- **Resource Usage**: CPU, Memory, I/O utilization
+
+### Validation Checks:
+- File integrity verification (hash comparison)
+- Directory structure validation
+- Symlink resolution testing
+- Permission preservation audit
+
+## Risk Mitigation
+
+### Safety Measures:
+1. **Read-First Approach**: Never modify source data until validation has passed
+2. **Incremental Processing**: Process in small batches
+3. **Backup Verification**: Ensure backup integrity before operations
+4. **Rollback Capability**: Maintain reverse mapping for recovery
+5. **Dry-Run Mode**: Preview all operations before execution
+
+## Implementation Timeline
+
+### Phase 1: Tool Development (2-3 weeks)
+- Core discovery engine
+- Classification system
+- Basic deduplication
+- Testing framework
+
+### Phase 2: Staging & Validation (1-2 weeks)
+- Target structure creation
+- Sample data processing
+- Performance optimization
+- Safety verification
+
+### Phase 3: Production Migration (2-4 weeks)
+- Full data processing
+- Continuous monitoring
+- Issue resolution
+- Final validation
+
+This architecture provides a robust, scalable solution for your data reorganization needs while maintaining data integrity and optimizing for your NVMe storage infrastructure.
--- a/README.md
+++ b/README.md
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,17 @@ psycopg2-binary>=2.9.9

 # Alternative: psycopg2>=2.9.9 (requires PostgreSQL development libraries)
 # Use psycopg2-binary for easier installation without compilation
+# Core dependencies
+# Requires Python >= 3.9 (interpreter constraint, not a pip-installable package; enforce via python_requires)
+
+# Optional/feature dependencies
+redis>=4.5.0  # For RedisHashStore
+scikit-learn>=1.0.0  # For MLClassifier
+numpy>=1.21.0  # For MLClassifier
+
+# Development dependencies
+pytest>=7.0.0
+pytest-cov>=4.0.0
+black>=22.0.0
+mypy>=0.950
+flake8>=5.0.0
--- a/src/classification/init.py
+++ b/src/classification/init.py
--- a/src/classification/_protocols.py
+++ b/src/classification/_protocols.py
--- a/src/classification/engine.py
+++ b/src/classification/engine.py
--- a/src/classification/ml.py
+++ b/src/classification/ml.py
--- a/src/classification/rules.py
+++ b/src/classification/rules.py
--- a/src/deduplication/init.py
+++ b/src/deduplication/init.py
--- a/src/deduplication/_protocols.py
+++ b/src/deduplication/_protocols.py
--- a/src/deduplication/chunker.py
+++ b/src/deduplication/chunker.py
--- a/src/deduplication/engine.py
+++ b/src/deduplication/engine.py
--- a/src/deduplication/store.py
+++ b/src/deduplication/store.py
--- a/src/discovery/init.py
+++ b/src/discovery/init.py
--- a/src/discovery/_protocols.py
+++ b/src/discovery/_protocols.py
@@ -0,0 +1,54 @@
+"""Protocol definitions for the discovery package"""
+from typing import Iterator, Protocol, Any
+from pathlib import Path
+from dataclasses import dataclass
+
+
+@dataclass
+class FileMeta:
+    """Metadata for a discovered file"""
+    path: Path  # location of the file; IFileScanner.scan yields one FileMeta per file
+    size: int  # presumably bytes — TODO confirm against the scanner implementation
+    modified_time: float  # presumably an epoch timestamp (st_mtime-style) — TODO confirm
+    created_time: float  # presumably an epoch timestamp; its source is not shown here — TODO confirm
+    # Add other metadata fields as needed
+
+
+@dataclass
+class MountInfo:
+    """Information about a mounted filesystem"""
+    device: str  # device identifier as a plain string
+    mount_point: str  # kept as str rather than Path, unlike FileMeta.path
+    fs_type: str  # filesystem type name as a plain string
+    options: str  # one string, not a list; separator/format is not specified here — confirm with the ISystemAPI implementation
+    # Add other mount info fields as needed
+
+
+@dataclass
+class DiskInfo:
+    """Information about a disk/NVMe device"""
+    device: str  # device identifier as a plain string
+    model: str  # model name as a plain string
+    size: int  # presumably bytes — TODO confirm against the ISystemAPI implementation
+    serial: str  # serial number as a plain string
+    # Add other disk info fields as needed
+
+
+class IFileScanner(Protocol):  # structural interface: implementers only need a matching scan() method
+    """Protocol for file scanning operations"""
+    # NOTE: not decorated with @runtime_checkable, so isinstance() checks against this protocol are unsupported.
+    def scan(self, root: Path) -> Iterator[FileMeta]:
+        """Scan a directory tree and yield file metadata"""
+        ...  # stub body; concrete scanners supply the implementation
+
+
+class ISystemAPI(Protocol):  # structural interface: implementers only need matching query_* methods
+    """Protocol for system information queries"""
+    # NOTE: not decorated with @runtime_checkable, so isinstance() checks against this protocol are unsupported.
+    def query_mounts(self) -> list[MountInfo]:
+        """Query mounted filesystems"""
+        ...  # stub body; returns a fully built list rather than an iterator
+    # Despite the "nvmes" name, the docstring below covers disks generally and the element type is DiskInfo.
+    def query_nvmes(self) -> list[DiskInfo]:
+        """Query NVMe/disk information"""
+        ...  # stub body; concrete system APIs supply the implementation
--- a/src/discovery/engine.py
+++ b/src/discovery/engine.py
--- a/src/discovery/scanner.py
+++ b/src/discovery/scanner.py
--- a/src/discovery/system.py
+++ b/src/discovery/system.py
--- a/src/migration/init.py
+++ b/src/migration/init.py
--- a/src/migration/_protocols.py
+++ b/src/migration/_protocols.py
--- a/src/migration/copy.py
+++ b/src/migration/copy.py
--- a/src/migration/engine.py
+++ b/src/migration/engine.py
--- a/src/migration/hardlink.py
+++ b/src/migration/hardlink.py
--- a/src/shared/init.py
+++ b/src/shared/init.py
--- a/src/shared/_protocols.py
+++ b/src/shared/_protocols.py
--- a/src/shared/config.py
+++ b/src/shared/config.py
--- a/src/shared/logger.py
+++ b/src/shared/logger.py
--- a/src/shared/models.py
+++ b/src/shared/models.py
--- a/src/tests/init.py
+++ b/src/tests/init.py