initial

18 .aiignore Normal file
@@ -0,0 +1,18 @@
.DS_Store
*.log
*.tmp
dist/
build/
out/
.idea
node_modules/
.vscode/
.git
.github
scripts
.pytest_cache/
__pycache__
.aiignore
*.iml
.env
.bundle.md

44 .gitignore vendored Normal file
@@ -0,0 +1,44 @@
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
*.sqlite3
*.db
*.log
coverage.xml
*.coverage
.coverage
.coverage.*
.cache
nosetests.xml
pytest.xml
htmlcov/
.tox/
.pytest_cache/
.mypy_cache/
.pyre/
.idea
*.iml
backup_*.dump

340 ARCHITECTURE.md Normal file
@@ -0,0 +1,340 @@
# Data Reorganization Architecture: "Project Defrag"

## Executive Summary

This document outlines the architecture for reorganizing 20TB of backup data across multiple NVMe drives and servers. The solution implements intelligent deduplication, systematic categorization, and optimized storage patterns for enhanced performance and maintainability.

## System Architecture Overview

```mermaid
graph TB
    subgraph "Source Environment"
        A["Local Machine<br/>8x NVMe + 1 HDD<br/>~10TB"]
        B["Server Machine<br/>Mixed Storage<br/>~10TB"]
    end

    subgraph "Processing Layer"
        C["Discovery Engine"]
        D["Classification Engine"]
        E["Deduplication Engine"]
        F["Migration Engine"]
    end

    subgraph "Target Architecture"
        G["App Volumes"]
        H["Gitea Repository"]
        I["Build Cache (.maven, pycache)"]
        J["Artifactories"]
        K["Databases"]
        L["Backups"]
        M["LLM Model Cache"]
        N["Git Infrastructure"]
    end

    A --> C
    B --> C
    C --> D
    D --> E
    E --> F
    F --> G
    F --> H
    F --> I
    F --> J
    F --> K
    F --> L
    F --> M
    F --> N
```

## Data Flow Architecture

### Phase 1: Discovery & Assessment

```mermaid
sequenceDiagram
    participant D as Discovery Engine
    participant FS as File System Scanner
    participant DB as Metadata Database
    participant API as System APIs

    D->>FS: Scan directory structures
    FS->>FS: Identify file types, sizes, dates
    FS->>DB: Store file metadata
    D->>API: Query system information
    API->>DB: Store system context
    DB->>D: Return analysis summary
```

### Phase 2: Classification & Deduplication

```mermaid
sequenceDiagram
    participant C as Classifier
    participant DH as Deduplication Hash
    participant CDB as Canonical DB
    participant MAP as Mapping Store

    C->>C: Analyze file signatures
    C->>DH: Generate content hashes
    DH->>CDB: Check for duplicates
    CDB->>DH: Return canonical reference
    DH->>MAP: Store deduplication map
    C->>C: Apply categorization rules
```

## Target Directory Structure

```
/mnt/organized/
├── apps/
│   ├── volumes/
│   │   ├── docker-volumes/
│   │   ├── app-data/
│   │   └── user-profiles/
│   └── runtime/
├── development/
│   ├── gitea/
│   │   ├── repositories/
│   │   ├── lfs-objects/
│   │   └── avatars/
│   ├── git-infrastructure/
│   │   ├── hooks/
│   │   ├── templates/
│   │   └── config/
│   └── build-tools/
│       ├── .maven/repository/
│       ├── gradle-cache/
│       └── sbt-cache/
├── artifacts/
│   ├── java/
│   │   ├── maven-central-cache/
│   │   ├── jfrog-artifactory/
│   │   └── gradle-build-cache/
│   ├── python/
│   │   ├── pypi-cache/
│   │   ├── wheelhouse/
│   │   └── pip-cache/
│   ├── node/
│   │   ├── npm-registry/
│   │   ├── yarn-cache/
│   │   └── pnpm-store/
│   └── go/
│       ├── goproxy-cache/
│       ├── module-cache/
│       └── sumdb-cache/
├── cache/
│   ├── llm-models/
│   │   ├── hugging-face/
│   │   ├── openai-cache/
│   │   └── local-llm/
│   ├── pycache/
│   ├── node_modules-archive/
│   └── browser-cache/
├── databases/
│   ├── postgresql/
│   ├── mysql/
│   ├── mongodb/
│   └── redis/
├── backups/
│   ├── system/
│   ├── application/
│   ├── database/
│   └── archive/
└── temp/
    ├── processing/
    ├── staging/
    └── cleanup/
```

## Technology Stack Recommendation

### Primary Language: **Python 3.11+**

**Rationale:**
- Excellent file system handling capabilities
- Rich ecosystem for data processing (pandas, pyarrow)
- Built-in multiprocessing for I/O operations
- Superior hash library support for deduplication
- Cross-platform compatibility

### Key Libraries:

```python
# Core processing
import asyncio
import hashlib
import multiprocessing as mp
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

# Data handling
import pandas as pd
import pyarrow as pa
import sqlite3
import json

# File analysis
import magic  # python-magic
import mimetypes
import filetype

# System integration
import psutil
import shutil
import os
```

## Deduplication Strategy

### Algorithm Selection: **Variable-Size Chunking with Rabin Fingerprinting**

```python
class AdvancedDeduplication:
    def __init__(self, avg_chunk_size=8192):
        # Content-defined chunker plus a hash index; the concrete implementations
        # live in app/deduplication/chunker.py and app/deduplication/store.py.
        self.chunker = RabinChunker(avg_chunk_size)
        self.hash_store = HashStore()

    def deduplicate_file(self, file_path):
        # Chunk the file, derive its content hash, then either point at an existing
        # canonical copy or register this file as the new canonical one.
        chunks = self.chunker.chunk_file(file_path)
        file_hash = self.compute_file_hash(chunks)

        if self.hash_store.exists(file_hash):
            return self.create_reference(file_hash)
        else:
            self.store_canonical(file_path, file_hash)
            return file_hash
```
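
For orientation, `app/deduplication/chunker.py` (added in this commit) already exposes `compute_file_signature`, which performs this chunk-and-hash flow end to end. A minimal usage sketch; the path is illustrative:

```python
from pathlib import Path

from app.deduplication.chunker import compute_file_signature

# Whole-file SHA-256 plus one hash per content-defined chunk.
file_hash, chunk_hashes = compute_file_signature(
    Path("/mnt/source/backups/archive-2021.tar"),  # illustrative path
    use_rabin=True,
    avg_chunk_size=8192,
)
print(f"{file_hash[:12]}... split into {len(chunk_hashes)} chunks")
```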

### Performance Optimization:
- **Parallel Processing**: Utilize all CPU cores for hashing
- **Memory Mapping**: For large files (>100MB)
- **Incremental Hashing**: Process files in streams (see the sketch below)
- **Cache Layer**: Redis for frequently accessed hashes
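
A minimal sketch of the incremental-hashing idea, combining streamed reads for small files, memory-mapped access for large ones, and a process pool across CPU cores. The threshold, pool size, and function names are assumptions, and the Redis cache layer is omitted here:

```python
import hashlib
import mmap
import os
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

MMAP_THRESHOLD = 100 * 1024 * 1024  # assumed cut-over point for memory mapping


def hash_incremental(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream or memory-map a file into a SHA-256 digest without loading it whole."""
    hasher = hashlib.sha256()
    size = os.path.getsize(path)
    with open(path, "rb") as f:
        if size >= MMAP_THRESHOLD:
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
                for offset in range(0, size, chunk_size):
                    hasher.update(mm[offset:offset + chunk_size])
        else:
            for block in iter(lambda: f.read(chunk_size), b""):
                hasher.update(block)
    return hasher.hexdigest()


def hash_tree(root: str, workers: int = os.cpu_count() or 4) -> dict[str, str]:
    """Hash every file under root in parallel, one worker process per core."""
    files = [str(p) for p in Path(root).rglob("*") if p.is_file()]
    with ProcessPoolExecutor(max_workers=workers) as pool:
        return dict(zip(files, pool.map(hash_incremental, files)))
```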

## Classification Engine

### Rule-Based Classification System:

```yaml
classification_rules:
  build_artifacts:
    patterns:
      - "**/target/**"
      - "**/build/**"
      - "**/dist/**"
      - "**/node_modules/**"
    action: categorize_as_build_cache

  development_tools:
    patterns:
      - "**/.maven/**"
      - "**/.gradle/**"
      - "**/.npm/**"
      - "**/.cache/**"
    action: categorize_as_tool_cache

  repositories:
    patterns:
      - "**/.git/**"
      - "**/repositories/**"
      - "**/gitea/**"
    action: categorize_as_vcs

  database_files:
    patterns:
      - "**/*.db"
      - "**/*.sqlite"
      - "**/postgresql/**"
      - "**/mysql/**"
    action: categorize_as_database

  model_files:
    patterns:
      - "**/*.bin"
      - "**/*.onnx"
      - "**/models/**"
      - "**/llm*/**"
    action: categorize_as_ai_model
```
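
A hedged sketch of how such a rule file could be evaluated. The YAML schema above is taken as-is, while the loader below (function names, PyYAML dependency, file name) is illustrative rather than the project's actual engine; the repository's `RuleBasedClassifier` in `app/classification/rules.py` hard-codes its rules in Python and matches with the same `fnmatch` approach:

```python
import fnmatch
from pathlib import PurePosixPath
from typing import Optional

import yaml  # PyYAML, assumed available


def load_rules(rule_file: str) -> dict[str, dict]:
    """Parse the classification_rules mapping from a YAML file like the one above."""
    with open(rule_file, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)["classification_rules"]


def classify(path: str, rules: dict[str, dict]) -> Optional[str]:
    """Return the action of the first rule whose glob patterns match the path."""
    posix = PurePosixPath(path).as_posix()
    for _name, rule in rules.items():
        if any(fnmatch.fnmatch(posix, pattern) for pattern in rule["patterns"]):
            return rule["action"]
    return None  # caller falls through to a default category


# rules = load_rules("classification_rules.yaml")        # illustrative file name
# classify("/data/project/build/output.jar", rules)      # -> "categorize_as_build_cache"
```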

## Performance Considerations

### NVMe Optimization Strategies:

1. **Parallel I/O Operations**
   - Queue depth optimization (32-64 operations)
   - Async I/O with io_uring where available
   - Multi-threaded directory traversal (see the sketch after this list)

2. **Memory Management**
   - Streaming processing for large files
   - Memory-mapped file access
   - Buffer pool for frequent operations

3. **CPU Optimization**
   - SIMD instructions for hashing (AVX2/NEON)
   - Process pool for parallel processing
   - NUMA-aware memory allocation
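
A sketch of the multi-threaded traversal named in item 1, using `os.scandir` with a thread pool; the worker count mirrors the 32-64 queue-depth target, the function names are illustrative, and an io_uring-based scanner would be a Linux-specific alternative:

```python
import os
from concurrent.futures import ThreadPoolExecutor


def _scan_one(directory: str) -> tuple[list[str], list[str]]:
    """Return (subdirectories, files) of one directory, skipping unreadable paths."""
    dirs: list[str] = []
    found: list[str] = []
    try:
        with os.scandir(directory) as it:
            for entry in it:
                if entry.is_dir(follow_symlinks=False):
                    dirs.append(entry.path)
                else:
                    found.append(entry.path)
    except (PermissionError, FileNotFoundError):
        pass  # consistent with the read-first approach: never touch, just skip
    return dirs, found


def scan_tree(root: str, workers: int = 32) -> list[str]:
    """Breadth-first traversal where each directory level is scanned by a thread pool."""
    files: list[str] = []
    level = [root]
    with ThreadPoolExecutor(max_workers=workers) as pool:
        while level:
            results = list(pool.map(_scan_one, level))
            level = [d for dirs, _ in results for d in dirs]
            files.extend(f for _, found in results for f in found)
    return files
```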

## Migration Strategy

### Three-Phase Approach:

```mermaid
graph LR
    A[Phase 1: Analysis] --> B[Phase 2: Staging]
    B --> C[Phase 3: Migration]

    A --> A1[Discovery Scan]
    A --> A2[Deduplication Analysis]
    A --> A3[Space Calculation]

    B --> B1[Create Target Structure]
    B --> B2[Hard Link Staging]
    B --> B3[Validation Check]

    C --> C1[Atomic Move Operations]
    C --> C2[Symlink Updates]
    C --> C3[Cleanup Verification]
```
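
A sketch of the staging and cut-over steps from the diagram: hard links make the target layout appear without copying data, and `os.replace` keeps the final move atomic within one filesystem. The function names and the symlink-back step are illustrative assumptions, not the project's actual migration engine:

```python
import os
from pathlib import Path


def stage_hard_link(source: Path, staging_root: Path, relative_target: str) -> Path:
    """Phase 2: expose the file in the staging tree via a hard link (no data copied).

    Hard links require source and staging tree to live on the same filesystem.
    """
    staged = staging_root / relative_target
    staged.parent.mkdir(parents=True, exist_ok=True)
    if not staged.exists():
        os.link(source, staged)
    return staged


def promote(staged: Path, final_root: Path, relative_target: str, leave_symlink: bool = True) -> Path:
    """Phase 3: atomically move the staged entry into the organized tree."""
    final = final_root / relative_target
    final.parent.mkdir(parents=True, exist_ok=True)
    os.replace(staged, final)      # atomic within one filesystem
    if leave_symlink:
        staged.symlink_to(final)   # illustrative take on the "Symlink Updates" step
    return final
```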

## Monitoring & Validation

### Key Metrics:
- **Processing Rate**: Files/second, GB/hour
- **Deduplication Ratio**: Original vs. final size
- **Error Rate**: Percentage of failed operations
- **Resource Usage**: CPU, memory, I/O utilization

### Validation Checks:
- File integrity verification (hash comparison; see the sketch below)
- Directory structure validation
- Symlink resolution testing
- Permission preservation audit
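
A minimal sketch of the hash-comparison check; it reuses whole-file SHA-256 and assumes the canonical hash may already have been recorded at index time. The `verify_copy` name and return shape are illustrative:

```python
import hashlib
from pathlib import Path


def sha256_of(path: Path, chunk_size: int = 1 << 20) -> str:
    """Stream a file into a SHA-256 digest."""
    hasher = hashlib.sha256()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(chunk_size), b""):
            hasher.update(block)
    return hasher.hexdigest()


def verify_copy(source: Path, target: Path, recorded_hash: str | None = None) -> bool:
    """Compare the migrated target against the source or a hash recorded during indexing."""
    expected = recorded_hash or sha256_of(source)
    return sha256_of(target) == expected
```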

## Risk Mitigation

### Safety Measures:
1. **Read-First Approach**: Never modify source until validation
2. **Incremental Processing**: Process in small batches
3. **Backup Verification**: Ensure backup integrity before operations
4. **Rollback Capability**: Maintain reverse mapping for recovery
5. **Dry-Run Mode**: Preview all operations before execution (see the sketch below)
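
A sketch of how dry-run mode and the reverse mapping could wrap every move; the journal format and function names are assumptions made for illustration:

```python
import json
import shutil
from pathlib import Path


def apply_move(source: Path, target: Path, journal: Path, dry_run: bool = True) -> None:
    """Preview or perform one move, and append a reverse-mapping entry for rollback."""
    if dry_run:
        print(f"[dry-run] {source} -> {target}")
        return
    target.parent.mkdir(parents=True, exist_ok=True)
    shutil.move(str(source), str(target))
    with open(journal, "a", encoding="utf-8") as f:
        # One JSON line per operation: enough to replay the move in reverse.
        f.write(json.dumps({"from": str(source), "to": str(target)}) + "\n")


def rollback(journal: Path) -> None:
    """Undo recorded moves in reverse order using the journal."""
    for line in reversed(journal.read_text(encoding="utf-8").splitlines()):
        entry = json.loads(line)
        shutil.move(entry["to"], entry["from"])
```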

## Implementation Timeline

### Phase 1: Tool Development (2-3 weeks)
- Core discovery engine
- Classification system
- Basic deduplication
- Testing framework

### Phase 2: Staging & Validation (1-2 weeks)
- Target structure creation
- Sample data processing
- Performance optimization
- Safety verification

### Phase 3: Production Migration (2-4 weeks)
- Full data processing
- Continuous monitoring
- Issue resolution
- Final validation

This architecture provides a robust, scalable solution for your data reorganization needs while maintaining data integrity and optimizing for your NVMe storage infrastructure.

38 Dockerfile Normal file
@@ -0,0 +1,38 @@
# Dockerfile for Project Defrag with PostgreSQL integration
FROM python:3.11-slim

# Install system dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    libpq-dev \
    postgresql-client \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONPATH=/app

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create non-root user
RUN useradd -m -u 1000 appuser && \
    chown -R appuser:appuser /app
USER appuser

# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD python -c "import psycopg2; psycopg2.connect(dbname='${POSTGRES_DB:-disk_reorganizer_db}', user='${POSTGRES_USER:-disk_reorg_user}', password='${POSTGRES_PASSWORD}', host='${DB_HOST:-db}', port='${DB_PORT:-5432}')" || exit 1

# Default command (can be overridden in docker-compose)
CMD ["python", "app/main.py", "--help"]

114 README.md Normal file
@@ -0,0 +1,114 @@
Here is your **extremely short, sharp, architectural** version: fully compressed, professional, and clear.
Source: processed from your file

---

# Disk Reorganizer — Architectural Summary

## Core Outcome

Migration from **SQLite → PostgreSQL** completed.
System is now **network-capable**, **auditable**, **scalable**, and offers **real-time operational telemetry**.

---

## Architecture

### Database Layer (PostgreSQL)

* Central DB: `disk_reorganizer_db`
* User: `disk_reorg_user`
* Tables: `files`, `operations`
* Features: indexes, triggers, conflict-upserts, audit fields
* Deployment: SQL + Windows/Linux setup scripts

### Application Layer

* Python driver migrated to **psycopg2**
* Unified DB config + connection pooling (sketched below)
* Refactored CRUD + batch commits
* Robust error handling + transactional execution
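
A minimal sketch of the pooled-connection pattern with psycopg2. The pool bounds, the `config.json` file name, and the `get_conn` helper are assumptions for illustration; the project's own configuration lives in `app/shared/config.py`:

```python
import json
from contextlib import contextmanager

from psycopg2.pool import ThreadedConnectionPool

# Assumed config file holding the keys shown in the Setup step below.
with open("config.json", "r", encoding="utf-8") as f:
    cfg = json.load(f)

pool = ThreadedConnectionPool(minconn=1, maxconn=8, **cfg)


@contextmanager
def get_conn():
    """Borrow a pooled connection and commit or roll back around the caller's work."""
    conn = pool.getconn()
    try:
        yield conn
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        pool.putconn(conn)


# with get_conn() as conn:
#     with conn.cursor() as cur:
#         cur.execute("SELECT COUNT(*) FROM files")
```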

### Operational Layer

* **Dynamic in-screen logging** during indexing + migration
  * Files/sec, GB processed, ETA, success/error counters
  * Clean single-line, non-spamming UI updates (sketched below)
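
A sketch of the non-spamming, single-line progress display: one terminal line rewritten in place with a carriage return. The counters and formatting here are illustrative; the real implementation is the `ProgressLogger` in `app/shared/logger.py`:

```python
import sys
import time


def show_progress(done: int, total: int, gb_done: float, started: float) -> None:
    """Rewrite a single status line with files/sec, GB processed, and ETA."""
    elapsed = max(time.monotonic() - started, 1e-6)
    rate = done / elapsed
    eta = (total - done) / rate if rate else float("inf")
    sys.stdout.write(
        f"\r{done}/{total} files | {rate:,.0f} files/s | "
        f"{gb_done:,.1f} GB | ETA {eta:,.0f}s"
    )
    sys.stdout.flush()


# start = time.monotonic()
# for i, path in enumerate(paths, 1):
#     ...process path...
#     if i % 500 == 0:  # update sparingly so the line does not flicker
#         show_progress(i, len(paths), gb_so_far, start)
```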

---

## Workflow

1. **Setup**

   ```json
   {
     "host": "192.168.1.159",
     "port": 5432,
     "database": "disk_reorganizer_db",
     "user": "disk_reorg_user",
     "password": "heel-goed-wachtwoord"
   }
   ```

   ```bash
   ./setup_database.sh   # or setup_database.bat
   pip install -r requirements.txt
   ```

2. **Index**

   ```bash
   python app/main.py index "D:\\" disk_d
   ```

3. **Plan**

   ```bash
   python app/main.py plan disk_d disk_e
   ```

4. **Dry-Run**

   ```bash
   python app/main.py execute plan.json --dry-run
   ```

5. **Execute**

   ```bash
   python app/main.py execute plan.json
   ```

6. **Report**

   ```bash
   python app/main.py report
   ```

---

## Guarantees

* No destructive actions by default
* Originals preserved
* Every action logged in DB
* Error-resilient, continues safely
* Suitable for millions of file records

---

## Failure Points to Check

* PostgreSQL reachable on 5432
* Correct credentials
* Disk permissions
* Python + psycopg2 installed

---

## Essence

A lean, safe, high-visibility disk migration tool running on a proper relational backbone, engineered for clarity, scale, and operational certainty.

Would you also like an **ultra-short executive one-pager** or a **diagram version**?

63 app/analysis/folder_analyzer.py Normal file
@@ -0,0 +1,63 @@
|
||||
from pathlib import Path
|
||||
from typing import Dict, Set, List
|
||||
from collections import Counter
|
||||
|
||||
class FolderAnalyzer:
|
||||
|
||||
def __init__(self):
|
||||
self.manifest_files = {'java': ['pom.xml', 'build.gradle', 'build.gradle.kts'], 'javascript': ['package.json', 'yarn.lock', 'package-lock.json'], 'python': ['pyproject.toml', 'setup.py', 'requirements.txt', 'Pipfile'], 'go': ['go.mod', 'go.sum'], 'rust': ['Cargo.toml', 'Cargo.lock'], 'docker': ['Dockerfile', 'docker-compose.yml', 'docker-compose.yaml'], 'k8s': ['helm', 'kustomization.yaml', 'deployment.yaml']}
|
||||
self.intent_keywords = {'infrastructure': ['infra', 'deploy', 'k8s', 'docker', 'terraform', 'ansible'], 'application': ['app', 'service', 'api', 'server', 'client'], 'data': ['data', 'dataset', 'models', 'training', 'ml'], 'documentation': ['docs', 'documentation', 'wiki', 'readme'], 'testing': ['test', 'tests', 'spec', 'e2e', 'integration'], 'build': ['build', 'dist', 'target', 'out', 'bin'], 'config': ['config', 'conf', 'settings', 'env']}
|
||||
|
||||
def analyze_folder(self, folder_path: Path, files: List[Dict]) -> Dict:
|
||||
files_list = [Path(f['path']) for f in files]
|
||||
has_readme = any(('readme' in f.name.lower() for f in files_list))
|
||||
has_git = any(('.git' in str(f) for f in files_list))
|
||||
manifest_types = self._detect_manifests(files_list)
|
||||
has_manifest = len(manifest_types) > 0
|
||||
file_types = Counter((f.suffix.lower() for f in files_list if f.suffix))
|
||||
dominant_types = dict(file_types.most_common(10))
|
||||
intent = self._infer_intent(folder_path.name.lower(), files_list)
|
||||
project_type = self._infer_project_type(manifest_types, dominant_types)
|
||||
structure = {'depth': len(folder_path.parts), 'has_src': any(('src' in str(f) for f in files_list[:20])), 'has_tests': any(('test' in str(f) for f in files_list[:20])), 'has_docs': any(('doc' in str(f) for f in files_list[:20]))}
|
||||
return {'has_readme': has_readme, 'has_git': has_git, 'has_manifest': has_manifest, 'manifest_types': manifest_types, 'dominant_file_types': dominant_types, 'project_type': project_type, 'intent': intent, 'structure': structure}
|
||||
|
||||
def _detect_manifests(self, files: List[Path]) -> List[str]:
|
||||
detected = []
|
||||
file_names = {f.name for f in files}
|
||||
for tech, manifests in self.manifest_files.items():
|
||||
if any((m in file_names for m in manifests)):
|
||||
detected.append(tech)
|
||||
return detected
|
||||
|
||||
def _infer_intent(self, folder_name: str, files: List[Path]) -> str:
|
||||
file_str = ' '.join((str(f) for f in files[:50]))
|
||||
for intent, keywords in self.intent_keywords.items():
|
||||
if any((kw in folder_name or kw in file_str.lower() for kw in keywords)):
|
||||
return intent
|
||||
return 'unknown'
|
||||
|
||||
def _infer_project_type(self, manifests: List[str], file_types: Dict) -> str:
|
||||
if manifests:
|
||||
return manifests[0]
|
||||
if '.py' in file_types and file_types.get('.py', 0) > 5:
|
||||
return 'python'
|
||||
if '.js' in file_types or '.ts' in file_types:
|
||||
return 'javascript'
|
||||
if '.java' in file_types:
|
||||
return 'java'
|
||||
if '.go' in file_types:
|
||||
return 'go'
|
||||
return 'mixed'
|
||||
|
||||
def generate_summary(self, folder_analysis: Dict, readme_text: str=None) -> str:
|
||||
parts = []
|
||||
if folder_analysis.get('project_type'):
|
||||
parts.append(f"{folder_analysis['project_type']} project")
|
||||
if folder_analysis.get('intent'):
|
||||
parts.append(f"for {folder_analysis['intent']}")
|
||||
if folder_analysis.get('manifest_types'):
|
||||
parts.append(f"using {', '.join(folder_analysis['manifest_types'])}")
|
||||
if readme_text:
|
||||
first_para = readme_text.split('\n\n')[0][:200]
|
||||
parts.append(f'Description: {first_para}')
|
||||
return ' '.join(parts) if parts else 'Mixed content folder'
|
||||

2 app/classification/__init__.py Normal file
@@ -0,0 +1,2 @@
from .classifier import FileClassifier
__all__ = ['FileClassifier']

30 app/classification/_protocols.py Normal file
@@ -0,0 +1,30 @@
from typing import Protocol, Optional
from pathlib import Path
from dataclasses import dataclass

@dataclass
class ClassificationRule:
    name: str
    category: str
    patterns: list[str]
    priority: int = 0
    description: str = ''

class IClassifier(Protocol):

    def classify(self, path: Path, file_type: Optional[str]=None) -> Optional[str]:
        ...

    def get_category_rules(self, category: str) -> list[ClassificationRule]:
        ...

class IRuleEngine(Protocol):

    def add_rule(self, rule: ClassificationRule) -> None:
        ...

    def remove_rule(self, rule_name: str) -> None:
        ...

    def match_path(self, path: Path) -> Optional[str]:
        ...

74 app/classification/classifier.py Normal file
@@ -0,0 +1,74 @@
|
||||
from pathlib import Path
|
||||
from typing import List, Set, Dict, Tuple
|
||||
import re
|
||||
|
||||
class FileClassifier:
|
||||
|
||||
def __init__(self):
|
||||
self.build_patterns = {'node_modules', '__pycache__', '.pytest_cache', 'target', 'build', 'dist', '.gradle', 'bin', 'obj', '.next', '.nuxt', 'vendor', '.venv', 'venv', 'site-packages', 'bower_components', 'jspm_packages'}
|
||||
self.artifact_patterns = {'java': {'.jar', '.war', '.ear', '.class'}, 'python': {'.pyc', '.pyo', '.whl', '.egg'}, 'node': {'node_modules'}, 'go': {'vendor', 'pkg'}, 'rust': {'target'}, 'docker': {'.dockerignore', 'Dockerfile'}}
|
||||
self.category_keywords = {'apps': {'app', 'application', 'service', 'api', 'server', 'client'}, 'infra': {'infrastructure', 'devops', 'docker', 'kubernetes', 'terraform', 'ansible', 'gitea', 'jenkins'}, 'dev': {'project', 'workspace', 'repo', 'src', 'code', 'dev'}, 'cache': {'cache', 'temp', 'tmp', '.cache'}, 'databases': {'postgres', 'mysql', 'redis', 'mongo', 'db', 'database'}, 'backups': {'backup', 'bak', 'snapshot', 'archive'}, 'user': {'documents', 'pictures', 'videos', 'downloads', 'desktop', 'music'}, 'artifacts': {'build', 'dist', 'release', 'output'}, 'temp': {'tmp', 'temp', 'staging', 'processing'}}
|
||||
self.media_extensions = {'video': {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv', '.webm'}, 'audio': {'.mp3', '.flac', '.wav', '.ogg', '.m4a', '.aac'}, 'image': {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'}, 'document': {'.pdf', '.doc', '.docx', '.txt', '.md', '.odt'}, 'spreadsheet': {'.xls', '.xlsx', '.csv', '.ods'}, 'presentation': {'.ppt', '.pptx', '.odp'}}
|
||||
self.code_extensions = {'.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h', '.cs', '.rb', '.php', '.swift', '.kt', '.scala', '.clj', '.r'}
|
||||
|
||||
def classify_path(self, path: str, size: int=0) -> Tuple[Set[str], str, bool]:
|
||||
p = Path(path)
|
||||
labels = set()
|
||||
primary_category = 'misc'
|
||||
is_build_artifact = False
|
||||
parts = p.parts
|
||||
name_lower = p.name.lower()
|
||||
for part in parts:
|
||||
part_lower = part.lower()
|
||||
if part_lower in self.build_patterns:
|
||||
is_build_artifact = True
|
||||
labels.add('build-artifact')
|
||||
break
|
||||
if is_build_artifact:
|
||||
for artifact_type, patterns in self.artifact_patterns.items():
|
||||
if any((part.lower() in patterns for part in parts)) or p.suffix in patterns:
|
||||
primary_category = f'artifacts/{artifact_type}'
|
||||
labels.add('artifact')
|
||||
return (labels, primary_category, is_build_artifact)
|
||||
if '.git' in parts:
|
||||
labels.add('vcs')
|
||||
primary_category = 'infra/git-infrastructure'
|
||||
return (labels, primary_category, False)
|
||||
for category, keywords in self.category_keywords.items():
|
||||
if any((kw in name_lower or any((kw in part.lower() for part in parts)) for kw in keywords)):
|
||||
labels.add(category)
|
||||
primary_category = category
|
||||
break
|
||||
for media_type, extensions in self.media_extensions.items():
|
||||
if p.suffix.lower() in extensions:
|
||||
labels.add(media_type)
|
||||
labels.add('media')
|
||||
primary_category = f'user/{media_type}'
|
||||
break
|
||||
if p.suffix.lower() in self.code_extensions:
|
||||
labels.add('code')
|
||||
if primary_category == 'misc':
|
||||
primary_category = 'dev'
|
||||
if size > 100 * 1024 * 1024:
|
||||
labels.add('large-file')
|
||||
if any((kw in name_lower for kw in ['test', 'spec', 'mock'])):
|
||||
labels.add('test')
|
||||
if any((kw in name_lower for kw in ['config', 'settings', 'env'])):
|
||||
labels.add('config')
|
||||
return (labels, primary_category, is_build_artifact)
|
||||
|
||||
def suggest_target_path(self, source_path: str, category: str, labels: Set[str]) -> str:
|
||||
p = Path(source_path)
|
||||
if 'build-artifact' in labels:
|
||||
return f'trash/build-artifacts/{source_path}'
|
||||
if category.startswith('artifacts/'):
|
||||
artifact_type = category.split('/')[-1]
|
||||
return f'artifacts/{artifact_type}/{p.name}'
|
||||
if category.startswith('user/'):
|
||||
media_type = category.split('/')[-1]
|
||||
return f'user/{media_type}/{p.name}'
|
||||
parts = [part for part in p.parts if part not in self.build_patterns]
|
||||
if len(parts) > 3:
|
||||
project_name = parts[0] if parts else 'misc'
|
||||
return f"{category}/{project_name}/{'/'.join(parts[1:])}"
|
||||
return f'{category}/{source_path}'
|
||||

148 app/classification/engine.py Normal file
@@ -0,0 +1,148 @@
|
||||
from pathlib import Path
|
||||
from typing import Optional, Callable
|
||||
import psycopg2
|
||||
from .rules import RuleBasedClassifier
|
||||
from .ml import create_ml_classifier, DummyMLClassifier
|
||||
from ..shared.models import ProcessingStats
|
||||
from ..shared.config import DatabaseConfig
|
||||
from ..shared.logger import ProgressLogger
|
||||
|
||||
class ClassificationEngine:
|
||||
|
||||
def __init__(self, db_config: DatabaseConfig, logger: ProgressLogger, use_ml: bool=False):
|
||||
self.db_config = db_config
|
||||
self.logger = logger
|
||||
self.rule_classifier = RuleBasedClassifier()
|
||||
self.ml_classifier = create_ml_classifier() if use_ml else None
|
||||
self.use_ml = use_ml and (not isinstance(self.ml_classifier, DummyMLClassifier))
|
||||
self._connection = None
|
||||
|
||||
def _get_connection(self):
|
||||
if self._connection is None or self._connection.closed:
|
||||
self._connection = psycopg2.connect(host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password)
|
||||
return self._connection
|
||||
|
||||
def classify_all(self, disk: Optional[str]=None, batch_size: int=1000, progress_callback: Optional[Callable[[int, int, ProcessingStats], None]]=None) -> ProcessingStats:
|
||||
self.logger.section('Starting Classification')
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
if disk:
|
||||
cursor.execute('\n SELECT path, checksum\n FROM files\n WHERE disk_label = %s AND category IS NULL\n ', (disk,))
|
||||
else:
|
||||
cursor.execute('\n SELECT path, checksum\n FROM files\n WHERE category IS NULL\n ')
|
||||
files_to_classify = cursor.fetchall()
|
||||
total_files = len(files_to_classify)
|
||||
self.logger.info(f'Found {total_files} files to classify')
|
||||
stats = ProcessingStats()
|
||||
batch = []
|
||||
for path_str, checksum in files_to_classify:
|
||||
path = Path(path_str)
|
||||
category = self.rule_classifier.classify(path)
|
||||
if category is None and self.use_ml and self.ml_classifier:
|
||||
category = self.ml_classifier.classify(path)
|
||||
if category is None:
|
||||
category = 'temp/processing'
|
||||
batch.append((category, str(path)))
|
||||
stats.files_processed += 1
|
||||
if len(batch) >= batch_size:
|
||||
self._update_categories(cursor, batch)
|
||||
conn.commit()
|
||||
batch.clear()
|
||||
if progress_callback:
|
||||
progress_callback(stats.files_processed, total_files, stats)
|
||||
if stats.files_processed % (batch_size * 10) == 0:
|
||||
self.logger.progress(stats.files_processed, total_files, prefix='Files classified', elapsed_seconds=stats.elapsed_seconds)
|
||||
if batch:
|
||||
self._update_categories(cursor, batch)
|
||||
conn.commit()
|
||||
stats.files_succeeded = stats.files_processed
|
||||
cursor.close()
|
||||
self.logger.info(f'Classification complete: {stats.files_processed} files in {stats.elapsed_seconds:.1f}s')
|
||||
return stats
|
||||
|
||||
def _update_categories(self, cursor, batch: list[tuple[str, str]]):
|
||||
from psycopg2.extras import execute_batch
|
||||
query = '\n UPDATE files\n SET category = %s\n WHERE path = %s\n '
|
||||
execute_batch(cursor, query, batch)
|
||||
|
||||
def classify_path(self, path: Path) -> Optional[str]:
|
||||
category = self.rule_classifier.classify(path)
|
||||
if category is None and self.use_ml and self.ml_classifier:
|
||||
category = self.ml_classifier.classify(path)
|
||||
return category
|
||||
|
||||
def get_category_stats(self) -> dict[str, dict]:
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('\n SELECT\n category,\n COUNT(*) as file_count,\n SUM(size) as total_size\n FROM files\n WHERE category IS NOT NULL\n GROUP BY category\n ORDER BY total_size DESC\n ')
|
||||
stats = {}
|
||||
for category, file_count, total_size in cursor.fetchall():
|
||||
stats[category] = {'file_count': file_count, 'total_size': total_size}
|
||||
cursor.close()
|
||||
return stats
|
||||
|
||||
def get_uncategorized_count(self) -> int:
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('SELECT COUNT(*) FROM files WHERE category IS NULL')
|
||||
count = cursor.fetchone()[0]
|
||||
cursor.close()
|
||||
return count
|
||||
|
||||
def reclassify_category(self, old_category: str, new_category: str) -> int:
|
||||
self.logger.info(f'Reclassifying {old_category} -> {new_category}')
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('\n UPDATE files\n SET category = %s\n WHERE category = %s\n ', (new_category, old_category))
|
||||
count = cursor.rowcount
|
||||
conn.commit()
|
||||
cursor.close()
|
||||
self.logger.info(f'Reclassified {count} files')
|
||||
return count
|
||||
|
||||
def train_ml_classifier(self, min_samples: int=10) -> bool:
|
||||
if not self.use_ml or self.ml_classifier is None:
|
||||
self.logger.warning('ML classifier not available')
|
||||
return False
|
||||
self.logger.subsection('Training ML Classifier')
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('\n SELECT path, category\n FROM files\n WHERE category IS NOT NULL\n ')
|
||||
training_data = [(Path(path), category) for path, category in cursor.fetchall()]
|
||||
cursor.close()
|
||||
if not training_data:
|
||||
self.logger.warning('No training data available')
|
||||
return False
|
||||
category_counts = {}
|
||||
for _, category in training_data:
|
||||
category_counts[category] = category_counts.get(category, 0) + 1
|
||||
filtered_data = [(path, category) for path, category in training_data if category_counts[category] >= min_samples]
|
||||
if not filtered_data:
|
||||
self.logger.warning(f'No categories with >= {min_samples} samples')
|
||||
return False
|
||||
self.logger.info(f'Training with {len(filtered_data)} samples')
|
||||
try:
|
||||
self.ml_classifier.train(filtered_data)
|
||||
self.logger.info('ML classifier trained successfully')
|
||||
return True
|
||||
except Exception as e:
|
||||
self.logger.error(f'Failed to train ML classifier: {e}')
|
||||
return False
|
||||
|
||||
def get_all_categories(self) -> list[str]:
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('\n SELECT DISTINCT category\n FROM files\n WHERE category IS NOT NULL\n ORDER BY category\n ')
|
||||
categories = [row[0] for row in cursor.fetchall()]
|
||||
cursor.close()
|
||||
return categories
|
||||
|
||||
def close(self):
|
||||
if self._connection and (not self._connection.closed):
|
||||
self._connection.close()
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.close()
|
||||

127 app/classification/ml.py Normal file
@@ -0,0 +1,127 @@
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Tuple
|
||||
import pickle
|
||||
try:
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
from sklearn.pipeline import Pipeline
|
||||
SKLEARN_AVAILABLE = True
|
||||
except ImportError:
|
||||
SKLEARN_AVAILABLE = False
|
||||
|
||||
class MLClassifier:
|
||||
|
||||
def __init__(self):
|
||||
if not SKLEARN_AVAILABLE:
|
||||
raise ImportError('scikit-learn is required for ML classification. Install with: pip install scikit-learn')
|
||||
self.model: Optional[Pipeline] = None
|
||||
self.categories: List[str] = []
|
||||
self._is_trained = False
|
||||
|
||||
def _extract_features(self, path: Path) -> str:
|
||||
parts = path.parts
|
||||
extension = path.suffix
|
||||
filename = path.name
|
||||
features = []
|
||||
features.extend(parts)
|
||||
if extension:
|
||||
features.append(f'ext:{extension}')
|
||||
name_parts = filename.replace('-', ' ').replace('_', ' ').replace('.', ' ').split()
|
||||
features.extend([f'name:{part}' for part in name_parts])
|
||||
return ' '.join(features)
|
||||
|
||||
def train(self, training_data: List[Tuple[Path, str]]) -> None:
|
||||
if not training_data:
|
||||
raise ValueError('Training data cannot be empty')
|
||||
X = [self._extract_features(path) for path, _ in training_data]
|
||||
y = [category for _, category in training_data]
|
||||
self.categories = sorted(set(y))
|
||||
self.model = Pipeline([('tfidf', TfidfVectorizer(max_features=1000, ngram_range=(1, 2), min_df=1)), ('classifier', MultinomialNB())])
|
||||
self.model.fit(X, y)
|
||||
self._is_trained = True
|
||||
|
||||
def classify(self, path: Path, file_type: Optional[str]=None) -> Optional[str]:
|
||||
if not self._is_trained or self.model is None:
|
||||
return None
|
||||
features = self._extract_features(path)
|
||||
try:
|
||||
prediction = self.model.predict([features])[0]
|
||||
return prediction
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def predict_proba(self, path: Path) -> dict[str, float]:
|
||||
if not self._is_trained or self.model is None:
|
||||
return {}
|
||||
features = self._extract_features(path)
|
||||
try:
|
||||
probabilities = self.model.predict_proba([features])[0]
|
||||
return {category: float(prob) for category, prob in zip(self.categories, probabilities)}
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
def save_model(self, model_path: Path) -> None:
|
||||
if not self._is_trained:
|
||||
raise ValueError('Cannot save untrained model')
|
||||
model_data = {'model': self.model, 'categories': self.categories, 'is_trained': self._is_trained}
|
||||
with open(model_path, 'wb') as f:
|
||||
pickle.dump(model_data, f)
|
||||
|
||||
def load_model(self, model_path: Path) -> None:
|
||||
with open(model_path, 'rb') as f:
|
||||
model_data = pickle.load(f)
|
||||
self.model = model_data['model']
|
||||
self.categories = model_data['categories']
|
||||
self._is_trained = model_data['is_trained']
|
||||
|
||||
@property
|
||||
def is_trained(self) -> bool:
|
||||
return self._is_trained
|
||||
|
||||
class DummyMLClassifier:
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def train(self, training_data: List[Tuple[Path, str]]) -> None:
|
||||
raise NotImplementedError('ML classification requires scikit-learn. Install with: pip install scikit-learn')
|
||||
|
||||
def classify(self, path: Path, file_type: Optional[str]=None) -> Optional[str]:
|
||||
return None
|
||||
|
||||
def predict_proba(self, path: Path) -> dict[str, float]:
|
||||
return {}
|
||||
|
||||
def save_model(self, model_path: Path) -> None:
|
||||
raise NotImplementedError('ML classification not available')
|
||||
|
||||
def load_model(self, model_path: Path) -> None:
|
||||
raise NotImplementedError('ML classification not available')
|
||||
|
||||
@property
|
||||
def is_trained(self) -> bool:
|
||||
return False
|
||||
|
||||
def create_ml_classifier() -> MLClassifier | DummyMLClassifier:
|
||||
if SKLEARN_AVAILABLE:
|
||||
return MLClassifier()
|
||||
else:
|
||||
return DummyMLClassifier()
|
||||
|
||||
def train_from_database(db_connection, min_samples_per_category: int=10) -> MLClassifier | DummyMLClassifier:
|
||||
classifier = create_ml_classifier()
|
||||
if isinstance(classifier, DummyMLClassifier):
|
||||
return classifier
|
||||
cursor = db_connection.cursor()
|
||||
cursor.execute('\n SELECT path, category\n FROM files\n WHERE category IS NOT NULL\n ')
|
||||
training_data = [(Path(path), category) for path, category in cursor.fetchall()]
|
||||
cursor.close()
|
||||
if not training_data:
|
||||
return classifier
|
||||
category_counts = {}
|
||||
for _, category in training_data:
|
||||
category_counts[category] = category_counts.get(category, 0) + 1
|
||||
filtered_data = [(path, category) for path, category in training_data if category_counts[category] >= min_samples_per_category]
|
||||
if filtered_data:
|
||||
classifier.train(filtered_data)
|
||||
return classifier
|
||||

60 app/classification/rules.py Normal file
@@ -0,0 +1,60 @@
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
import fnmatch
|
||||
from ._protocols import ClassificationRule
|
||||
|
||||
class RuleBasedClassifier:
|
||||
|
||||
def __init__(self):
|
||||
self.rules: list[ClassificationRule] = []
|
||||
self._load_default_rules()
|
||||
|
||||
def _load_default_rules(self):
|
||||
self.add_rule(ClassificationRule(name='maven_cache', category='artifacts/java/maven', patterns=['**/.m2/**', '**/.maven/**', '**/maven-central-cache/**'], priority=10, description='Maven repository and cache'))
|
||||
self.add_rule(ClassificationRule(name='gradle_cache', category='artifacts/java/gradle', patterns=['**/.gradle/**', '**/gradle-cache/**', '**/gradle-build-cache/**'], priority=10, description='Gradle cache and artifacts'))
|
||||
self.add_rule(ClassificationRule(name='python_cache', category='cache/pycache', patterns=['**/__pycache__/**', '**/*.pyc', '**/*.pyo'], priority=10, description='Python cache files'))
|
||||
self.add_rule(ClassificationRule(name='python_artifacts', category='artifacts/python', patterns=['**/pip-cache/**', '**/pypi-cache/**', '**/wheelhouse/**'], priority=10, description='Python package artifacts'))
|
||||
self.add_rule(ClassificationRule(name='node_modules', category='cache/node_modules-archive', patterns=['**/node_modules/**'], priority=10, description='Node.js modules'))
|
||||
self.add_rule(ClassificationRule(name='node_cache', category='artifacts/node', patterns=['**/.npm/**', '**/npm-registry/**', '**/yarn-cache/**', '**/pnpm-store/**'], priority=10, description='Node.js package managers cache'))
|
||||
self.add_rule(ClassificationRule(name='go_cache', category='artifacts/go', patterns=['**/goproxy-cache/**', '**/go/pkg/mod/**', '**/go-module-cache/**'], priority=10, description='Go module cache'))
|
||||
self.add_rule(ClassificationRule(name='git_repos', category='development/git-infrastructure', patterns=['**/.git/**', '**/gitea/repositories/**'], priority=15, description='Git repositories and infrastructure'))
|
||||
self.add_rule(ClassificationRule(name='gitea', category='development/gitea', patterns=['**/gitea/**'], priority=12, description='Gitea server data'))
|
||||
self.add_rule(ClassificationRule(name='postgresql', category='databases/postgresql', patterns=['**/postgresql/**', '**/postgres/**', '**/*.sql'], priority=10, description='PostgreSQL databases'))
|
||||
self.add_rule(ClassificationRule(name='mysql', category='databases/mysql', patterns=['**/mysql/**', '**/mariadb/**'], priority=10, description='MySQL/MariaDB databases'))
|
||||
self.add_rule(ClassificationRule(name='mongodb', category='databases/mongodb', patterns=['**/mongodb/**', '**/mongo/**'], priority=10, description='MongoDB databases'))
|
||||
self.add_rule(ClassificationRule(name='redis', category='databases/redis', patterns=['**/redis/**', '**/*.rdb'], priority=10, description='Redis databases'))
|
||||
self.add_rule(ClassificationRule(name='sqlite', category='databases/sqlite', patterns=['**/*.db', '**/*.sqlite', '**/*.sqlite3'], priority=8, description='SQLite databases'))
|
||||
self.add_rule(ClassificationRule(name='llm_models', category='cache/llm-models', patterns=['**/hugging-face/**', '**/huggingface/**', '**/.cache/huggingface/**', '**/models/**/*.bin', '**/models/**/*.onnx', '**/models/**/*.safetensors', '**/llm*/**', '**/openai-cache/**'], priority=12, description='LLM and AI model files'))
|
||||
self.add_rule(ClassificationRule(name='docker_volumes', category='apps/volumes/docker-volumes', patterns=['**/docker/volumes/**', '**/var/lib/docker/volumes/**'], priority=10, description='Docker volumes'))
|
||||
self.add_rule(ClassificationRule(name='app_data', category='apps/volumes/app-data', patterns=['**/app-data/**', '**/application-data/**'], priority=8, description='Application data'))
|
||||
self.add_rule(ClassificationRule(name='build_output', category='development/build-tools', patterns=['**/target/**', '**/build/**', '**/dist/**', '**/out/**'], priority=5, description='Build output directories'))
|
||||
self.add_rule(ClassificationRule(name='system_backups', category='backups/system', patterns=['**/backup/**', '**/backups/**', '**/*.bak', '**/*.backup'], priority=10, description='System backups'))
|
||||
self.add_rule(ClassificationRule(name='database_backups', category='backups/database', patterns=['**/*.sql.gz', '**/*.dump', '**/db-backup/**'], priority=11, description='Database backups'))
|
||||
self.add_rule(ClassificationRule(name='archives', category='backups/archive', patterns=['**/*.tar', '**/*.tar.gz', '**/*.tgz', '**/*.zip', '**/*.7z'], priority=5, description='Archive files'))
|
||||
|
||||
def add_rule(self, rule: ClassificationRule) -> None:
|
||||
self.rules.append(rule)
|
||||
self.rules.sort(key=lambda r: r.priority, reverse=True)
|
||||
|
||||
def remove_rule(self, rule_name: str) -> None:
|
||||
self.rules = [r for r in self.rules if r.name != rule_name]
|
||||
|
||||
def match_path(self, path: Path) -> Optional[str]:
|
||||
path_str = str(path)
|
||||
for rule in self.rules:
|
||||
for pattern in rule.patterns:
|
||||
if fnmatch.fnmatch(path_str, pattern):
|
||||
return rule.category
|
||||
return None
|
||||
|
||||
def classify(self, path: Path, file_type: Optional[str]=None) -> Optional[str]:
|
||||
return self.match_path(path)
|
||||
|
||||
def get_category_rules(self, category: str) -> list[ClassificationRule]:
|
||||
return [r for r in self.rules if r.category == category]
|
||||
|
||||
def get_all_categories(self) -> set[str]:
|
||||
return {r.category for r in self.rules}
|
||||
|
||||
def get_rules_by_priority(self, min_priority: int=0) -> list[ClassificationRule]:
|
||||
return [r for r in self.rules if r.priority >= min_priority]
|
||||

3 app/content/__init__.py Normal file
@@ -0,0 +1,3 @@
from .profiler import ContentProfiler
from .extractors import ContentExtractor
__all__ = ['ContentProfiler', 'ContentExtractor']

62 app/content/extractors.py Normal file
@@ -0,0 +1,62 @@
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional
|
||||
import json
|
||||
|
||||
class ContentExtractor:
|
||||
|
||||
def __init__(self):
|
||||
self.extractors = {'pdf_text': self._extract_pdf, 'ocr+caption': self._extract_image, 'transcribe': self._extract_audio, 'transcribe+scenes': self._extract_video, 'office_text': self._extract_document, 'read': self._extract_text, 'read+syntax': self._extract_code}
|
||||
|
||||
def extract(self, file_path: Path, extractor_type: str) -> Dict:
|
||||
extractor = self.extractors.get(extractor_type)
|
||||
if not extractor:
|
||||
return {'error': f'Unknown extractor: {extractor_type}'}
|
||||
try:
|
||||
return extractor(file_path)
|
||||
except Exception as e:
|
||||
return {'error': str(e)}
|
||||
|
||||
def _extract_text(self, file_path: Path) -> Dict:
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
content = f.read(1024 * 1024)
|
||||
return {'text': content, 'char_count': len(content), 'needs_llm': False}
|
||||
except Exception as e:
|
||||
return {'error': str(e)}
|
||||
|
||||
def _extract_code(self, file_path: Path) -> Dict:
|
||||
result = self._extract_text(file_path)
|
||||
if 'error' not in result:
|
||||
result['type'] = 'code'
|
||||
result['needs_llm'] = True
|
||||
return result
|
||||
|
||||
def _extract_pdf(self, file_path: Path) -> Dict:
|
||||
try:
|
||||
import PyPDF2
|
||||
text_parts = []
|
||||
with open(file_path, 'rb') as f:
|
||||
pdf = PyPDF2.PdfReader(f)
|
||||
for page in pdf.pages[:10]:
|
||||
text_parts.append(page.extract_text())
|
||||
text = '\n'.join(text_parts)
|
||||
return {'text': text, 'pages_extracted': len(text_parts), 'needs_llm': len(text.strip()) > 100, 'type': 'document'}
|
||||
except Exception as e:
|
||||
return {'error': str(e), 'needs_ocr': True}
|
||||
|
||||
def _extract_image(self, file_path: Path) -> Dict:
|
||||
return {'type': 'image', 'needs_ocr': True, 'needs_caption': True, 'needs_llm': True, 'pipeline': ['ocr', 'caption', 'embedding'], 'status': 'pending'}
|
||||
|
||||
def _extract_audio(self, file_path: Path) -> Dict:
|
||||
return {'type': 'audio', 'needs_transcription': True, 'needs_llm': True, 'pipeline': ['transcribe', 'summarize'], 'status': 'pending'}
|
||||
|
||||
def _extract_video(self, file_path: Path) -> Dict:
|
||||
return {'type': 'video', 'needs_transcription': True, 'needs_scene_detection': True, 'needs_llm': True, 'pipeline': ['transcribe', 'scenes', 'summarize'], 'status': 'pending'}
|
||||
|
||||
def _extract_document(self, file_path: Path) -> Dict:
|
||||
try:
|
||||
import textract
|
||||
text = textract.process(str(file_path)).decode('utf-8')
|
||||
return {'text': text, 'type': 'document', 'needs_llm': len(text.strip()) > 100}
|
||||
except Exception:
|
||||
return {'error': 'textract failed', 'needs_llm': True}
|
||||

108 app/content/profiler.py Normal file
@@ -0,0 +1,108 @@
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional, Tuple
|
||||
import mimetypes
|
||||
import magic
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
class ContentProfiler:
|
||||
|
||||
def __init__(self):
|
||||
self.mime_detector = magic.Magic(mime=True)
|
||||
self.kind_mapping = {'text': ['text/plain', 'text/html', 'text/css', 'text/javascript', 'text/markdown'], 'code': ['application/x-python', 'application/javascript', 'text/x-java', 'text/x-c'], 'pdf': ['application/pdf'], 'image': ['image/jpeg', 'image/png', 'image/gif', 'image/webp', 'image/svg+xml'], 'audio': ['audio/mpeg', 'audio/wav', 'audio/ogg', 'audio/flac'], 'video': ['video/mp4', 'video/x-matroska', 'video/avi', 'video/webm'], 'archive': ['application/zip', 'application/x-tar', 'application/gzip', 'application/x-7z-compressed'], 'document': ['application/msword', 'application/vnd.openxmlformats-officedocument'], 'spreadsheet': ['application/vnd.ms-excel', 'text/csv']}
|
||||
self.text_exts = {'.txt', '.md', '.rst', '.log', '.json', '.xml', '.yaml', '.yml', '.toml', '.ini', '.cfg'}
|
||||
self.code_exts = {'.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h', '.cs', '.rb', '.php'}
|
||||
self.processable_kinds = {'text', 'code', 'pdf', 'image', 'audio', 'video', 'document'}
|
||||
|
||||
def profile_file(self, file_path: Path) -> Dict:
|
||||
try:
|
||||
stat = file_path.stat()
|
||||
size = stat.st_size
|
||||
mtime = datetime.fromtimestamp(stat.st_mtime)
|
||||
mime_type = self._detect_mime(file_path)
|
||||
kind = self._determine_kind(file_path, mime_type)
|
||||
profile = {'path': str(file_path), 'size': size, 'mtime': mtime.isoformat(), 'mime': mime_type, 'kind': kind, 'processable': kind in self.processable_kinds, 'extractor': self._suggest_extractor(kind, mime_type), 'hints': self._extract_hints(file_path, kind, mime_type, size)}
|
||||
return profile
|
||||
except Exception as e:
|
||||
return {'path': str(file_path), 'error': str(e), 'processable': False}
|
||||
|
||||
def _detect_mime(self, file_path: Path) -> str:
|
||||
try:
|
||||
return self.mime_detector.from_file(str(file_path))
|
||||
except Exception:
|
||||
guess = mimetypes.guess_type(str(file_path))[0]
|
||||
return guess or 'application/octet-stream'
|
||||
|
||||
def _determine_kind(self, file_path: Path, mime_type: str) -> str:
|
||||
for kind, mimes in self.kind_mapping.items():
|
||||
if any((mime in mime_type for mime in mimes)):
|
||||
return kind
|
||||
suffix = file_path.suffix.lower()
|
||||
if suffix in self.text_exts:
|
||||
return 'text'
|
||||
if suffix in self.code_exts:
|
||||
return 'code'
|
||||
return 'unknown'
|
||||
|
||||
def _suggest_extractor(self, kind: str, mime_type: str) -> Optional[str]:
|
||||
extractors = {'pdf': 'pdf_text', 'image': 'ocr+caption', 'audio': 'transcribe', 'video': 'transcribe+scenes', 'document': 'office_text', 'text': 'read', 'code': 'read+syntax'}
|
||||
return extractors.get(kind)
|
||||
|
||||
def _extract_hints(self, file_path: Path, kind: str, mime_type: str, size: int) -> Dict:
|
||||
hints = {}
|
||||
if kind == 'text' or kind == 'code':
|
||||
hints['language'] = self._guess_language(file_path)
|
||||
if size < 1024 * 1024:
|
||||
hints['lines'] = self._count_lines(file_path)
|
||||
if kind == 'pdf':
|
||||
hints['page_count'] = self._get_pdf_pages(file_path)
|
||||
if kind in ['audio', 'video']:
|
||||
hints['duration'] = self._get_media_duration(file_path)
|
||||
if kind == 'image':
|
||||
hints['has_exif'] = self._has_exif(file_path)
|
||||
hints['dimensions'] = self._get_image_dimensions(file_path)
|
||||
return hints
|
||||
|
||||
def _guess_language(self, file_path: Path) -> Optional[str]:
|
||||
lang_map = {'.py': 'python', '.js': 'javascript', '.ts': 'typescript', '.java': 'java', '.go': 'go', '.rs': 'rust', '.c': 'c', '.cpp': 'cpp', '.cs': 'csharp', '.rb': 'ruby', '.php': 'php'}
|
||||
return lang_map.get(file_path.suffix.lower())
|
||||
|
||||
def _count_lines(self, file_path: Path) -> Optional[int]:
|
||||
try:
|
||||
with open(file_path, 'rb') as f:
|
||||
return sum((1 for _ in f))
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def _get_pdf_pages(self, file_path: Path) -> Optional[int]:
|
||||
try:
|
||||
import PyPDF2
|
||||
with open(file_path, 'rb') as f:
|
||||
pdf = PyPDF2.PdfReader(f)
|
||||
return len(pdf.pages)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def _get_media_duration(self, file_path: Path) -> Optional[float]:
|
||||
try:
|
||||
import ffmpeg
|
||||
probe = ffmpeg.probe(str(file_path))
|
||||
return float(probe['format']['duration'])
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def _has_exif(self, file_path: Path) -> bool:
|
||||
try:
|
||||
from PIL import Image
|
||||
img = Image.open(file_path)
|
||||
return hasattr(img, '_getexif') and img._getexif() is not None
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _get_image_dimensions(self, file_path: Path) -> Optional[Tuple[int, int]]:
|
||||
try:
|
||||
from PIL import Image
|
||||
with Image.open(file_path) as img:
|
||||
return img.size
|
||||
except Exception:
|
||||
return None
|
||||

21 app/deduplication/__init__.py Normal file
@@ -0,0 +1,21 @@
"""Deduplication package exports"""
from .chunker import (
    RabinChunker,
    SimpleChunker,
    hash_chunk,
    hash_file,
    compute_file_signature
)
from .store import HashStore, MemoryHashStore
from .engine import DeduplicationEngine

__all__ = [
    'RabinChunker',
    'SimpleChunker',
    'hash_chunk',
    'hash_file',
    'compute_file_signature',
    'HashStore',
    'MemoryHashStore',
    'DeduplicationEngine',
]

0 app/deduplication/_protocols.py Normal file

241 app/deduplication/chunker.py Normal file
@@ -0,0 +1,241 @@
|
||||
"""Rabin fingerprint chunker for content-defined chunking"""
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from typing import Iterator, Optional
|
||||
|
||||
|
||||
class RabinChunker:
|
||||
"""Content-defined chunking using Rabin fingerprinting
|
||||
|
||||
Uses a rolling hash to identify chunk boundaries based on content,
|
||||
allowing for efficient deduplication even when data is modified.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
avg_chunk_size: int = 8192,
|
||||
min_chunk_size: Optional[int] = None,
|
||||
max_chunk_size: Optional[int] = None,
|
||||
window_size: int = 48
|
||||
):
|
||||
"""Initialize Rabin chunker
|
||||
|
||||
Args:
|
||||
avg_chunk_size: Target average chunk size in bytes
|
||||
min_chunk_size: Minimum chunk size (default: avg_chunk_size // 4)
|
||||
max_chunk_size: Maximum chunk size (default: avg_chunk_size * 8)
|
||||
window_size: Rolling hash window size
|
||||
"""
|
||||
self.avg_chunk_size = avg_chunk_size
|
||||
self.min_chunk_size = min_chunk_size or (avg_chunk_size // 4)
|
||||
self.max_chunk_size = max_chunk_size or (avg_chunk_size * 8)
|
||||
self.window_size = window_size
|
||||
|
||||
# Calculate mask for boundary detection
|
||||
# For avg_chunk_size, we want boundaries at 1/avg_chunk_size probability
|
||||
bits = 0
|
||||
size = avg_chunk_size
|
||||
while size > 1:
|
||||
bits += 1
|
||||
size >>= 1
|
||||
self.mask = (1 << bits) - 1
|
||||
|
||||
# Polynomial for rolling hash (prime number)
|
||||
self.poly = 0x3DA3358B4DC173
|
||||
|
||||
def chunk_file(self, file_path: Path, chunk_size: Optional[int] = None) -> Iterator[bytes]:
|
||||
"""Chunk a file using Rabin fingerprinting
|
||||
|
||||
Args:
|
||||
file_path: Path to file to chunk
|
||||
chunk_size: If provided, use fixed-size chunking instead
|
||||
|
||||
Yields:
|
||||
Chunk data as bytes
|
||||
"""
|
||||
if chunk_size:
|
||||
# Use fixed-size chunking
|
||||
yield from self._chunk_fixed(file_path, chunk_size)
|
||||
else:
|
||||
# Use content-defined chunking
|
||||
yield from self._chunk_rabin(file_path)
|
||||
|
||||
def _chunk_fixed(self, file_path: Path, chunk_size: int) -> Iterator[bytes]:
|
||||
"""Fixed-size chunking
|
||||
|
||||
Args:
|
||||
file_path: Path to file
|
||||
chunk_size: Chunk size in bytes
|
||||
|
||||
Yields:
|
||||
Fixed-size chunks
|
||||
"""
|
||||
with open(file_path, 'rb') as f:
|
||||
while True:
|
||||
chunk = f.read(chunk_size)
|
||||
if not chunk:
|
||||
break
|
||||
yield chunk
|
||||
|
||||
def _chunk_rabin(self, file_path: Path) -> Iterator[bytes]:
|
||||
"""Content-defined chunking using Rabin fingerprinting
|
||||
|
||||
Args:
|
||||
file_path: Path to file
|
||||
|
||||
Yields:
|
||||
Variable-size chunks based on content
|
||||
"""
|
||||
with open(file_path, 'rb') as f:
|
||||
chunk_data = bytearray()
|
||||
window = bytearray()
|
||||
hash_value = 0
|
||||
|
||||
while True:
|
||||
byte = f.read(1)
|
||||
if not byte:
|
||||
# End of file - yield remaining data
|
||||
if chunk_data:
|
||||
yield bytes(chunk_data)
|
||||
break
|
||||
|
||||
chunk_data.extend(byte)
|
||||
window.extend(byte)
|
||||
|
||||
# Maintain window size
|
||||
if len(window) > self.window_size:
|
||||
window.pop(0)
|
||||
|
||||
# Update rolling hash
|
||||
hash_value = self._rolling_hash(window)
|
||||
|
||||
# Check if we should create a boundary
|
||||
should_break = (
|
||||
len(chunk_data) >= self.min_chunk_size and
|
||||
(
|
||||
(hash_value & self.mask) == 0 or
|
||||
len(chunk_data) >= self.max_chunk_size
|
||||
)
|
||||
)
|
||||
|
||||
if should_break:
|
||||
yield bytes(chunk_data)
|
||||
chunk_data = bytearray()
|
||||
window = bytearray()
|
||||
hash_value = 0
|
||||
|
||||
def _rolling_hash(self, window: bytearray) -> int:
|
||||
"""Calculate rolling hash for window
|
||||
|
||||
Args:
|
||||
window: Byte window
|
||||
|
||||
Returns:
|
||||
Hash value
|
||||
"""
|
||||
hash_value = 0
|
||||
for byte in window:
|
||||
hash_value = ((hash_value << 1) + byte) & 0xFFFFFFFFFFFFFFFF
|
||||
return hash_value
|
||||
|
||||
|
||||
class SimpleChunker:
|
||||
"""Simple fixed-size chunker for comparison"""
|
||||
|
||||
def __init__(self, chunk_size: int = 8192):
|
||||
"""Initialize simple chunker
|
||||
|
||||
Args:
|
||||
chunk_size: Fixed chunk size in bytes
|
||||
"""
|
||||
self.chunk_size = chunk_size
|
||||
|
||||
def chunk_file(self, file_path: Path) -> Iterator[bytes]:
|
||||
"""Chunk file into fixed-size pieces
|
||||
|
||||
Args:
|
||||
file_path: Path to file
|
||||
|
||||
Yields:
|
||||
Fixed-size chunks
|
||||
"""
|
||||
with open(file_path, 'rb') as f:
|
||||
while True:
|
||||
chunk = f.read(self.chunk_size)
|
||||
if not chunk:
|
||||
break
|
||||
yield chunk
|
||||
|
||||
|
||||
def hash_chunk(chunk: bytes, algorithm: str = 'sha256') -> str:
|
||||
"""Hash a chunk of data
|
||||
|
||||
Args:
|
||||
chunk: Chunk data
|
||||
algorithm: Hash algorithm (default: sha256)
|
||||
|
||||
Returns:
|
||||
Hex digest of hash
|
||||
"""
|
||||
hasher = hashlib.new(algorithm)
|
||||
hasher.update(chunk)
|
||||
return hasher.hexdigest()
|
||||
|
||||
|
||||
def hash_file(file_path: Path, algorithm: str = 'sha256', chunk_size: int = 65536) -> str:
|
||||
"""Hash entire file
|
||||
|
||||
Args:
|
||||
file_path: Path to file
|
||||
algorithm: Hash algorithm (default: sha256)
|
||||
chunk_size: Size of chunks to read
|
||||
|
||||
Returns:
|
||||
Hex digest of file hash
|
||||
"""
|
||||
hasher = hashlib.new(algorithm)
|
||||
|
||||
with open(file_path, 'rb') as f:
|
||||
while True:
|
||||
chunk = f.read(chunk_size)
|
||||
if not chunk:
|
||||
break
|
||||
hasher.update(chunk)
|
||||
|
||||
return hasher.hexdigest()
|
||||
|
||||
|
||||
def compute_file_signature(
|
||||
file_path: Path,
|
||||
use_rabin: bool = True,
|
||||
avg_chunk_size: int = 8192
|
||||
) -> tuple[str, list[str]]:
|
||||
"""Compute file signature with chunk hashes
|
||||
|
||||
Args:
|
||||
file_path: Path to file
|
||||
use_rabin: Whether to use Rabin chunking (vs fixed-size)
|
||||
avg_chunk_size: Average chunk size for Rabin or fixed size
|
||||
|
||||
Returns:
|
||||
Tuple of (file_hash, list of chunk hashes)
|
||||
"""
|
||||
if use_rabin:
|
||||
chunker = RabinChunker(avg_chunk_size=avg_chunk_size)
|
||||
else:
|
||||
chunker = SimpleChunker(chunk_size=avg_chunk_size)
|
||||
|
||||
chunk_hashes = []
|
||||
file_hasher = hashlib.sha256()
|
||||
|
||||
for chunk in chunker.chunk_file(file_path):
|
||||
# Hash individual chunk
|
||||
chunk_hash = hash_chunk(chunk)
|
||||
chunk_hashes.append(chunk_hash)
|
||||
|
||||
# Update file hash
|
||||
file_hasher.update(chunk)
|
||||
|
||||
file_hash = file_hasher.hexdigest()
|
||||
|
||||
return file_hash, chunk_hashes
|
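A minimal usage sketch for the chunking helpers above, assuming the repository root is on PYTHONPATH so the `app` package is importable; the sample path is a placeholder.

```python
from pathlib import Path

from app.deduplication.chunker import RabinChunker, compute_file_signature

sample = Path("/tmp/sample.bin")  # placeholder path

# Whole-file SHA-256 plus per-chunk hashes via content-defined chunking.
file_hash, chunk_hashes = compute_file_signature(sample, use_rabin=True, avg_chunk_size=8192)
print(file_hash, len(chunk_hashes), "chunks")

# Chunks can also be consumed directly, e.g. to inspect the size distribution.
chunker = RabinChunker(avg_chunk_size=8192)
sizes = [len(chunk) for chunk in chunker.chunk_file(sample)]
print(sum(sizes), "bytes in", len(sizes), "chunks")
```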
||||
353
app/deduplication/engine.py
Normal file
@@ -0,0 +1,353 @@
|
||||
"""Deduplication engine"""
|
||||
from pathlib import Path
|
||||
from typing import Optional, Callable
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
import psycopg2
|
||||
|
||||
from .chunker import compute_file_signature, hash_file
|
||||
from .store import HashStore
|
||||
from ..shared.models import FileRecord, ProcessingStats
|
||||
from ..shared.config import DatabaseConfig, ProcessingConfig
|
||||
from ..shared.logger import ProgressLogger
|
||||
|
||||
|
||||
class DeduplicationEngine:
|
||||
"""Engine for deduplicating files"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
db_config: DatabaseConfig,
|
||||
processing_config: ProcessingConfig,
|
||||
logger: ProgressLogger
|
||||
):
|
||||
"""Initialize deduplication engine
|
||||
|
||||
Args:
|
||||
db_config: Database configuration
|
||||
processing_config: Processing configuration
|
||||
logger: Progress logger
|
||||
"""
|
||||
self.db_config = db_config
|
||||
self.processing_config = processing_config
|
||||
self.logger = logger
|
||||
self.hash_store = HashStore(db_config)
|
||||
self._connection = None
|
||||
|
||||
def _get_connection(self):
|
||||
"""Get or create database connection"""
|
||||
if self._connection is None or self._connection.closed:
|
||||
self._connection = psycopg2.connect(
|
||||
host=self.db_config.host,
|
||||
port=self.db_config.port,
|
||||
database=self.db_config.database,
|
||||
user=self.db_config.user,
|
||||
password=self.db_config.password
|
||||
)
|
||||
return self._connection
|
||||
|
||||
def deduplicate_all(
|
||||
self,
|
||||
disk: Optional[str] = None,
|
||||
use_chunks: bool = True,
|
||||
progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
|
||||
) -> ProcessingStats:
|
||||
"""Deduplicate all files in database
|
||||
|
||||
Args:
|
||||
disk: Optional disk filter
|
||||
use_chunks: Whether to use chunk-level deduplication
|
||||
progress_callback: Optional callback for progress updates
|
||||
|
||||
Returns:
|
||||
ProcessingStats with deduplication statistics
|
||||
"""
|
||||
self.logger.section("Starting Deduplication")
|
||||
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Get files without checksums
|
||||
if disk:
|
||||
cursor.execute("""
|
||||
SELECT path, size
|
||||
FROM files
|
||||
WHERE disk_label = %s AND checksum IS NULL
|
||||
ORDER BY size DESC
|
||||
""", (disk,))
|
||||
else:
|
||||
cursor.execute("""
|
||||
SELECT path, size
|
||||
FROM files
|
||||
WHERE checksum IS NULL
|
||||
ORDER BY size DESC
|
||||
""")
|
||||
|
||||
files_to_process = cursor.fetchall()
|
||||
total_files = len(files_to_process)
|
||||
|
||||
self.logger.info(f"Found {total_files} files to process")
|
||||
|
||||
stats = ProcessingStats()
|
||||
|
||||
# Process files with thread pool
|
||||
with ThreadPoolExecutor(max_workers=self.processing_config.parallel_workers) as executor:
|
||||
futures = {}
|
||||
|
||||
for path_str, size in files_to_process:
|
||||
path = Path(path_str)
|
||||
future = executor.submit(self._process_file, path, use_chunks)
|
||||
futures[future] = (path, size)
|
||||
|
||||
# Process completed futures
|
||||
for future in as_completed(futures):
|
||||
path, size = futures[future]
|
||||
|
||||
try:
|
||||
checksum, duplicate_of = future.result()
|
||||
|
||||
if checksum:
|
||||
# Update database
|
||||
cursor.execute("""
|
||||
UPDATE files
|
||||
SET checksum = %s, duplicate_of = %s
|
||||
WHERE path = %s
|
||||
""", (checksum, duplicate_of, str(path)))
|
||||
|
||||
stats.files_succeeded += 1
|
||||
stats.bytes_processed += size
|
||||
|
||||
stats.files_processed += 1
|
||||
|
||||
# Commit periodically
|
||||
if stats.files_processed % self.processing_config.commit_interval == 0:
|
||||
conn.commit()
|
||||
|
||||
# Progress callback
|
||||
if progress_callback:
|
||||
progress_callback(stats.files_processed, total_files, stats)
|
||||
|
||||
# Log progress
|
||||
self.logger.progress(
|
||||
stats.files_processed,
|
||||
total_files,
|
||||
prefix="Files processed",
|
||||
bytes_processed=stats.bytes_processed,
|
||||
elapsed_seconds=stats.elapsed_seconds
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Failed to process {path}: {e}")
|
||||
stats.files_failed += 1
|
||||
stats.files_processed += 1
|
||||
|
||||
# Final commit
|
||||
conn.commit()
|
||||
cursor.close()
|
||||
|
||||
self.logger.info(
|
||||
f"Deduplication complete: {stats.files_succeeded}/{total_files} files, "
|
||||
f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
|
||||
)
|
||||
|
||||
return stats
|
||||
|
||||
def _process_file(
|
||||
self,
|
||||
path: Path,
|
||||
use_chunks: bool
|
||||
) -> tuple[Optional[str], Optional[str]]:
|
||||
"""Process a single file for deduplication
|
||||
|
||||
Args:
|
||||
path: Path to file
|
||||
use_chunks: Whether to use chunk-level deduplication
|
||||
|
||||
Returns:
|
||||
Tuple of (checksum, duplicate_of_path)
|
||||
"""
|
||||
if not path.exists():
|
||||
return None, None
|
||||
|
||||
try:
|
||||
if use_chunks:
|
||||
# Compute file signature with chunks
|
||||
checksum, chunk_hashes = compute_file_signature(
|
||||
path,
|
||||
use_rabin=True,
|
||||
avg_chunk_size=self.processing_config.chunk_size
|
||||
)
|
||||
else:
|
||||
# Just compute file hash
|
||||
checksum = hash_file(
|
||||
path,
|
||||
algorithm=self.processing_config.hash_algorithm
|
||||
)
|
||||
chunk_hashes = None
|
||||
|
||||
# Check if hash exists
|
||||
if self.hash_store.exists(checksum):
|
||||
# Duplicate found
|
||||
canonical_path = self.hash_store.get_canonical(checksum)
|
||||
return checksum, canonical_path
|
||||
else:
|
||||
# New unique file
|
||||
size = path.stat().st_size
|
||||
self.hash_store.store_canonical(
|
||||
checksum,
|
||||
path,
|
||||
size,
|
||||
chunk_hashes
|
||||
)
|
||||
return checksum, None
|
||||
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Error processing {path}: {e}")
|
||||
raise
|
||||
|
||||
def find_duplicates(
|
||||
self,
|
||||
disk: Optional[str] = None
|
||||
) -> dict[str, list[str]]:
|
||||
"""Find all duplicate files
|
||||
|
||||
Args:
|
||||
disk: Optional disk filter
|
||||
|
||||
Returns:
|
||||
Dictionary mapping canonical path to list of duplicate paths
|
||||
"""
|
||||
self.logger.subsection("Finding Duplicates")
|
||||
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Query for duplicates
|
||||
if disk:
|
||||
cursor.execute("""
|
||||
SELECT checksum, array_agg(path ORDER BY path) as paths
|
||||
FROM files
|
||||
WHERE disk_label = %s AND checksum IS NOT NULL
|
||||
GROUP BY checksum
|
||||
HAVING COUNT(*) > 1
|
||||
""", (disk,))
|
||||
else:
|
||||
cursor.execute("""
|
||||
SELECT checksum, array_agg(path ORDER BY path) as paths
|
||||
FROM files
|
||||
WHERE checksum IS NOT NULL
|
||||
GROUP BY checksum
|
||||
HAVING COUNT(*) > 1
|
||||
""")
|
||||
|
||||
duplicates = {}
|
||||
for checksum, paths in cursor.fetchall():
|
||||
canonical = paths[0]
|
||||
duplicates[canonical] = paths[1:]
|
||||
|
||||
cursor.close()
|
||||
|
||||
self.logger.info(f"Found {len(duplicates)} sets of duplicates")
|
||||
|
||||
return duplicates
|
||||
|
||||
def get_deduplication_stats(self) -> dict:
|
||||
"""Get deduplication statistics
|
||||
|
||||
Returns:
|
||||
Dictionary with statistics
|
||||
"""
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
stats = {}
|
||||
|
||||
# Total files
|
||||
cursor.execute("SELECT COUNT(*) FROM files WHERE checksum IS NOT NULL")
|
||||
stats['total_files'] = cursor.fetchone()[0]
|
||||
|
||||
# Unique files
|
||||
cursor.execute("SELECT COUNT(DISTINCT checksum) FROM files WHERE checksum IS NOT NULL")
|
||||
stats['unique_files'] = cursor.fetchone()[0]
|
||||
|
||||
# Duplicate files
|
||||
stats['duplicate_files'] = stats['total_files'] - stats['unique_files']
|
||||
|
||||
# Total size
|
||||
cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE checksum IS NOT NULL")
|
||||
stats['total_size'] = cursor.fetchone()[0]
|
||||
|
||||
# Unique size
|
||||
cursor.execute("""
|
||||
SELECT COALESCE(SUM(size), 0)
|
||||
FROM (
|
||||
SELECT DISTINCT ON (checksum) size
|
||||
FROM files
|
||||
WHERE checksum IS NOT NULL
|
||||
) AS unique_files
|
||||
""")
|
||||
stats['unique_size'] = cursor.fetchone()[0]
|
||||
|
||||
# Wasted space
|
||||
stats['wasted_space'] = stats['total_size'] - stats['unique_size']
|
||||
|
||||
# Deduplication ratio
|
||||
if stats['total_size'] > 0:
|
||||
stats['dedup_ratio'] = stats['unique_size'] / stats['total_size']
|
||||
else:
|
||||
stats['dedup_ratio'] = 1.0
|
||||
|
||||
# Space saved percentage
|
||||
if stats['total_size'] > 0:
|
||||
stats['space_saved_percent'] = (stats['wasted_space'] / stats['total_size']) * 100
|
||||
else:
|
||||
stats['space_saved_percent'] = 0.0
|
||||
|
||||
cursor.close()
|
||||
|
||||
return stats
|
||||
|
||||
def mark_canonical_files(self) -> int:
|
||||
"""Mark canonical (first occurrence) files in database
|
||||
|
||||
Returns:
|
||||
Number of canonical files marked
|
||||
"""
|
||||
self.logger.subsection("Marking Canonical Files")
|
||||
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Find first occurrence of each checksum and mark as canonical
|
||||
cursor.execute("""
|
||||
WITH canonical AS (
|
||||
SELECT DISTINCT ON (checksum) path, checksum
|
||||
FROM files
|
||||
WHERE checksum IS NOT NULL
|
||||
ORDER BY checksum, path
|
||||
)
|
||||
UPDATE files
|
||||
SET duplicate_of = NULL
|
||||
WHERE path IN (SELECT path FROM canonical)
|
||||
""")
|
||||
|
||||
count = cursor.rowcount
|
||||
conn.commit()
|
||||
cursor.close()
|
||||
|
||||
self.logger.info(f"Marked {count} canonical files")
|
||||
|
||||
return count
|
||||
|
||||
def close(self):
|
||||
"""Close connections"""
|
||||
self.hash_store.close()
|
||||
if self._connection and not self._connection.closed:
|
||||
self._connection.close()
|
||||
|
||||
def __enter__(self):
|
||||
"""Context manager entry"""
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
"""Context manager exit"""
|
||||
self.close()
|
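A hedged sketch of driving the engine end to end. The DatabaseConfig, ProcessingConfig, and ProgressLogger constructors below are assumed (those classes live in app/shared and are not shown here), and the connection values and disk label are placeholders.

```python
from app.deduplication.engine import DeduplicationEngine
from app.shared.config import DatabaseConfig, ProcessingConfig
from app.shared.logger import ProgressLogger

# Assumed keyword constructors; adjust to the real definitions in app/shared.
db_config = DatabaseConfig(host="localhost", port=5432, database="disk_reorganizer_db",
                           user="disk_reorg_user", password="change-me")
processing_config = ProcessingConfig(parallel_workers=4, commit_interval=500,
                                     chunk_size=8192, hash_algorithm="sha256")
logger = ProgressLogger()

with DeduplicationEngine(db_config, processing_config, logger) as engine:
    stats = engine.deduplicate_all(disk="SMT", use_chunks=True)  # "SMT" as referenced elsewhere in the repo
    print(stats.files_succeeded, "files hashed,", stats.bytes_processed, "bytes")
    print(engine.get_deduplication_stats())
```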
||||
412
app/deduplication/store.py
Normal file
@@ -0,0 +1,412 @@
|
||||
"""Hash store for deduplication with optional Redis support"""
|
||||
from typing import Optional, Dict, Set
|
||||
from pathlib import Path
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
|
||||
from ..shared.config import DatabaseConfig
|
||||
|
||||
|
||||
class HashStore:
|
||||
"""PostgreSQL-based hash store for deduplication"""
|
||||
|
||||
def __init__(self, db_config: DatabaseConfig):
|
||||
"""Initialize hash store
|
||||
|
||||
Args:
|
||||
db_config: Database configuration
|
||||
"""
|
||||
self.db_config = db_config
|
||||
self._connection = None
|
||||
|
||||
def _get_connection(self):
|
||||
"""Get or create database connection"""
|
||||
if self._connection is None or self._connection.closed:
|
||||
self._connection = psycopg2.connect(
|
||||
host=self.db_config.host,
|
||||
port=self.db_config.port,
|
||||
database=self.db_config.database,
|
||||
user=self.db_config.user,
|
||||
password=self.db_config.password
|
||||
)
|
||||
return self._connection
|
||||
|
||||
def _ensure_tables(self):
|
||||
"""Ensure hash store tables exist"""
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Create hashes table for file-level deduplication
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS file_hashes (
|
||||
checksum TEXT PRIMARY KEY,
|
||||
canonical_path TEXT NOT NULL,
|
||||
size BIGINT NOT NULL,
|
||||
first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
ref_count INTEGER DEFAULT 1
|
||||
)
|
||||
""")
|
||||
|
||||
# Create chunk hashes table for chunk-level deduplication
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS chunk_hashes (
|
||||
chunk_hash TEXT PRIMARY KEY,
|
||||
size INTEGER NOT NULL,
|
||||
first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
ref_count INTEGER DEFAULT 1
|
||||
)
|
||||
""")
|
||||
|
||||
# Create file-chunk mapping table
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS file_chunks (
|
||||
id SERIAL PRIMARY KEY,
|
||||
file_checksum TEXT NOT NULL,
|
||||
chunk_hash TEXT NOT NULL,
|
||||
chunk_index INTEGER NOT NULL,
|
||||
FOREIGN KEY (file_checksum) REFERENCES file_hashes(checksum),
|
||||
FOREIGN KEY (chunk_hash) REFERENCES chunk_hashes(chunk_hash),
|
||||
UNIQUE (file_checksum, chunk_index)
|
||||
)
|
||||
""")
|
||||
|
||||
# Create indexes
|
||||
cursor.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_file_chunks_file
|
||||
ON file_chunks(file_checksum)
|
||||
""")
|
||||
|
||||
cursor.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_file_chunks_chunk
|
||||
ON file_chunks(chunk_hash)
|
||||
""")
|
||||
|
||||
conn.commit()
|
||||
cursor.close()
|
||||
|
||||
def exists(self, checksum: str) -> bool:
|
||||
"""Check if hash exists in store
|
||||
|
||||
Args:
|
||||
checksum: File hash to check
|
||||
|
||||
Returns:
|
||||
True if hash exists
|
||||
"""
|
||||
self._ensure_tables()
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute(
|
||||
"SELECT 1 FROM file_hashes WHERE checksum = %s LIMIT 1",
|
||||
(checksum,)
|
||||
)
|
||||
|
||||
exists = cursor.fetchone() is not None
|
||||
cursor.close()
|
||||
|
||||
return exists
|
||||
|
||||
def get_canonical(self, checksum: str) -> Optional[str]:
|
||||
"""Get canonical path for a hash
|
||||
|
||||
Args:
|
||||
checksum: File hash
|
||||
|
||||
Returns:
|
||||
Canonical file path or None if not found
|
||||
"""
|
||||
self._ensure_tables()
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute(
|
||||
"SELECT canonical_path FROM file_hashes WHERE checksum = %s",
|
||||
(checksum,)
|
||||
)
|
||||
|
||||
result = cursor.fetchone()
|
||||
cursor.close()
|
||||
|
||||
return result[0] if result else None
|
||||
|
||||
def store_canonical(
|
||||
self,
|
||||
checksum: str,
|
||||
path: Path,
|
||||
size: int,
|
||||
chunk_hashes: Optional[list[str]] = None
|
||||
) -> None:
|
||||
"""Store canonical reference for a hash
|
||||
|
||||
Args:
|
||||
checksum: File hash
|
||||
path: Canonical file path
|
||||
size: File size in bytes
|
||||
chunk_hashes: Optional list of chunk hashes
|
||||
"""
|
||||
self._ensure_tables()
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
try:
|
||||
# Store file hash
|
||||
cursor.execute("""
|
||||
INSERT INTO file_hashes (checksum, canonical_path, size)
|
||||
VALUES (%s, %s, %s)
|
||||
ON CONFLICT (checksum) DO UPDATE SET
|
||||
ref_count = file_hashes.ref_count + 1
|
||||
""", (checksum, str(path), size))
|
||||
|
||||
# Store chunk hashes if provided
|
||||
if chunk_hashes:
|
||||
# Insert chunk hashes
|
||||
chunk_data = [(chunk_hash, 0) for chunk_hash in chunk_hashes]
|
||||
execute_batch(cursor, """
|
||||
INSERT INTO chunk_hashes (chunk_hash, size)
|
||||
VALUES (%s, %s)
|
||||
ON CONFLICT (chunk_hash) DO UPDATE SET
|
||||
ref_count = chunk_hashes.ref_count + 1
|
||||
""", chunk_data, page_size=1000)
|
||||
|
||||
# Create file-chunk mappings
|
||||
mapping_data = [
|
||||
(checksum, chunk_hash, idx)
|
||||
for idx, chunk_hash in enumerate(chunk_hashes)
|
||||
]
|
||||
execute_batch(cursor, """
|
||||
INSERT INTO file_chunks (file_checksum, chunk_hash, chunk_index)
|
||||
VALUES (%s, %s, %s)
|
||||
ON CONFLICT (file_checksum, chunk_index) DO NOTHING
|
||||
""", mapping_data, page_size=1000)
|
||||
|
||||
conn.commit()
|
||||
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
raise
|
||||
|
||||
finally:
|
||||
cursor.close()
|
||||
|
||||
def get_chunk_hashes(self, checksum: str) -> list[str]:
|
||||
"""Get chunk hashes for a file
|
||||
|
||||
Args:
|
||||
checksum: File hash
|
||||
|
||||
Returns:
|
||||
List of chunk hashes in order
|
||||
"""
|
||||
self._ensure_tables()
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute("""
|
||||
SELECT chunk_hash
|
||||
FROM file_chunks
|
||||
WHERE file_checksum = %s
|
||||
ORDER BY chunk_index
|
||||
""", (checksum,))
|
||||
|
||||
chunk_hashes = [row[0] for row in cursor.fetchall()]
|
||||
cursor.close()
|
||||
|
||||
return chunk_hashes
|
||||
|
||||
def get_duplicates(self) -> Dict[str, list[str]]:
|
||||
"""Get all duplicate file groups
|
||||
|
||||
Returns:
|
||||
Dictionary mapping canonical path to list of duplicate paths
|
||||
"""
|
||||
self._ensure_tables()
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Get all files with their hashes
|
||||
cursor.execute("""
|
||||
SELECT f.path, f.checksum
|
||||
FROM files f
|
||||
WHERE f.checksum IS NOT NULL
|
||||
""")
|
||||
|
||||
# Group by checksum
|
||||
hash_to_paths: Dict[str, list[str]] = {}
|
||||
for path, checksum in cursor.fetchall():
|
||||
if checksum not in hash_to_paths:
|
||||
hash_to_paths[checksum] = []
|
||||
hash_to_paths[checksum].append(path)
|
||||
|
||||
cursor.close()
|
||||
|
||||
# Filter to only duplicates (more than one file)
|
||||
duplicates = {
|
||||
paths[0]: paths[1:]
|
||||
for checksum, paths in hash_to_paths.items()
|
||||
if len(paths) > 1
|
||||
}
|
||||
|
||||
return duplicates
|
||||
|
||||
def get_stats(self) -> Dict[str, int]:
|
||||
"""Get hash store statistics
|
||||
|
||||
Returns:
|
||||
Dictionary with statistics
|
||||
"""
|
||||
self._ensure_tables()
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
stats = {}
|
||||
|
||||
# Count unique file hashes
|
||||
cursor.execute("SELECT COUNT(*) FROM file_hashes")
|
||||
stats['unique_files'] = cursor.fetchone()[0]
|
||||
|
||||
# Count unique chunk hashes
|
||||
cursor.execute("SELECT COUNT(*) FROM chunk_hashes")
|
||||
stats['unique_chunks'] = cursor.fetchone()[0]
|
||||
|
||||
# Count total references
|
||||
cursor.execute("SELECT COALESCE(SUM(ref_count), 0) FROM file_hashes")
|
||||
stats['total_file_refs'] = cursor.fetchone()[0]
|
||||
|
||||
# Count total chunk references
|
||||
cursor.execute("SELECT COALESCE(SUM(ref_count), 0) FROM chunk_hashes")
|
||||
stats['total_chunk_refs'] = cursor.fetchone()[0]
|
||||
|
||||
# Calculate deduplication ratio
|
||||
if stats['total_file_refs'] > 0:
|
||||
stats['dedup_ratio'] = stats['unique_files'] / stats['total_file_refs']
|
||||
else:
|
||||
stats['dedup_ratio'] = 1.0
|
||||
|
||||
cursor.close()
|
||||
|
||||
return stats
|
||||
|
||||
def find_similar_files(self, checksum: str, threshold: float = 0.8) -> list[tuple[str, float]]:
|
||||
"""Find files similar to given hash based on chunk overlap
|
||||
|
||||
Args:
|
||||
checksum: File hash to compare
|
||||
threshold: Similarity threshold (0.0 to 1.0)
|
||||
|
||||
Returns:
|
||||
List of tuples (other_checksum, similarity_score)
|
||||
"""
|
||||
self._ensure_tables()
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Get chunks for the target file
|
||||
target_chunks = set(self.get_chunk_hashes(checksum))
|
||||
|
||||
if not target_chunks:
|
||||
cursor.close()
|
||||
return []
|
||||
|
||||
# Find files sharing chunks
|
||||
cursor.execute("""
|
||||
SELECT DISTINCT fc.file_checksum
|
||||
FROM file_chunks fc
|
||||
WHERE fc.chunk_hash = ANY(%s)
|
||||
AND fc.file_checksum != %s
|
||||
""", (list(target_chunks), checksum))
|
||||
|
||||
similar_files = []
|
||||
|
||||
for (other_checksum,) in cursor.fetchall():
|
||||
other_chunks = set(self.get_chunk_hashes(other_checksum))
|
||||
|
||||
# Calculate Jaccard similarity
|
||||
intersection = len(target_chunks & other_chunks)
|
||||
union = len(target_chunks | other_chunks)
|
||||
|
||||
if union > 0:
|
||||
similarity = intersection / union
|
||||
|
||||
if similarity >= threshold:
|
||||
similar_files.append((other_checksum, similarity))
|
||||
|
||||
cursor.close()
|
||||
|
||||
# Sort by similarity descending
|
||||
similar_files.sort(key=lambda x: x[1], reverse=True)
|
||||
|
||||
return similar_files
|
||||
|
||||
def close(self):
|
||||
"""Close database connection"""
|
||||
if self._connection and not self._connection.closed:
|
||||
self._connection.close()
|
||||
|
||||
def __enter__(self):
|
||||
"""Context manager entry"""
|
||||
self._ensure_tables()
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
"""Context manager exit"""
|
||||
self.close()
|
||||
|
||||
|
||||
class MemoryHashStore:
|
||||
"""In-memory hash store for testing and small datasets"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize in-memory hash store"""
|
||||
self.hashes: Dict[str, tuple[str, int]] = {}
|
||||
self.chunks: Dict[str, int] = {}
|
||||
self.file_chunks: Dict[str, list[str]] = {}
|
||||
|
||||
def exists(self, checksum: str) -> bool:
|
||||
"""Check if hash exists"""
|
||||
return checksum in self.hashes
|
||||
|
||||
def get_canonical(self, checksum: str) -> Optional[str]:
|
||||
"""Get canonical path"""
|
||||
return self.hashes.get(checksum, (None, 0))[0]
|
||||
|
||||
def store_canonical(
|
||||
self,
|
||||
checksum: str,
|
||||
path: Path,
|
||||
size: int,
|
||||
chunk_hashes: Optional[list[str]] = None
|
||||
) -> None:
|
||||
"""Store canonical reference"""
|
||||
self.hashes[checksum] = (str(path), size)
|
||||
|
||||
if chunk_hashes:
|
||||
self.file_chunks[checksum] = chunk_hashes
|
||||
for chunk_hash in chunk_hashes:
|
||||
self.chunks[chunk_hash] = self.chunks.get(chunk_hash, 0) + 1
|
||||
|
||||
def get_chunk_hashes(self, checksum: str) -> list[str]:
|
||||
"""Get chunk hashes"""
|
||||
return self.file_chunks.get(checksum, [])
|
||||
|
||||
def get_stats(self) -> Dict[str, int]:
|
||||
"""Get statistics"""
|
||||
return {
|
||||
'unique_files': len(self.hashes),
|
||||
'unique_chunks': len(self.chunks),
|
||||
'total_file_refs': len(self.hashes),
|
||||
'total_chunk_refs': sum(self.chunks.values()),
|
||||
'dedup_ratio': 1.0
|
||||
}
|
||||
|
||||
def close(self):
|
||||
"""No-op for compatibility"""
|
||||
pass
|
||||
|
||||
def __enter__(self):
|
||||
"""Context manager entry"""
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
"""Context manager exit"""
|
||||
pass
|
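The in-memory variant makes the store API easy to exercise without PostgreSQL; the hash and path values below are placeholders.

```python
from pathlib import Path

from app.deduplication.store import MemoryHashStore

store = MemoryHashStore()
checksum = "deadbeef" * 8               # stand-in for a SHA-256 hex digest
chunk_hashes = ["c1", "c2", "c3"]       # stand-in chunk hashes

if not store.exists(checksum):
    store.store_canonical(checksum, Path("/data/example.bin"), size=1024, chunk_hashes=chunk_hashes)

print(store.get_canonical(checksum))      # /data/example.bin
print(store.get_chunk_hashes(checksum))   # ['c1', 'c2', 'c3']
print(store.get_stats())
```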
||||
5
app/discovery/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from .scanner import FileScanner, FilteredScanner
|
||||
from .system import SystemAPI
|
||||
from .engine import DiscoveryEngine
|
||||
from ._protocols import FileMeta, MountInfo, DiskInfo, IFileScanner, ISystemAPI
|
||||
__all__ = ['FileScanner', 'FilteredScanner', 'SystemAPI', 'DiscoveryEngine', 'FileMeta', 'MountInfo', 'DiskInfo', 'IFileScanner', 'ISystemAPI']
|
||||
37
app/discovery/_protocols.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from typing import Iterator, Protocol, Any
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
@dataclass
|
||||
class FileMeta:
|
||||
path: Path
|
||||
size: int
|
||||
modified_time: float
|
||||
created_time: float
|
||||
|
||||
@dataclass
|
||||
class MountInfo:
|
||||
device: str
|
||||
mount_point: str
|
||||
fs_type: str
|
||||
options: str
|
||||
|
||||
@dataclass
|
||||
class DiskInfo:
|
||||
device: str
|
||||
model: str
|
||||
size: int
|
||||
serial: str
|
||||
|
||||
class IFileScanner(Protocol):
|
||||
|
||||
def scan(self, root: Path) -> Iterator[FileMeta]:
|
||||
...
|
||||
|
||||
class ISystemAPI(Protocol):
|
||||
|
||||
def query_mounts(self) -> list[MountInfo]:
|
||||
...
|
||||
|
||||
def query_nvmes(self) -> list[DiskInfo]:
|
||||
...
|
||||
133
app/discovery/engine.py
Normal file
@@ -0,0 +1,133 @@
|
||||
from pathlib import Path
|
||||
from typing import Optional, Callable
|
||||
from datetime import datetime
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
from .scanner import FileScanner
|
||||
from .system import SystemAPI
|
||||
from ._protocols import FileMeta
|
||||
from ..shared.models import FileRecord, DiskInfo, ProcessingStats
|
||||
from ..shared.config import DatabaseConfig
|
||||
from ..shared.logger import ProgressLogger
|
||||
|
||||
class DiscoveryEngine:
|
||||
|
||||
def __init__(self, db_config: DatabaseConfig, logger: ProgressLogger, batch_size: int=1000):
|
||||
self.db_config = db_config
|
||||
self.logger = logger
|
||||
self.batch_size = batch_size
|
||||
self.system_api = SystemAPI()
|
||||
self._connection = None
|
||||
|
||||
def _get_connection(self):
|
||||
if self._connection is None or self._connection.closed:
|
||||
self._connection = psycopg2.connect(host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password)
|
||||
return self._connection
|
||||
|
||||
def _ensure_tables(self):
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("\n CREATE TABLE IF NOT EXISTS files (\n id SERIAL PRIMARY KEY,\n path TEXT NOT NULL UNIQUE,\n size BIGINT NOT NULL,\n modified_time DOUBLE PRECISION NOT NULL,\n created_time DOUBLE PRECISION NOT NULL,\n disk_label TEXT NOT NULL,\n checksum TEXT,\n status TEXT DEFAULT 'indexed',\n category TEXT,\n duplicate_of TEXT,\n discovered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n )\n ")
|
||||
cursor.execute('\n CREATE INDEX IF NOT EXISTS idx_files_path ON files(path)\n ')
|
||||
cursor.execute('\n CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label)\n ')
|
||||
cursor.execute('\n CREATE INDEX IF NOT EXISTS idx_files_checksum ON files(checksum)\n ')
|
||||
conn.commit()
|
||||
cursor.close()
|
||||
|
||||
def discover_path(self, root: Path, scanner: Optional[FileScanner]=None, progress_callback: Optional[Callable[[int, int, ProcessingStats], None]]=None) -> ProcessingStats:
|
||||
self.logger.section(f'Discovering: {root}')
|
||||
self._ensure_tables()
|
||||
if scanner is None:
|
||||
scanner = FileScanner(error_handler=lambda e, p: self.logger.warning(f'Error scanning {p}: {e}'))
|
||||
disk = self.system_api.get_disk_for_path(root)
|
||||
if disk is None:
|
||||
disk = str(root)
|
||||
stats = ProcessingStats()
|
||||
batch = []
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
try:
|
||||
for file_meta in scanner.scan(root):
|
||||
record = FileRecord(path=file_meta.path, size=file_meta.size, modified_time=file_meta.modified_time, created_time=file_meta.created_time, disk_label=disk)
|
||||
batch.append(record)
|
||||
stats.files_processed += 1
|
||||
stats.bytes_processed += record.size
|
||||
if len(batch) >= self.batch_size:
|
||||
self._insert_batch(cursor, batch)
|
||||
conn.commit()
|
||||
batch.clear()
|
||||
if progress_callback:
|
||||
progress_callback(stats.files_processed, 0, stats)
|
||||
if stats.files_processed % (self.batch_size * 10) == 0:
|
||||
self.logger.progress(stats.files_processed, stats.files_processed, prefix='Files discovered', bytes_processed=stats.bytes_processed, elapsed_seconds=stats.elapsed_seconds)
|
||||
if batch:
|
||||
self._insert_batch(cursor, batch)
|
||||
conn.commit()
|
||||
stats.files_succeeded = stats.files_processed
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
self.logger.error(f'Discovery failed: {e}')
|
||||
raise
|
||||
finally:
|
||||
cursor.close()
|
||||
self.logger.info(f'Discovery complete: {stats.files_processed} files, {stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s')
|
||||
return stats
|
||||
|
||||
def _insert_batch(self, cursor, batch: list[FileRecord]):
|
||||
query = '\n INSERT INTO files (path, size, modified_time, created_time, disk_label, checksum, status, category, duplicate_of)\n VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)\n ON CONFLICT (path) DO UPDATE SET\n size = EXCLUDED.size,\n modified_time = EXCLUDED.modified_time,\n updated_at = CURRENT_TIMESTAMP\n '
|
||||
data = [(str(record.path), record.size, record.modified_time, record.created_time, record.disk_label, record.checksum, record.status, record.category, record.duplicate_of) for record in batch]
|
||||
execute_batch(cursor, query, data, page_size=self.batch_size)
|
||||
|
||||
def get_disk_info(self) -> list[DiskInfo]:
|
||||
self.logger.subsection('Querying disk information')
|
||||
disks = []
|
||||
for disk_info in self.system_api.query_nvmes():
|
||||
mount_point = None
|
||||
fs_type = 'unknown'
|
||||
for mount in self.system_api.query_mounts():
|
||||
if mount.device == disk_info.device:
|
||||
mount_point = Path(mount.mount_point)
|
||||
fs_type = mount.fs_type
|
||||
break
|
||||
if mount_point:
|
||||
total, used, free = self.system_api.get_disk_usage(mount_point)
|
||||
else:
|
||||
total = disk_info.size
|
||||
used = 0
|
||||
free = disk_info.size
|
||||
disk = DiskInfo(name=disk_info.device, device=disk_info.device, mount_point=mount_point or Path('/'), total_size=total, used_size=used, free_size=free, fs_type=fs_type)
|
||||
disks.append(disk)
|
||||
self.logger.info(f' {disk.name}: {disk.usage_percent:.1f}% used ({disk.used_size:,} / {disk.total_size:,} bytes)')
|
||||
return disks
|
||||
|
||||
def get_file_count(self, disk: Optional[str]=None) -> int:
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
if disk:
|
||||
cursor.execute('SELECT COUNT(*) FROM files WHERE disk_label = %s', (disk,))
|
||||
else:
|
||||
cursor.execute('SELECT COUNT(*) FROM files')
|
||||
count = cursor.fetchone()[0]
|
||||
cursor.close()
|
||||
return count
|
||||
|
||||
def get_total_size(self, disk: Optional[str]=None) -> int:
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
if disk:
|
||||
cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files WHERE disk_label = %s', (disk,))
|
||||
else:
|
||||
cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files')
|
||||
total = cursor.fetchone()[0]
|
||||
cursor.close()
|
||||
return total
|
||||
|
||||
def close(self):
|
||||
if self._connection and (not self._connection.closed):
|
||||
self._connection.close()
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.close()
|
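A sketch of indexing one mount into the files table. DatabaseConfig and ProgressLogger construction is assumed, as in the deduplication sketch above; the mount path mirrors the ones referenced later in main.py.

```python
from pathlib import Path

from app.discovery import DiscoveryEngine, FilteredScanner
from app.shared.config import DatabaseConfig
from app.shared.logger import ProgressLogger

db_config = DatabaseConfig(host="localhost", port=5432, database="disk_reorganizer_db",
                           user="disk_reorg_user", password="change-me")  # assumed constructor

with DiscoveryEngine(db_config, ProgressLogger()) as engine:  # ProgressLogger() call assumed
    scanner = FilteredScanner(exclude_patterns=["node_modules", "__pycache__"])
    stats = engine.discover_path(Path("/media/mike/SMT"), scanner=scanner)
    print(stats.files_processed, "files,", stats.bytes_processed, "bytes indexed")
    print(engine.get_file_count(), engine.get_total_size())
```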
||||
112
app/discovery/scanner.py
Normal file
@@ -0,0 +1,112 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Iterator, Optional, Callable
|
||||
from datetime import datetime
|
||||
from ._protocols import FileMeta
|
||||
|
||||
class FileScanner:
|
||||
|
||||
def __init__(self, follow_symlinks: bool=False, skip_hidden: bool=True, error_handler: Optional[Callable[[Exception, Path], None]]=None):
|
||||
self.follow_symlinks = follow_symlinks
|
||||
self.skip_hidden = skip_hidden
|
||||
self.error_handler = error_handler
|
||||
self._files_scanned = 0
|
||||
self._bytes_scanned = 0
|
||||
self._errors = 0
|
||||
|
||||
def scan(self, root: Path) -> Iterator[FileMeta]:
|
||||
if not root.exists():
|
||||
error = FileNotFoundError(f'Path does not exist: {root}')
|
||||
if self.error_handler:
|
||||
self.error_handler(error, root)
|
||||
else:
|
||||
raise error
|
||||
return
|
||||
if not root.is_dir():
|
||||
try:
|
||||
yield self._get_file_meta(root)
|
||||
except Exception as e:
|
||||
self._errors += 1
|
||||
if self.error_handler:
|
||||
self.error_handler(e, root)
|
||||
else:
|
||||
raise
|
||||
return
|
||||
for dirpath, dirnames, filenames in os.walk(root, followlinks=self.follow_symlinks):
|
||||
current_dir = Path(dirpath)
|
||||
if self.skip_hidden:
|
||||
dirnames[:] = [d for d in dirnames if not d.startswith('.')]
|
||||
for filename in filenames:
|
||||
if self.skip_hidden and filename.startswith('.'):
|
||||
continue
|
||||
file_path = current_dir / filename
|
||||
try:
|
||||
if file_path.is_symlink() and (not file_path.exists()):
|
||||
continue
|
||||
meta = self._get_file_meta(file_path)
|
||||
self._files_scanned += 1
|
||||
self._bytes_scanned += meta.size
|
||||
yield meta
|
||||
except PermissionError as e:
|
||||
self._errors += 1
|
||||
if self.error_handler:
|
||||
self.error_handler(e, file_path)
|
||||
continue
|
||||
except Exception as e:
|
||||
self._errors += 1
|
||||
if self.error_handler:
|
||||
self.error_handler(e, file_path)
|
||||
continue
|
||||
|
||||
def _get_file_meta(self, path: Path) -> FileMeta:
|
||||
stat = path.stat()
|
||||
created_time = stat.st_ctime
|
||||
if hasattr(stat, 'st_birthtime'):
|
||||
created_time = stat.st_birthtime
|
||||
return FileMeta(path=path, size=stat.st_size, modified_time=stat.st_mtime, created_time=created_time)
|
||||
|
||||
@property
|
||||
def files_scanned(self) -> int:
|
||||
return self._files_scanned
|
||||
|
||||
@property
|
||||
def bytes_scanned(self) -> int:
|
||||
return self._bytes_scanned
|
||||
|
||||
@property
|
||||
def errors(self) -> int:
|
||||
return self._errors
|
||||
|
||||
def reset_stats(self) -> None:
|
||||
self._files_scanned = 0
|
||||
self._bytes_scanned = 0
|
||||
self._errors = 0
|
||||
|
||||
class FilteredScanner(FileScanner):
|
||||
|
||||
def __init__(self, min_size: Optional[int]=None, max_size: Optional[int]=None, extensions: Optional[list[str]]=None, exclude_patterns: Optional[list[str]]=None, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.min_size = min_size
|
||||
self.max_size = max_size
|
||||
self.extensions = {ext.lower() for ext in extensions} if extensions else None
|
||||
self.exclude_patterns = exclude_patterns or []
|
||||
|
||||
def scan(self, root: Path) -> Iterator[FileMeta]:
|
||||
for meta in super().scan(root):
|
||||
if self.min_size is not None and meta.size < self.min_size:
|
||||
continue
|
||||
if self.max_size is not None and meta.size > self.max_size:
|
||||
continue
|
||||
if self.extensions is not None:
|
||||
if meta.path.suffix.lower() not in self.extensions:
|
||||
continue
|
||||
if self._should_exclude(meta.path):
|
||||
continue
|
||||
yield meta
|
||||
|
||||
def _should_exclude(self, path: Path) -> bool:
|
||||
path_str = str(path)
|
||||
for pattern in self.exclude_patterns:
|
||||
if pattern in path_str:
|
||||
return True
|
||||
return False
|
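A standalone sketch of the scanner filters; the scanned path is a placeholder.

```python
from pathlib import Path

from app.discovery.scanner import FilteredScanner

scanner = FilteredScanner(
    min_size=1024 * 1024,                       # only files >= 1 MiB
    extensions=[".mkv", ".mp4"],                # compared against Path.suffix, lower-cased
    exclude_patterns=["node_modules", ".git"],  # simple substring exclusion
    skip_hidden=True,
)
for meta in scanner.scan(Path("/media/mike/DISK1")):
    print(meta.size, meta.path)
print(scanner.files_scanned, "files,", scanner.bytes_scanned, "bytes,", scanner.errors, "errors")
```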
||||
119
app/discovery/system.py
Normal file
@@ -0,0 +1,119 @@
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
import psutil
|
||||
from ._protocols import MountInfo, DiskInfo
|
||||
|
||||
class SystemAPI:
|
||||
|
||||
def query_mounts(self) -> list[MountInfo]:
|
||||
mounts = []
|
||||
for partition in psutil.disk_partitions(all=False):
|
||||
mount_info = MountInfo(device=partition.device, mount_point=partition.mountpoint, fs_type=partition.fstype, options=partition.opts)
|
||||
mounts.append(mount_info)
|
||||
return mounts
|
||||
|
||||
def query_nvmes(self) -> list[DiskInfo]:
|
||||
disks = []
|
||||
try:
|
||||
result = subprocess.run(['lsblk', '-ndo', 'NAME,MODEL,SIZE,SERIAL', '-b'], capture_output=True, text=True, check=False)
|
||||
if result.returncode == 0:
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if not line.strip():
|
||||
continue
|
||||
parts = line.split(maxsplit=3)
|
||||
if len(parts) >= 3:
|
||||
device = f'/dev/{parts[0]}'
|
||||
model = parts[1] if len(parts) > 1 else 'Unknown'
|
||||
size_str = parts[2] if len(parts) > 2 else '0'
|
||||
serial = parts[3] if len(parts) > 3 else 'Unknown'
|
||||
try:
|
||||
size = int(size_str)
|
||||
except ValueError:
|
||||
size = 0
|
||||
disk_info = DiskInfo(device=device, model=model, size=size, serial=serial)
|
||||
disks.append(disk_info)
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
if not disks:
|
||||
disks = self._query_disks_fallback()
|
||||
return disks
|
||||
|
||||
def _query_disks_fallback(self) -> list[DiskInfo]:
|
||||
disks = []
|
||||
seen_devices = set()
|
||||
for partition in psutil.disk_partitions(all=True):
|
||||
device = partition.device
|
||||
if not device.startswith('/dev/'):
|
||||
continue
|
||||
base_device = self._get_base_device(device)
|
||||
if base_device in seen_devices:
|
||||
continue
|
||||
seen_devices.add(base_device)
|
||||
try:
|
||||
usage = psutil.disk_usage(partition.mountpoint)
|
||||
size = usage.total
|
||||
except (PermissionError, OSError):
|
||||
size = 0
|
||||
disk_info = DiskInfo(device=base_device, model='Unknown', size=size, serial='Unknown')
|
||||
disks.append(disk_info)
|
||||
return disks
|
||||
|
||||
def _get_base_device(self, device: str) -> str:
|
||||
if 'nvme' in device:
|
||||
if 'p' in device:
|
||||
return device.rsplit('p', 1)[0]
|
||||
return device
|
||||
import re
|
||||
match = re.match('(/dev/[a-z]+)', device)
|
||||
if match:
|
||||
return match.group(1)
|
||||
return device
|
||||
|
||||
def get_disk_for_path(self, path: Path) -> Optional[str]:
|
||||
path = path.resolve()
|
||||
best_match = None
|
||||
best_match_len = 0
|
||||
for partition in psutil.disk_partitions():
|
||||
mount_point = Path(partition.mountpoint)
|
||||
try:
|
||||
if path == mount_point or mount_point in path.parents:
|
||||
mount_len = len(str(mount_point))
|
||||
if mount_len > best_match_len:
|
||||
best_match = partition.device
|
||||
best_match_len = mount_len
|
||||
except (ValueError, OSError):
|
||||
continue
|
||||
return best_match
|
||||
|
||||
def get_disk_usage(self, path: Path) -> tuple[int, int, int]:
|
||||
try:
|
||||
usage = psutil.disk_usage(str(path))
|
||||
return (usage.total, usage.used, usage.free)
|
||||
except (PermissionError, OSError):
|
||||
return (0, 0, 0)
|
||||
|
||||
def get_mount_point(self, path: Path) -> Optional[Path]:
|
||||
path = path.resolve()
|
||||
best_match = None
|
||||
best_match_len = 0
|
||||
for partition in psutil.disk_partitions():
|
||||
mount_point = Path(partition.mountpoint)
|
||||
try:
|
||||
if path == mount_point or mount_point in path.parents:
|
||||
mount_len = len(str(mount_point))
|
||||
if mount_len > best_match_len:
|
||||
best_match = mount_point
|
||||
best_match_len = mount_len
|
||||
except (ValueError, OSError):
|
||||
continue
|
||||
return best_match
|
||||
|
||||
def is_same_filesystem(self, path1: Path, path2: Path) -> bool:
|
||||
try:
|
||||
stat1 = path1.stat()
|
||||
stat2 = path2.stat()
|
||||
return stat1.st_dev == stat2.st_dev
|
||||
except (OSError, PermissionError):
|
||||
return False
|
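SystemAPI can be exercised directly; on machines without lsblk it falls back to psutil, as the code above shows.

```python
from pathlib import Path

from app.discovery.system import SystemAPI

api = SystemAPI()

for mount in api.query_mounts():
    print(mount.device, "->", mount.mount_point, f"({mount.fs_type})")

for disk in api.query_nvmes():
    print(disk.device, disk.model, disk.size, "bytes")

home = Path.home()
total, used, free = api.get_disk_usage(home)
print(api.get_disk_for_path(home), total, used, free)
```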
||||
59
app/enrichment/enricher.py
Normal file
@@ -0,0 +1,59 @@
|
||||
from typing import Dict
|
||||
import re
|
||||
|
||||
class ContentEnricher:
|
||||
def __init__(self, llm_client=None):
|
||||
self.llm_client = llm_client
|
||||
self.pii_patterns = {
|
||||
'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
|
||||
'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
|
||||
'ssn': r'\b\d{3}-\d{2}-\d{4}\b',
|
||||
'credit_card': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b'
|
||||
}
|
||||
|
||||
def enrich(self, text: str, use_llm: bool = False) -> Dict:
|
||||
enrichment = {
|
||||
'summary': self._basic_summary(text),
|
||||
'word_count': len(text.split()),
|
||||
'has_pii': self._detect_pii(text),
|
||||
'quality': self._assess_quality(text),
|
||||
'topics': self._extract_basic_topics(text)
|
||||
}
|
||||
|
||||
if use_llm and self.llm_client:
|
||||
llm_result = self.llm_client.classify_content(text)
|
||||
if llm_result.get('success'):
|
||||
enrichment['llm_classification'] = llm_result['text']
|
||||
|
||||
return enrichment
|
||||
|
||||
def _basic_summary(self, text: str) -> str:
|
||||
sentences = re.split(r'[.!?]+', text)
|
||||
return ' '.join(sentences[:3])[:200]
|
||||
|
||||
def _detect_pii(self, text: str) -> Dict:
|
||||
detected = {}
|
||||
for pii_type, pattern in self.pii_patterns.items():
|
||||
matches = re.findall(pattern, text)
|
||||
if matches:
|
||||
detected[pii_type] = len(matches)
|
||||
return detected
|
||||
|
||||
def _assess_quality(self, text: str) -> str:
|
||||
if len(text.strip()) < 10:
|
||||
return 'low'
|
||||
|
||||
special_char_ratio = sum(1 for c in text if not c.isalnum() and not c.isspace()) / len(text)
|
||||
if special_char_ratio > 0.3:
|
||||
return 'low'
|
||||
|
||||
return 'high' if len(text.split()) > 50 else 'medium'
|
||||
|
||||
def _extract_basic_topics(self, text: str) -> list:
|
||||
words = re.findall(r'\b[A-Z][a-z]+\b', text)
|
||||
word_freq = {}
|
||||
for word in words:
|
||||
if len(word) > 3:
|
||||
word_freq[word] = word_freq.get(word, 0) + 1
|
||||
|
||||
return sorted(word_freq, key=word_freq.get, reverse=True)[:10]
|
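A quick sketch of the heuristic enrichment path (no LLM client attached); the sample text is invented.

```python
from app.enrichment.enricher import ContentEnricher

enricher = ContentEnricher()  # llm_client=None -> regex/heuristic enrichment only
text = "Contact Alice Johnson at alice@example.com about the Amsterdam migration plan."
result = enricher.enrich(text)

print(result["has_pii"])      # e.g. {'email': 1}
print(result["quality"])      # 'low' / 'medium' / 'high'
print(result["topics"])       # most frequent capitalised words
```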
||||
54
app/enrichment/llm_client.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import requests
|
||||
import json
|
||||
from typing import Dict, Optional
|
||||
|
||||
class LLMClient:
|
||||
def __init__(self, endpoint: str = 'http://192.168.1.74:1234', model: str = 'local'):
|
||||
self.endpoint = endpoint
|
||||
self.model = model
|
||||
self.local_ollama = 'http://localhost:11434'
|
||||
|
||||
def summarize(self, text: str, max_length: int = 200) -> Dict:
|
||||
prompt = f"Summarize the following in {max_length} chars or less:\n\n{text[:2000]}"
|
||||
return self._query(prompt)
|
||||
|
||||
def extract_topics(self, text: str) -> Dict:
|
||||
prompt = f"Extract 5-10 key topics/tags from this text. Return as comma-separated list:\n\n{text[:2000]}"
|
||||
return self._query(prompt)
|
||||
|
||||
def classify_content(self, text: str) -> Dict:
|
||||
prompt = f"Classify this content. Return: category, topics, has_pii (yes/no), quality (high/medium/low):\n\n{text[:1000]}"
|
||||
return self._query(prompt)
|
||||
|
||||
def _query(self, prompt: str, use_local: bool = False) -> Dict:
|
||||
try:
|
||||
endpoint = self.local_ollama if use_local else self.endpoint
|
||||
|
||||
if use_local:
|
||||
response = requests.post(
|
||||
f'{endpoint}/api/generate',
|
||||
json={'model': 'llama3.2', 'prompt': prompt, 'stream': False},
|
||||
timeout=30
|
||||
)
|
||||
else:
|
||||
response = requests.post(
|
||||
f'{endpoint}/v1/chat/completions',
|
||||
json={
|
||||
'model': self.model,
|
||||
'messages': [{'role': 'user', 'content': prompt}],
|
||||
'max_tokens': 500
|
||||
},
|
||||
timeout=30
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
if use_local:
|
||||
return {'success': True, 'text': data.get('response', '')}
|
||||
else:
|
||||
return {'success': True, 'text': data['choices'][0]['message']['content']}
|
||||
else:
|
||||
return {'success': False, 'error': f'HTTP {response.status_code}'}
|
||||
|
||||
except Exception as e:
|
||||
return {'success': False, 'error': str(e)}
|
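A sketch of calling the client against a locally running OpenAI-compatible server; the default endpoint baked into the class (http://192.168.1.74:1234) is a LAN host, so the URL below is a placeholder to swap for your own.

```python
from app.enrichment.llm_client import LLMClient

client = LLMClient(endpoint="http://localhost:1234", model="local")  # placeholder endpoint
result = client.summarize("Quarterly backup report: 4.2 TB scanned, 1.1 TB duplicates ...")

if result.get("success"):
    print(result["text"])
else:
    print("LLM unavailable:", result.get("error"))
```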
||||
3
app/filters/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from .gitignore import GitignoreFilter, DEFAULT_PATTERNS
|
||||
|
||||
__all__ = ['GitignoreFilter', 'DEFAULT_PATTERNS']
|
||||
30
app/filters/gitignore.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from pathlib import Path
|
||||
from typing import Optional, Set
|
||||
import fnmatch
|
||||
|
||||
DEFAULT_PATTERNS = {
|
||||
'node_modules/**', '__pycache__/**', '.git/**', 'build/**', 'dist/**',
|
||||
'.cache/**', 'target/**', 'vendor/**', '.venv/**', 'venv/**',
|
||||
'*.pyc', '*.pyo', '*.so', '*.dll', '*.dylib', '*.o', '*.a',
|
||||
'.DS_Store', 'Thumbs.db', '.pytest_cache/**', '.tox/**',
|
||||
'*.egg-info/**', '.mypy_cache/**', '.coverage', 'htmlcov/**',
|
||||
'.gradle/**', 'bin/**', 'obj/**', '.vs/**', '.idea/**'
|
||||
}
|
||||
|
||||
class GitignoreFilter:
|
||||
def __init__(self, patterns: Optional[Set[str]] = None):
|
||||
self.patterns = patterns or DEFAULT_PATTERNS
|
||||
|
||||
def should_exclude(self, path: str) -> bool:
|
||||
path_obj = Path(path)
|
||||
for pattern in self.patterns:
|
||||
if '**' in pattern:
|
||||
clean_pattern = pattern.replace('/**', '').replace('**/', '')
|
||||
if clean_pattern in path_obj.parts:
|
||||
return True
|
||||
elif fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(path_obj.name, pattern):
|
||||
return True
|
||||
return False
|
||||
|
||||
def filter_files(self, files: list) -> list:
|
||||
return [f for f in files if not self.should_exclude(f)]
|
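The filter operates on plain path strings; the example list below is illustrative.

```python
from app.filters import GitignoreFilter

ignore = GitignoreFilter()  # uses DEFAULT_PATTERNS
paths = [
    "project/src/main.py",
    "project/node_modules/lodash/index.js",
    "project/build/output.o",
    "project/.DS_Store",
]
print(ignore.filter_files(paths))  # only project/src/main.py survives
```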
||||
918
app/main.py
Normal file
@@ -0,0 +1,918 @@
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
import psycopg2
|
||||
import shutil
|
||||
import hashlib
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional
|
||||
from datetime import datetime
|
||||
import logging
|
||||
import time
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.FileHandler('disk_reorganizer.log'), logging.StreamHandler(sys.stdout)])
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@dataclass
|
||||
class FileRecord:
|
||||
path: str
|
||||
size: int
|
||||
modified_time: float
|
||||
disk_label: str
|
||||
checksum: Optional[str] = None
|
||||
status: str = 'indexed'
|
||||
|
||||
class DiskReorganizer:
|
||||
|
||||
def __init__(self, db_config: Dict=None):
|
||||
if db_config is None:
|
||||
db_config = {'host': os.getenv('DB_HOST', '192.168.1.159'), 'port': int(os.getenv('DB_PORT', 5432)), 'database': os.getenv('DB_NAME', 'disk_reorganizer_db'), 'user': os.getenv('DB_USER', 'disk_reorg_user'), 'password': os.getenv('DB_PASSWORD', 'heel-goed-wachtwoord')}
|
||||
self.db_config = db_config
|
||||
self.init_database()
|
||||
|
||||
def get_connection(self):
|
||||
return psycopg2.connect(**self.db_config)
|
||||
|
||||
def init_database(self):
|
||||
try:
|
||||
conn = self.get_connection()
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("\n SELECT table_name FROM information_schema.tables\n WHERE table_schema = 'public' AND table_name IN ('files', 'operations')\n ")
|
||||
tables = cursor.fetchall()
|
||||
if len(tables) < 2:
|
||||
logger.error('Database tables not found! Please run setup_database.sh first.')
|
||||
raise Exception('Database not properly initialized. Run setup_database.sh')
|
||||
cursor.close()
|
||||
conn.close()
|
||||
logger.info('Database connection verified successfully')
|
||||
except psycopg2.Error as e:
|
||||
logger.error(f'Database connection failed: {e}')
|
||||
raise
|
||||
|
||||
def index_disk(self, disk_root: str, disk_name: str):
|
||||
logger.info(f'Indexing disk: {disk_name} at {disk_root}')
|
||||
disk_path = Path(disk_root)
|
||||
if not disk_path.exists():
|
||||
logger.error(f'Disk path {disk_root} does not exist!')
|
||||
return
|
||||
files_count = 0
|
||||
total_size = 0
|
||||
start_time = time.time()
|
||||
conn = self.get_connection()
|
||||
cursor = conn.cursor()
|
||||
try:
|
||||
for root, dirs, files in os.walk(disk_path):
|
||||
dirs[:] = [d for d in dirs if not d.startswith(('$', 'System Volume Information', 'Recovery'))]
|
||||
for file in files:
|
||||
try:
|
||||
file_path = Path(root) / file
|
||||
if not file_path.is_file():
|
||||
continue
|
||||
stat = file_path.stat()
|
||||
size = stat.st_size
|
||||
mtime = datetime.fromtimestamp(stat.st_mtime)
|
||||
rel_path = str(file_path.relative_to(disk_path))
|
||||
cursor.execute('\n INSERT INTO files (path, size, modified_time, disk_label, checksum, status)\n VALUES (%s, %s, %s, %s, %s, %s)\n ON CONFLICT (path) DO UPDATE SET\n size = EXCLUDED.size,\n modified_time = EXCLUDED.modified_time,\n disk_label = EXCLUDED.disk_label,\n status = EXCLUDED.status\n ', (rel_path, size, mtime, disk_name, None, 'indexed'))
|
||||
files_count += 1
|
||||
total_size += size
|
||||
if files_count % 100 == 0:
|
||||
elapsed = time.time() - start_time
|
||||
rate = files_count / elapsed if elapsed > 0 else 0
|
||||
display_path = str(file_path)
|
||||
if len(display_path) > 60:
|
||||
display_path = '...' + display_path[-57:]
|
||||
print(f'\rIndexing: {files_count:,} files | {self.format_size(total_size)} | {rate:.0f} files/s | {display_path}', end='', flush=True)
|
||||
if files_count % 1000 == 0:
|
||||
conn.commit()
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logger.warning(f'\nSkipping {file_path}: {e}')
|
||||
continue
|
||||
conn.commit()
|
||||
print()
|
||||
logger.info(f'Completed indexing {disk_name}: {files_count} files, {self.format_size(total_size)}')
|
||||
finally:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
def calculate_disk_usage(self) -> Dict[str, Dict]:
|
||||
conn = self.get_connection()
|
||||
cursor = conn.cursor()
|
||||
try:
|
||||
cursor.execute('\n SELECT disk_label, SUM(size) as total_size, COUNT(*) as file_count\n FROM files\n GROUP BY disk_label\n ')
|
||||
usage = {}
|
||||
for row in cursor.fetchall():
|
||||
disk = row[0]
|
||||
size = int(row[1] or 0)
|
||||
count = int(row[2])
|
||||
usage[disk] = {'size': size, 'count': count, 'formatted_size': self.format_size(size)}
|
||||
return usage
|
||||
finally:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
def plan_migration(self, target_disk: str, destination_disks: List[str]) -> Dict:
|
||||
logger.info(f'Planning migration to free up {target_disk}')
|
||||
usage = self.calculate_disk_usage()
|
||||
if target_disk not in usage:
|
||||
logger.error(f'Target disk {target_disk} not found in index!')
|
||||
return {}
|
||||
conn = self.get_connection()
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('SELECT path, size, modified_time FROM files WHERE disk_label = %s ORDER BY size DESC', (target_disk,))
|
||||
files_to_move = cursor.fetchall()
|
||||
cursor.close()
|
||||
conn.close()
|
||||
target_disk_usage = usage[target_disk]['size']
|
||||
logger.info(f'Need to move {len(files_to_move)} files, {self.format_size(target_disk_usage)}')
|
||||
dest_availability = []
|
||||
for disk in destination_disks:
|
||||
# Destination capacity is not tracked yet, so every destination is treated as unbounded for planning
|
||||
available = float('inf')
|
||||
dest_availability.append({'disk': disk, 'available': available, 'planned_usage': 0})
|
||||
plan = {'target_disk': target_disk, 'total_size': target_disk_usage, 'file_count': len(files_to_move), 'operations': [], 'destination_disks': destination_disks}
|
||||
conn = self.get_connection()
|
||||
cursor = conn.cursor()
|
||||
try:
|
||||
for file_info in files_to_move:
|
||||
rel_path, size, mtime = file_info
|
||||
dest_disk = destination_disks[len(plan['operations']) % len(destination_disks)]
|
||||
op = {'source_disk': target_disk, 'source_path': rel_path, 'dest_disk': dest_disk, 'target_path': rel_path, 'size': int(size)}
|
||||
plan['operations'].append(op)
|
||||
cursor.execute('INSERT INTO operations (source_path, target_path, operation_type, status) VALUES (%s, %s, %s, %s)', (f'{target_disk}:{rel_path}', f'{dest_disk}:{rel_path}', 'move', 'pending'))
|
||||
conn.commit()
|
||||
finally:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
plan_file = f"migration_plan_{target_disk}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
|
||||
with open(plan_file, 'w') as f:
|
||||
json.dump(plan, f, indent=2)
|
||||
logger.info(f"Plan created with {len(plan['operations'])} operations")
|
||||
logger.info(f'Plan saved to {plan_file}')
|
||||
return plan
|
||||
|
||||
def verify_operation(self, source: Path, dest: Path) -> bool:
|
||||
if not dest.exists():
|
||||
return False
|
||||
try:
|
||||
source_stat = source.stat()
|
||||
dest_stat = dest.stat()
|
||||
if source_stat.st_size != dest_stat.st_size:
|
||||
return False
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f'Verification error: {e}')
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def file_checksum(path: Path) -> str:
|
||||
hash_md5 = hashlib.md5()
|
||||
with open(path, 'rb') as f:
|
||||
for chunk in iter(lambda: f.read(4096), b''):
|
||||
hash_md5.update(chunk)
|
||||
return hash_md5.hexdigest()
|
||||
|
||||
def execute_migration(self, plan_file: str, dry_run: bool=True):
|
||||
logger.info(f"{('DRY RUN' if dry_run else 'EXECUTING')} migration from {plan_file}")
|
||||
with open(plan_file, 'r') as f:
|
||||
plan = json.load(f)
|
||||
operations = plan['operations']
|
||||
logger.info(f'Processing {len(operations)} operations...')
|
||||
success_count = 0
|
||||
error_count = 0
|
||||
start_time = time.time()
|
||||
conn = self.get_connection()
|
||||
cursor = conn.cursor()
|
||||
try:
|
||||
for i, op in enumerate(operations, 1):
|
||||
source_disk = op['source_disk']
|
||||
source_path = op['source_path']
|
||||
dest_disk = op['dest_disk']
|
||||
target_path = op['target_path']
|
||||
source_full = Path(source_disk) / source_path
|
||||
dest_full = Path(dest_disk) / target_path
|
||||
elapsed = time.time() - start_time
|
||||
rate = i / elapsed if elapsed > 0 else 0
|
||||
eta = (len(operations) - i) / rate if rate > 0 else 0
|
||||
display_path = str(source_path)
|
||||
if len(display_path) > 50:
|
||||
display_path = '...' + display_path[-47:]
|
||||
print(f'\r[{i}/{len(operations)}] {success_count} OK, {error_count} ERR | {rate:.1f} files/s | ETA: {int(eta)}s | {display_path}', end='', flush=True)
|
||||
if dry_run:
|
||||
if source_full.exists():
|
||||
success_count += 1
|
||||
else:
|
||||
logger.warning(f'\n Source does not exist: {source_full}')
|
||||
error_count += 1
|
||||
continue
|
||||
try:
|
||||
dest_full.parent.mkdir(parents=True, exist_ok=True)
|
||||
if source_full.exists():
|
||||
shutil.copy2(source_full, dest_full)
|
||||
if self.verify_operation(source_full, dest_full):
|
||||
cursor.execute("UPDATE files SET disk_label = %s, status = 'moved' WHERE path = %s AND disk_label = %s", (dest_disk, source_path, source_disk))
|
||||
cursor.execute('UPDATE operations SET executed = 1, executed_at = CURRENT_TIMESTAMP WHERE source_path = %s', (f'{source_disk}:{source_path}',))
|
||||
success_count += 1
|
||||
else:
|
||||
raise Exception('Verification failed')
|
||||
else:
|
||||
logger.warning(f'\n Source missing: {source_full}')
|
||||
error_count += 1
|
||||
except Exception as e:
|
||||
logger.error(f'\n Error processing {source_path}: {e}')
|
||||
cursor.execute('UPDATE operations SET error = %s WHERE source_path = %s', (str(e), f'{source_disk}:{source_path}'))
|
||||
error_count += 1
|
||||
if i % 10 == 0:
|
||||
conn.commit()
|
||||
conn.commit()
|
||||
print()
|
||||
finally:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
logger.info(f'Migration complete: {success_count} success, {error_count} errors')
|
||||
if not dry_run and error_count == 0:
|
||||
logger.info(f"✓ Disk {plan['target_disk']} is ready for Linux installation!")
|
||||
logger.info(f" Remember to safely delete original files from {plan['target_disk']}")
|
||||
|
||||
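    # --- Usage sketch (illustrative only): how execute_migration is typically driven.
    # 'migration_plan.json' is a placeholder for whatever plan file the planning step wrote.
    #
    #   tool = DiskReorganizer()
    #   tool.execute_migration('migration_plan.json', dry_run=True)   # only checks that every source exists
    #   tool.execute_migration('migration_plan.json', dry_run=False)  # copies, verifies sizes, updates the DB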
    def run_deduplication(self, disk: Optional[str]=None, use_chunks: bool=True):
        # NOTE: use_chunks is reserved for chunk-level deduplication; this pass computes whole-file checksums only
        logger.info(f"Starting deduplication{(' for disk ' + disk if disk else '')}")
|
||||
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
|
||||
conn = self.get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
def hash_file_local(file_path: Path) -> str:
|
||||
hasher = hashlib.sha256()
|
||||
with open(file_path, 'rb') as f:
|
||||
while (chunk := f.read(65536)):
|
||||
hasher.update(chunk)
|
||||
return hasher.hexdigest()
|
||||
try:
|
||||
if disk:
|
||||
cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s AND checksum IS NULL ORDER BY size DESC', (disk,))
|
||||
else:
|
||||
cursor.execute('SELECT path, size, disk_label FROM files WHERE checksum IS NULL ORDER BY size DESC')
|
||||
files_to_process = cursor.fetchall()
|
||||
total = len(files_to_process)
|
||||
logger.info(f'Found {total} files to hash')
|
||||
processed = 0
|
||||
skipped = 0
|
||||
start_time = time.time()
|
||||
batch = []
|
||||
print(f'Phase 1: Computing checksums...')
|
||||
for idx, (path_str, size, disk_label) in enumerate(files_to_process, 1):
|
||||
try:
|
||||
mount_point = disk_mount_map.get(disk_label, disk_label)
|
||||
full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)
|
||||
if not full_path.exists():
|
||||
skipped += 1
|
||||
if idx % 100 == 0:
|
||||
elapsed = time.time() - start_time
|
||||
rate = (processed + skipped) / elapsed if elapsed > 0 else 0
|
||||
remaining = (total - idx) / rate if rate > 0 else 0
|
||||
pct = 100 * idx / total
|
||||
print(f'\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining / 60)}m{int(remaining % 60):02d}s | Skip: {skipped:,}', end='', flush=True)
|
||||
continue
|
||||
checksum = hash_file_local(full_path)
|
||||
batch.append((checksum, path_str))
|
||||
processed += 1
|
||||
if len(batch) >= 1000:
|
||||
try:
|
||||
cursor.executemany('UPDATE files SET checksum = %s WHERE path = %s', batch)
|
||||
conn.commit()
|
||||
batch.clear()
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
batch.clear()
|
||||
print(f'\nBatch update failed: {e}')
|
||||
if idx % 100 == 0:
|
||||
elapsed = time.time() - start_time
|
||||
rate = (processed + skipped) / elapsed if elapsed > 0 else 0
|
||||
remaining = (total - idx) / rate if rate > 0 else 0
|
||||
pct = 100 * idx / total
|
||||
print(f'\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining / 60)}m{int(remaining % 60):02d}s | Skip: {skipped:,}', end='', flush=True)
|
||||
except Exception as e:
|
||||
skipped += 1
|
||||
if idx <= 5:
|
||||
print(f'\nDebug: {full_path} - {e}')
|
||||
if batch:
|
||||
try:
|
||||
cursor.executemany('UPDATE files SET checksum = %s WHERE path = %s', batch)
|
||||
conn.commit()
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
print(f'\nFinal batch failed: {e}')
|
||||
print()
|
||||
elapsed = time.time() - start_time
|
||||
logger.info(f'Phase 1 done: {processed:,} files in {int(elapsed / 60)}m{int(elapsed % 60):02d}s ({skipped:,} skipped)')
|
||||
print('Phase 2: Finding duplicates...')
|
||||
cursor.execute('\n UPDATE files f1 SET duplicate_of = (\n SELECT MIN(path) FROM files f2\n WHERE f2.checksum = f1.checksum AND f2.path < f1.path\n )\n WHERE checksum IS NOT NULL\n ')
|
||||
conn.commit()
|
||||
cursor.execute('SELECT COUNT(*) FROM files WHERE duplicate_of IS NOT NULL')
|
||||
dup_count = cursor.fetchone()[0]
|
||||
logger.info(f'Phase 2 done: Found {dup_count:,} duplicates')
|
||||
finally:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
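    # --- Sketch: reviewing what run_deduplication produced. A follow-up query (assumed to run on the
    # same connection) that lists the largest duplicate groups found in Phase 2:
    #
    #   cursor.execute(
    #       "SELECT checksum, COUNT(*) AS copies, SUM(size) AS bytes "
    #       "FROM files WHERE duplicate_of IS NOT NULL "
    #       "GROUP BY checksum ORDER BY bytes DESC LIMIT 20"
    #   )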
def plan_merge(self, sources: List[str], target: str, output_file: str, filter_system: bool=False, network_target: str=None):
|
||||
logger.info(f"Planning merge: {', '.join(sources)} → {target or network_target}")
|
||||
if filter_system:
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
from filters import GitignoreFilter
|
||||
file_filter = GitignoreFilter()
|
||||
logger.info('System/build file filtering enabled')
|
||||
conn = self.get_connection()
|
||||
cursor = conn.cursor()
|
||||
try:
|
||||
placeholders = ','.join(['%s'] * len(sources))
|
||||
cursor.execute(f'\n SELECT path, size, checksum, disk_label, duplicate_of\n FROM files\n WHERE disk_label IN ({placeholders})\n ORDER BY size DESC\n ', tuple(sources))
|
||||
files = cursor.fetchall()
|
||||
total_files = len(files)
|
||||
total_size = sum((int(f[1]) for f in files))
|
||||
unique_files = {}
|
||||
duplicate_count = 0
|
||||
duplicate_size = 0
|
||||
filtered_count = 0
|
||||
filtered_size = 0
|
||||
for path, size, checksum, disk_label, duplicate_of in files:
|
||||
if filter_system and file_filter.should_exclude(path):
|
||||
filtered_count += 1
|
||||
filtered_size += int(size)
|
||||
continue
|
||||
if checksum and checksum in unique_files:
|
||||
duplicate_count += 1
|
||||
duplicate_size += int(size)
|
||||
elif checksum:
|
||||
unique_files[checksum] = (path, int(size), disk_label)
|
||||
unique_count = len(unique_files)
|
||||
unique_size = sum((f[1] for f in unique_files.values()))
|
||||
plan = {'sources': sources, 'target': target or network_target, 'network': network_target is not None, 'total_files': total_files, 'total_size': total_size, 'unique_files': unique_count, 'unique_size': unique_size, 'duplicate_files': duplicate_count, 'duplicate_size': duplicate_size, 'filtered_files': filtered_count if filter_system else 0, 'filtered_size': filtered_size if filter_system else 0, 'space_saved': duplicate_size + (filtered_size if filter_system else 0), 'operations': []}
|
||||
for checksum, (path, size, disk_label) in unique_files.items():
|
||||
plan['operations'].append({'source_disk': disk_label, 'source_path': path, 'target_disk': target or network_target, 'target_path': path, 'size': size, 'checksum': checksum})
|
||||
with open(output_file, 'w') as f:
|
||||
json.dump(plan, f, indent=2)
|
||||
logger.info(f'Merge plan saved to {output_file}')
|
||||
print(f'\n=== MERGE PLAN SUMMARY ===')
|
||||
print(f"Sources: {', '.join(sources)}")
|
||||
print(f'Target: {target or network_target}')
|
||||
print(f'Total files: {total_files:,} ({self.format_size(total_size)})')
|
||||
if filter_system:
|
||||
print(f'Filtered (system/build): {filtered_count:,} ({self.format_size(filtered_size)})')
|
||||
print(f'Unique files: {unique_count:,} ({self.format_size(unique_size)})')
|
||||
print(f'Duplicates: {duplicate_count:,} ({self.format_size(duplicate_size)})')
|
||||
print(f"Total space saved: {self.format_size(plan['space_saved'])}")
|
||||
print(f'Space needed on target: {self.format_size(unique_size)}')
|
||||
finally:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
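    # --- Sketch: building a merge plan for two source disks. The disk labels match disk_mount_map;
    # the output file name is arbitrary, and the plan can be inspected with `report --preview-merge`.
    #
    #   tool.plan_merge(sources=['SMT', 'DISK1'], target='LLM',
    #                   output_file='merge_plan.json', filter_system=True)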
    def generate_report(self, format='text', show_duplicates=False, preview_merge=None):
        # NOTE: only the text report is implemented; 'format' is accepted for CLI compatibility
        conn = self.get_connection()
|
||||
cursor = conn.cursor()
|
||||
try:
|
||||
if preview_merge:
|
||||
with open(preview_merge, 'r') as f:
|
||||
plan = json.load(f)
|
||||
print('\n=== MERGE PLAN PREVIEW ===')
|
||||
print(f"Sources: {', '.join(plan['sources'])}")
|
||||
print(f"Target: {plan['target']}")
|
||||
print(f"Total files: {plan['total_files']:,} ({self.format_size(plan['total_size'])})")
|
||||
print(f"Unique files: {plan['unique_files']:,} ({self.format_size(plan['unique_size'])})")
|
||||
print(f"Duplicates: {plan['duplicate_files']:,} ({self.format_size(plan['duplicate_size'])})")
|
||||
print(f"Space saved: {self.format_size(plan['space_saved'])}")
|
||||
print(f"Space needed on target: {self.format_size(plan['unique_size'])}")
|
||||
return
|
||||
cursor.execute('\n SELECT status, COUNT(*), SUM(size) FROM files GROUP BY status\n ')
|
||||
print('\n=== FILE MIGRATION REPORT ===')
|
||||
for row in cursor.fetchall():
|
||||
status, count, size = row
|
||||
print(f'{status:15}: {count:6} files, {self.format_size(int(size or 0))}')
|
||||
cursor.execute('\n SELECT disk_label, COUNT(*), SUM(size) FROM files GROUP BY disk_label\n ')
|
||||
print('\n=== DISK USAGE ===')
|
||||
for row in cursor.fetchall():
|
||||
disk, count, size = row
|
||||
print(f'{disk:20}: {count:6} files, {self.format_size(int(size or 0))}')
|
||||
cursor.execute('\n SELECT COUNT(*), SUM(size) FROM files WHERE checksum IS NOT NULL\n ')
|
||||
hashed_count, hashed_size = cursor.fetchone()
|
||||
cursor.execute('\n SELECT COUNT(*), SUM(size) FROM files WHERE duplicate_of IS NOT NULL\n ')
|
||||
dup_count, dup_size = cursor.fetchone()
|
||||
print('\n=== DEDUPLICATION STATS ===')
|
||||
print(f'Files with checksums: {hashed_count or 0:6}')
|
||||
print(f'Duplicate files: {dup_count or 0:6} ({self.format_size(int(dup_size or 0))})')
|
||||
if show_duplicates and dup_count:
|
||||
print('\n=== DUPLICATE FILES ===')
|
||||
cursor.execute('\n SELECT path, size, duplicate_of FROM files\n WHERE duplicate_of IS NOT NULL\n ORDER BY size DESC\n LIMIT 20\n ')
|
||||
for path, size, dup_of in cursor.fetchall():
|
||||
print(f' {path} ({self.format_size(int(size))}) → {dup_of}')
|
||||
cursor.execute('\n SELECT operation_type, executed, verified, COUNT(*) FROM operations GROUP BY operation_type, executed, verified\n ')
|
||||
print('\n=== OPERATIONS REPORT ===')
|
||||
for row in cursor.fetchall():
|
||||
op_type, executed, verified, count = row
|
||||
status = 'EXECUTED' if executed else 'PENDING'
|
||||
if verified:
|
||||
status += '+VERIFIED'
|
||||
print(f'{op_type:10} {status:15}: {count} operations')
|
||||
finally:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
def profile_content(self, disk: Optional[str]=None, update_db: bool=False, limit: Optional[int]=None):
|
||||
from content.profiler import ContentProfiler
|
||||
profiler = ContentProfiler()
|
||||
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
|
||||
conn = self.get_connection()
|
||||
cursor = conn.cursor()
|
||||
try:
|
||||
query = 'SELECT path, size, disk_label FROM files WHERE 1=1'
|
||||
params = []
|
||||
if disk:
|
||||
query += ' AND disk_label = %s'
|
||||
params.append(disk)
|
||||
if limit:
|
||||
query += f' LIMIT {limit}'
|
||||
cursor.execute(query, params)
|
||||
files = cursor.fetchall()
|
||||
total = len(files)
|
||||
logger.info(f'Profiling {total:,} files...')
|
||||
kind_stats = {}
|
||||
processable = 0
|
||||
batch = []
|
||||
for idx, (path, size, disk_label) in enumerate(files, 1):
|
||||
mount_point = disk_mount_map.get(disk_label, disk_label)
|
||||
full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
|
||||
if not full_path.exists():
|
||||
continue
|
||||
profile = profiler.profile_file(full_path)
|
||||
if 'error' not in profile:
|
||||
kind = profile['kind']
|
||||
if kind not in kind_stats:
|
||||
kind_stats[kind] = {'count': 0, 'processable': 0}
|
||||
kind_stats[kind]['count'] += 1
|
||||
if profile['processable']:
|
||||
kind_stats[kind]['processable'] += 1
|
||||
processable += 1
|
||||
if update_db:
|
||||
profile_json = json.dumps(profile)
|
||||
batch.append((kind, profile_json, path))
|
||||
if len(batch) >= 500:
|
||||
cursor.executemany("UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s", [(pj, p) for k, pj, p in batch])
|
||||
conn.commit()
|
||||
batch.clear()
|
||||
if idx % 100 == 0:
|
||||
print(f'\rProfiled: {idx:,}/{total:,}', end='', flush=True)
|
||||
if update_db and batch:
|
||||
cursor.executemany("UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s", [(pj, p) for k, pj, p in batch])
|
||||
conn.commit()
|
||||
print()
|
||||
print(f'\n=== CONTENT PROFILE SUMMARY ===')
|
||||
print(f'Total files: {total:,}')
|
||||
print(f'Processable: {processable:,}\n')
|
||||
print(f"{'Kind':<15} {'Total':<10} {'Processable':<12} {'Extractor'}")
|
||||
print('-' * 60)
|
||||
for kind in sorted(kind_stats.keys()):
|
||||
stats = kind_stats[kind]
|
||||
extractor = profiler._suggest_extractor(kind, '')
|
||||
print(f"{kind:<15} {stats['count']:<10,} {stats['processable']:<12,} {extractor or 'none'}")
|
||||
finally:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
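    # --- Sketch: querying the profiles written by profile_content(update_db=True). Only the keys used
    # above ('kind', 'processable', 'extractor') are assumed to exist in metadata->'profile'.
    #
    #   cursor.execute(
    #       "SELECT metadata->'profile'->>'kind' AS kind, COUNT(*) "
    #       "FROM files WHERE metadata->'profile' IS NOT NULL GROUP BY 1 ORDER BY 2 DESC"
    #   )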
def extract_content(self, kind: Optional[str]=None, limit: int=10):
|
||||
from content.profiler import ContentProfiler
|
||||
from content.extractors import ContentExtractor
|
||||
profiler = ContentProfiler()
|
||||
extractor = ContentExtractor()
|
||||
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
|
||||
conn = self.get_connection()
|
||||
cursor = conn.cursor()
|
||||
try:
|
||||
query = "SELECT path, size, disk_label, metadata FROM files WHERE metadata->'profile'->>'processable' = 'true'"
|
||||
params = []
|
||||
if kind:
|
||||
query += " AND metadata->'profile'->>'kind' = %s"
|
||||
params.append(kind)
|
||||
query += f' LIMIT {limit}'
|
||||
cursor.execute(query, params)
|
||||
files = cursor.fetchall()
|
||||
print(f'\n=== EXTRACTING CONTENT ===')
|
||||
print(f'Processing {len(files)} files\n')
|
||||
for path, size, disk_label, metadata in files:
|
||||
mount_point = disk_mount_map.get(disk_label, disk_label)
|
||||
full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
|
||||
if not full_path.exists():
|
||||
continue
|
||||
profile = metadata.get('profile', {}) if metadata else {}
|
||||
extractor_type = profile.get('extractor')
|
||||
if not extractor_type:
|
||||
continue
|
||||
print(f'Extracting: {path}')
|
||||
print(f" Type: {profile.get('kind')} | Extractor: {extractor_type}")
|
||||
result = extractor.extract(full_path, extractor_type)
|
||||
if 'text' in result:
|
||||
preview = result['text'][:200]
|
||||
print(f' Preview: {preview}...')
|
||||
elif 'pipeline' in result:
|
||||
print(f" Pipeline: {' → '.join(result['pipeline'])}")
|
||||
print(f" Status: {result.get('status', 'pending')}")
|
||||
print()
|
||||
finally:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
def parse_files(self, kind: Optional[str] = None, limit: int = 100, update_db: bool = False):
|
||||
from parsers.text_parser import TextParser
|
||||
from parsers.code_parser import CodeParser
|
||||
from parsers.pdf_parser import PDFParser
|
||||
|
||||
parsers = {'text': TextParser(), 'code': CodeParser(), 'pdf': PDFParser()}
|
||||
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
|
||||
|
||||
conn = self.get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
try:
|
||||
query = "SELECT path, size, disk_label FROM files WHERE 1=1"
|
||||
params = []
|
||||
            if kind:
                suffix_map = {'text': ('.txt', '.md', '.log', '.json'), 'code': ('.py', '.js', '.java', '.go'), 'pdf': ('.pdf',)}
                if kind in suffix_map:
                    # Parameterised, parenthesised suffix filter: the previous string-built
                    # RIGHT(...) IN ... OR ... clause mixed suffix lengths and let the OR escape the surrounding AND
                    query += ' AND (' + ' OR '.join(['path LIKE %s'] * len(suffix_map[kind])) + ')'
                    params.extend(f'%{ext}' for ext in suffix_map[kind])
query += f" LIMIT {limit}"
|
||||
|
||||
cursor.execute(query, params)
|
||||
files = cursor.fetchall()
|
||||
|
||||
print(f"\n=== PARSING FILES ===\nProcessing {len(files)} files\n")
|
||||
|
||||
parsed_count = 0
|
||||
for path, size, disk_label in files:
|
||||
mount_point = disk_mount_map.get(disk_label, disk_label)
|
||||
full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
|
||||
|
||||
if not full_path.exists() or int(size) > 10 * 1024 * 1024:
|
||||
continue
|
||||
|
||||
file_kind = 'pdf' if path.endswith('.pdf') else 'code' if any(path.endswith(e) for e in ['.py', '.js', '.java']) else 'text'
|
||||
parser = parsers.get(file_kind)
|
||||
if not parser:
|
||||
continue
|
||||
|
||||
result = parser.parse(full_path)
|
||||
if 'error' not in result:
|
||||
text = result.get('text', '')
|
||||
quality = result.get('quality', 'unknown')
|
||||
print(f"{path[:60]} | {file_kind} | {len(text):,} chars")
|
||||
|
||||
if update_db and text:
|
||||
cursor.execute("UPDATE files SET extracted_text = %s, text_quality = %s WHERE path = %s", (text[:50000], quality, path))
|
||||
parsed_count += 1
|
||||
if parsed_count % 10 == 0:
|
||||
conn.commit()
|
||||
|
||||
if update_db:
|
||||
conn.commit()
|
||||
print(f"\nParsed {parsed_count} files")
|
||||
|
||||
finally:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
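    # --- Sketch: sampling the text captured by parse_files(update_db=True); extracted_text is
    # truncated to 50k characters above, so LEFT() here is only for display.
    #
    #   cursor.execute(
    #       "SELECT path, text_quality, LEFT(extracted_text, 120) "
    #       "FROM files WHERE extracted_text IS NOT NULL LIMIT 5"
    #   )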
def enrich_files(self, limit: int = 10, llm_endpoint: str = None, use_local: bool = False):
|
||||
from enrichment.enricher import ContentEnricher
|
||||
|
||||
enricher = ContentEnricher()
|
||||
conn = self.get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
try:
|
||||
cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL LIMIT {limit}")
|
||||
files = cursor.fetchall()
|
||||
|
||||
print(f"\n=== ENRICHING CONTENT ===\nProcessing {len(files)} files\n")
|
||||
|
||||
            for path, text in files:
                # NOTE: llm_endpoint/use_local are accepted but not wired up yet; enrichment runs without the LLM
                enrichment = enricher.enrich(text[:5000], use_llm=False)
|
||||
print(f"{path[:60]}")
|
||||
print(f" Quality: {enrichment.get('quality')} | Words: {enrichment.get('word_count'):,}")
|
||||
print(f" PII: {list(enrichment.get('has_pii', {}).keys())}")
|
||||
print(f" Topics: {', '.join(enrichment.get('topics', [])[:5])}\n")
|
||||
|
||||
cursor.execute("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", (json.dumps(enrichment), path))
|
||||
|
||||
conn.commit()
|
||||
print(f"Enriched {len(files)} files")
|
||||
|
||||
finally:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
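    # --- Sketch: surfacing files whose enrichment flagged possible PII. The 'has_pii' key is the one
    # printed above; treating it as a (possibly empty) JSON object is an assumption about ContentEnricher.
    #
    #   cursor.execute(
    #       "SELECT path FROM files "
    #       "WHERE enrichment->'has_pii' IS NOT NULL AND enrichment->'has_pii' <> '{}'::jsonb LIMIT 20"
    #   )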
def classify_files(self, disk: Optional[str]=None, update_db: bool=False, resume: bool=True):
|
||||
from classification.classifier import FileClassifier
|
||||
classifier = FileClassifier()
|
||||
conn = self.get_connection()
|
||||
cursor = conn.cursor()
|
||||
try:
|
||||
task_name = f"classify_{disk or 'all'}"
|
||||
skip_count = 0
|
||||
|
||||
if resume and update_db:
|
||||
cursor.execute('SELECT last_processed_path, processed_count FROM processing_checkpoints WHERE task_name = %s', (task_name,))
|
||||
checkpoint = cursor.fetchone()
|
||||
if checkpoint:
|
||||
last_path, skip_count = checkpoint
|
||||
logger.info(f'Resuming from checkpoint: {skip_count:,} files already processed')
|
||||
|
||||
if disk:
|
||||
cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s ORDER BY path', (disk,))
|
||||
else:
|
||||
cursor.execute('SELECT path, size, disk_label FROM files ORDER BY path')
|
||||
files = cursor.fetchall()
|
||||
total = len(files)
|
||||
logger.info(f'Classifying {total:,} files...')
|
||||
|
||||
categories = {}
|
||||
build_artifacts = 0
|
||||
batch = []
|
||||
processed = 0
|
||||
|
||||
for idx, (path, size, disk_label) in enumerate(files, 1):
|
||||
if idx <= skip_count:
|
||||
continue
|
||||
|
||||
labels, category, is_build = classifier.classify_path(path, int(size))
|
||||
if is_build:
|
||||
build_artifacts += 1
|
||||
if category not in categories:
|
||||
categories[category] = {'count': 0, 'size': 0}
|
||||
categories[category]['count'] += 1
|
||||
categories[category]['size'] += int(size)
|
||||
|
||||
if update_db:
|
||||
                    labels_str = ','.join(labels)
                    # labels ride along in the batch, but only 'category' is persisted by the UPDATE below
                    batch.append((category, labels_str, path))
|
||||
|
||||
if len(batch) >= 1000:
|
||||
cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch])
|
||||
cursor.execute('''
|
||||
INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at)
|
||||
VALUES (%s, %s, %s, CURRENT_TIMESTAMP)
|
||||
ON CONFLICT (task_name) DO UPDATE SET
|
||||
last_processed_path = EXCLUDED.last_processed_path,
|
||||
processed_count = EXCLUDED.processed_count,
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
''', (task_name, path, idx))
|
||||
conn.commit()
|
||||
batch.clear()
|
||||
|
||||
processed += 1
|
||||
if idx % 1000 == 0:
|
||||
print(f'\rClassified: {idx:,}/{total:,} ({100*idx/total:.1f}%)', end='', flush=True)
|
||||
|
||||
if update_db and batch:
|
||||
cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch])
|
||||
cursor.execute('''
|
||||
INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at)
|
||||
VALUES (%s, %s, %s, CURRENT_TIMESTAMP)
|
||||
ON CONFLICT (task_name) DO UPDATE SET
|
||||
last_processed_path = EXCLUDED.last_processed_path,
|
||||
processed_count = EXCLUDED.processed_count,
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
''', (task_name, files[-1][0] if files else '', total))
|
||||
conn.commit()
|
||||
|
||||
print()
|
||||
print(f'\n=== CLASSIFICATION SUMMARY ===')
|
||||
print(f'Total files: {total:,}')
|
||||
print(f'Build artifacts: {build_artifacts:,}')
|
||||
print(f'\nCategories:')
|
||||
for category in sorted(categories.keys()):
|
||||
info = categories[category]
|
||||
print(f" {category:30}: {info['count']:8,} files, {self.format_size(info['size'])}")
|
||||
finally:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
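    # --- Sketch: forcing a full re-classification (the programmatic equivalent of --no-resume) by
    # clearing the checkpoint row written above; 'classify_all' follows the task_name format used here.
    #
    #   cursor.execute("DELETE FROM processing_checkpoints WHERE task_name = %s", ('classify_all',))
    #   conn.commit()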
def analyze_folders(self, disk: Optional[str]=None, min_files: int=3):
|
||||
from analysis.folder_analyzer import FolderAnalyzer
|
||||
analyzer = FolderAnalyzer()
|
||||
conn = self.get_connection()
|
||||
cursor = conn.cursor()
|
||||
try:
|
||||
query = '''
|
||||
SELECT DISTINCT SUBSTRING(path FROM 1 FOR POSITION('/' IN path || '/') - 1) as folder, disk_label
|
||||
FROM files
|
||||
WHERE 1=1
|
||||
'''
|
||||
params = []
|
||||
if disk:
|
||||
query += ' AND disk_label = %s'
|
||||
params.append(disk)
|
||||
|
||||
cursor.execute(query, params)
|
||||
potential_folders = cursor.fetchall()
|
||||
|
||||
logger.info(f'Found {len(potential_folders)} potential folders to analyze')
|
||||
|
||||
processed = 0
|
||||
for folder_name, disk_label in potential_folders:
|
||||
cursor.execute('''
|
||||
SELECT path, size FROM files
|
||||
WHERE disk_label = %s AND path LIKE %s
|
||||
''', (disk_label, f'{folder_name}%'))
|
||||
|
||||
files = cursor.fetchall()
|
||||
if len(files) < min_files:
|
||||
continue
|
||||
|
||||
files_list = [{'path': f[0], 'size': int(f[1])} for f in files]
|
||||
folder_path = Path(folder_name)
|
||||
|
||||
analysis = analyzer.analyze_folder(folder_path, files_list)
|
||||
|
||||
readme_text = None
|
||||
for file_dict in files_list:
|
||||
if 'readme' in file_dict['path'].lower():
|
||||
readme_text = f"Found README at {file_dict['path']}"
|
||||
break
|
||||
|
||||
summary = analyzer.generate_summary(analysis, readme_text)
|
||||
|
||||
cursor.execute('''
|
||||
INSERT INTO folders (path, disk_label, file_count, total_size, project_type, intent, summary,
|
||||
has_readme, has_git, has_manifest, manifest_types, dominant_file_types, structure)
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
|
||||
ON CONFLICT (path) DO UPDATE SET
|
||||
file_count = EXCLUDED.file_count,
|
||||
total_size = EXCLUDED.total_size,
|
||||
project_type = EXCLUDED.project_type,
|
||||
intent = EXCLUDED.intent,
|
||||
summary = EXCLUDED.summary,
|
||||
has_readme = EXCLUDED.has_readme,
|
||||
has_git = EXCLUDED.has_git,
|
||||
has_manifest = EXCLUDED.has_manifest,
|
||||
manifest_types = EXCLUDED.manifest_types,
|
||||
dominant_file_types = EXCLUDED.dominant_file_types,
|
||||
structure = EXCLUDED.structure,
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
''', (
|
||||
str(folder_path), disk_label, len(files_list), sum(f['size'] for f in files_list),
|
||||
analysis.get('project_type'), analysis.get('intent'), summary,
|
||||
analysis.get('has_readme'), analysis.get('has_git'), analysis.get('has_manifest'),
|
||||
analysis.get('manifest_types'), json.dumps(analysis.get('dominant_file_types', {})),
|
||||
json.dumps(analysis.get('structure', {}))
|
||||
))
|
||||
|
||||
processed += 1
|
||||
if processed % 100 == 0:
|
||||
conn.commit()
|
||||
print(f'\rAnalyzed: {processed} folders', end='', flush=True)
|
||||
|
||||
conn.commit()
|
||||
print()
|
||||
logger.info(f'Completed folder analysis: {processed} folders')
|
||||
|
||||
cursor.execute('''
|
||||
SELECT project_type, COUNT(*), SUM(file_count), SUM(total_size)
|
||||
FROM folders
|
||||
GROUP BY project_type
|
||||
''')
|
||||
print(f'\n=== FOLDER ANALYSIS SUMMARY ===')
|
||||
for row in cursor.fetchall():
|
||||
proj_type, count, files, size = row
|
||||
print(f'{proj_type:20}: {count:6,} folders, {files:8,} files, {self.format_size(int(size or 0))}')
|
||||
|
||||
finally:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
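    # --- Sketch: listing the largest analyzed project folders from the 'folders' table populated above.
    #
    #   cursor.execute(
    #       "SELECT path, project_type, file_count, total_size "
    #       "FROM folders ORDER BY total_size DESC LIMIT 10"
    #   )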
def review_migration(self, category: Optional[str]=None, show_build: bool=False):
|
||||
from classification.classifier import FileClassifier
|
||||
classifier = FileClassifier()
|
||||
conn = self.get_connection()
|
||||
cursor = conn.cursor()
|
||||
try:
|
||||
query = 'SELECT path, size, category FROM files WHERE 1=1'
|
||||
params = []
|
||||
if category:
|
||||
query += ' AND category = %s'
|
||||
params.append(category)
|
||||
            if not show_build:
                # Double the literal % so psycopg2 does not treat it as a placeholder when params are supplied
                query += " AND (metadata->>'labels' IS NULL OR metadata->>'labels' NOT LIKE '%%build-artifact%%')"
query += ' ORDER BY category, size DESC LIMIT 100'
|
||||
cursor.execute(query, params)
|
||||
files = cursor.fetchall()
|
||||
if not files:
|
||||
print('No files found matching criteria')
|
||||
return
|
||||
print(f'\n=== MIGRATION PREVIEW ===')
|
||||
print(f'Showing {len(files)} files\n')
|
||||
current_category = None
|
||||
for path, size, cat in files:
|
||||
if cat != current_category:
|
||||
current_category = cat
|
||||
print(f'\n{cat}:')
|
||||
labels, suggested_cat, is_build = classifier.classify_path(path, int(size))
|
||||
target = classifier.suggest_target_path(path, suggested_cat, labels)
|
||||
print(f' {path}')
|
||||
print(f' → {target} ({self.format_size(int(size))})')
|
||||
finally:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
    @staticmethod
    def format_size(size: int) -> str:
        for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
            if size < 1024:
                return f'{size:.1f}{unit}'
            size /= 1024
        return f'{size:.1f}PB'
|
||||
|
||||
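    # --- format_size reference values (computed from the loop above): ---
    #   DiskReorganizer.format_size(1536)          -> '1.5KB'
    #   DiskReorganizer.format_size(20 * 1024**4)  -> '20.0TB'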
def main():
|
||||
parser = argparse.ArgumentParser(description='Disk Reorganizer - Free up a disk for Linux dual-boot')
|
||||
subparsers = parser.add_subparsers(dest='command', required=True)
|
||||
index_parser = subparsers.add_parser('index', help='Index files on a disk')
|
||||
index_parser.add_argument('disk_root', help='Root path of disk (e.g., D:\\\\)')
|
||||
index_parser.add_argument('disk_name', help='Logical name for the disk')
|
||||
plan_parser = subparsers.add_parser('plan', help='Create migration plan')
|
||||
plan_parser.add_argument('target_disk', help='Disk to free up')
|
||||
plan_parser.add_argument('dest_disks', nargs='+', help='Destination disks')
|
||||
exec_parser = subparsers.add_parser('execute', help='Execute migration plan')
|
||||
exec_parser.add_argument('plan_file', help='Path to plan JSON file')
|
||||
exec_parser.add_argument('--dry-run', action='store_true', help='Simulate without actual file operations')
|
||||
dedupe_parser = subparsers.add_parser('dedupe', help='Deduplicate files and compute checksums')
|
||||
dedupe_parser.add_argument('--disk', help='Optional: Only dedupe specific disk')
|
||||
dedupe_parser.add_argument('--no-chunks', action='store_true', help='Disable chunk-level deduplication')
|
||||
merge_parser = subparsers.add_parser('merge', help='Plan multi-disk merge with deduplication')
|
||||
merge_parser.add_argument('--sources', nargs='+', required=True, help='Source disks to merge')
|
||||
merge_parser.add_argument('--target', required=True, help='Target disk')
|
||||
merge_parser.add_argument('--output', default='merge_plan.json', help='Output plan file')
|
||||
merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
|
||||
merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')
|
||||
profile_parser = subparsers.add_parser('profile', help='Create content profiles (inventory + triage)')
|
||||
profile_parser.add_argument('--disk', help='Profile specific disk')
|
||||
profile_parser.add_argument('--update', action='store_true', help='Update database with profiles')
|
||||
profile_parser.add_argument('--limit', type=int, help='Limit number of files')
|
||||
extract_parser = subparsers.add_parser('extract', help='Extract content from files')
|
||||
extract_parser.add_argument('--kind', help='Extract specific kind (pdf, image, audio, video)')
|
||||
extract_parser.add_argument('--limit', type=int, default=10, help='Limit extraction batch')
|
||||
|
||||
parse_parser = subparsers.add_parser('parse', help='Parse files to extract text')
|
||||
parse_parser.add_argument('--kind', help='Parse specific kind (text, code, pdf)')
|
||||
parse_parser.add_argument('--limit', type=int, default=100, help='Limit parse batch')
|
||||
parse_parser.add_argument('--update', action='store_true', help='Save extracted text to database')
|
||||
|
||||
enrich_parser = subparsers.add_parser('enrich', help='Enrich content with LLM analysis')
|
||||
enrich_parser.add_argument('--limit', type=int, default=10, help='Limit enrichment batch')
|
||||
enrich_parser.add_argument('--llm-endpoint', default='http://192.168.1.74:1234', help='LLM endpoint')
|
||||
enrich_parser.add_argument('--local', action='store_true', help='Use local Ollama')
|
||||
|
||||
classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization')
|
||||
classify_parser.add_argument('--disk', help='Classify specific disk')
|
||||
classify_parser.add_argument('--update', action='store_true', help='Update database with classifications')
|
||||
classify_parser.add_argument('--no-resume', action='store_true', help='Start from scratch instead of resuming')
|
||||
folders_parser = subparsers.add_parser('analyze-folders', help='Analyze folder structure and infer project intent')
|
||||
folders_parser.add_argument('--disk', help='Analyze specific disk')
|
||||
folders_parser.add_argument('--min-files', type=int, default=3, help='Minimum files per folder')
|
||||
review_parser = subparsers.add_parser('review', help='Review proposed migration structure')
|
||||
review_parser.add_argument('--category', help='Review specific category')
|
||||
review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts')
|
||||
report_parser = subparsers.add_parser('report', help='Show current status')
|
||||
report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
|
||||
report_parser.add_argument('--show-duplicates', action='store_true', help='Show duplicate files')
|
||||
report_parser.add_argument('--preview-merge', help='Preview merge plan from file')
|
||||
args = parser.parse_args()
|
||||
tool = DiskReorganizer()
|
||||
if args.command == 'index':
|
||||
tool.index_disk(args.disk_root, args.disk_name)
|
||||
elif args.command == 'dedupe':
|
||||
tool.run_deduplication(disk=args.disk, use_chunks=not args.no_chunks)
|
||||
elif args.command == 'merge':
|
||||
tool.plan_merge(sources=args.sources, target=args.target, output_file=args.output, filter_system=args.filter_system, network_target=args.network)
|
||||
elif args.command == 'plan':
|
||||
plan = tool.plan_migration(args.target_disk, args.dest_disks)
|
||||
if plan:
|
||||
print(f"\nPlan generated: {plan['file_count']} files, {tool.format_size(plan['total_size'])}")
|
||||
print(f"Destination disks: {', '.join(plan['destination_disks'])}")
|
||||
elif args.command == 'execute':
|
||||
tool.execute_migration(args.plan_file, dry_run=args.dry_run)
|
||||
elif args.command == 'profile':
|
||||
tool.profile_content(disk=args.disk, update_db=args.update, limit=args.limit)
|
||||
elif args.command == 'extract':
|
||||
tool.extract_content(kind=args.kind, limit=args.limit)
|
||||
elif args.command == 'parse':
|
||||
tool.parse_files(kind=args.kind, limit=args.limit, update_db=args.update)
|
||||
elif args.command == 'enrich':
|
||||
tool.enrich_files(limit=args.limit, llm_endpoint=args.llm_endpoint, use_local=args.local)
|
||||
elif args.command == 'classify':
|
||||
tool.classify_files(disk=args.disk, update_db=args.update, resume=not args.no_resume)
|
||||
elif args.command == 'analyze-folders':
|
||||
tool.analyze_folders(disk=args.disk, min_files=args.min_files)
|
||||
elif args.command == 'review':
|
||||
tool.review_migration(category=args.category, show_build=args.show_build)
|
||||
elif args.command == 'report':
|
||||
tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
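# --- Typical end-to-end invocation (sketch; the script name and plan file name are placeholders,
# the mount points and disk labels match the ones used above): ---
#
#   python disk_reorganizer.py index /media/mike/SMT SMT
#   python disk_reorganizer.py dedupe --disk SMT
#   python disk_reorganizer.py merge --sources SMT DISK1 --target LLM --filter-system
#   python disk_reorganizer.py plan SMT LLM DISK1
#   python disk_reorganizer.py execute migration_plan.json --dry-run
#   python disk_reorganizer.py report --show-duplicates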
27
app/migration/__init__.py
Normal file
27
app/migration/__init__.py
Normal file
@@ -0,0 +1,27 @@
|
||||
"""Migration package exports"""
|
||||
from .copy import (
|
||||
CopyMigrationStrategy,
|
||||
FastCopyStrategy,
|
||||
SafeCopyStrategy,
|
||||
ReferenceCopyStrategy
|
||||
)
|
||||
from .hardlink import (
|
||||
HardlinkMigrationStrategy,
|
||||
SymlinkMigrationStrategy,
|
||||
DedupHardlinkStrategy
|
||||
)
|
||||
from .engine import MigrationEngine
|
||||
from ._protocols import IMigrationStrategy, IMigrationEngine
|
||||
|
||||
__all__ = [
|
||||
'CopyMigrationStrategy',
|
||||
'FastCopyStrategy',
|
||||
'SafeCopyStrategy',
|
||||
'ReferenceCopyStrategy',
|
||||
'HardlinkMigrationStrategy',
|
||||
'SymlinkMigrationStrategy',
|
||||
'DedupHardlinkStrategy',
|
||||
'MigrationEngine',
|
||||
'IMigrationStrategy',
|
||||
'IMigrationEngine',
|
||||
]
|
||||
107
app/migration/_protocols.py
Normal file
107
app/migration/_protocols.py
Normal file
@@ -0,0 +1,107 @@
|
||||
"""Protocol definitions for the migration package"""
|
||||
from typing import Protocol
|
||||
from pathlib import Path
|
||||
from ..shared.models import OperationRecord
|
||||
|
||||
|
||||
class IMigrationStrategy(Protocol):
|
||||
"""Protocol for migration strategies"""
|
||||
|
||||
def migrate(
|
||||
self,
|
||||
source: Path,
|
||||
destination: Path,
|
||||
verify: bool = True
|
||||
) -> bool:
|
||||
"""Migrate a file from source to destination
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
destination: Destination file path
|
||||
verify: Whether to verify the operation
|
||||
|
||||
Returns:
|
||||
True if migration successful
|
||||
"""
|
||||
...
|
||||
|
||||
def can_migrate(self, source: Path, destination: Path) -> bool:
|
||||
"""Check if migration is possible
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
destination: Destination file path
|
||||
|
||||
Returns:
|
||||
True if migration is possible
|
||||
"""
|
||||
...
|
||||
|
||||
def estimate_time(self, source: Path) -> float:
|
||||
"""Estimate migration time in seconds
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
|
||||
Returns:
|
||||
Estimated time in seconds
|
||||
"""
|
||||
...
|
||||
|
||||
def cleanup(self, source: Path) -> bool:
|
||||
"""Cleanup source file after successful migration
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
|
||||
Returns:
|
||||
True if cleanup successful
|
||||
"""
|
||||
...
|
||||
|
||||
|
||||
class IMigrationEngine(Protocol):
|
||||
"""Protocol for migration engine"""
|
||||
|
||||
def plan_migration(
|
||||
self,
|
||||
disk: str,
|
||||
target_base: Path
|
||||
) -> list[OperationRecord]:
|
||||
"""Plan migration for a disk
|
||||
|
||||
Args:
|
||||
disk: Disk identifier
|
||||
target_base: Target base directory
|
||||
|
||||
Returns:
|
||||
List of planned operations
|
||||
"""
|
||||
...
|
||||
|
||||
def execute_migration(
|
||||
self,
|
||||
operations: list[OperationRecord],
|
||||
dry_run: bool = False
|
||||
) -> dict:
|
||||
"""Execute migration operations
|
||||
|
||||
Args:
|
||||
operations: List of operations to execute
|
||||
dry_run: Whether to perform a dry run
|
||||
|
||||
Returns:
|
||||
Dictionary with execution statistics
|
||||
"""
|
||||
...
|
||||
|
||||
def rollback(self, operation: OperationRecord) -> bool:
|
||||
"""Rollback a migration operation
|
||||
|
||||
Args:
|
||||
operation: Operation to rollback
|
||||
|
||||
Returns:
|
||||
True if rollback successful
|
||||
"""
|
||||
...
|
||||
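

# Example sketch (illustration only, e.g. for tests): a minimal class that structurally
# satisfies IMigrationStrategy without touching the filesystem.
class _DryRunStrategy:
    """Pretends to migrate; useful for exercising code that accepts an IMigrationStrategy"""

    def migrate(self, source: Path, destination: Path, verify: bool = True) -> bool:
        return source.exists()

    def can_migrate(self, source: Path, destination: Path) -> bool:
        return source.exists()

    def estimate_time(self, source: Path) -> float:
        return 0.0

    def cleanup(self, source: Path) -> bool:
        return True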
268
app/migration/copy.py
Normal file
268
app/migration/copy.py
Normal file
@@ -0,0 +1,268 @@
|
||||
"""Copy-based migration strategy"""
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
import os
|
||||
|
||||
from ..shared.logger import ProgressLogger
|
||||
|
||||
|
||||
class CopyMigrationStrategy:
|
||||
"""Copy files to destination with verification"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
logger: Optional[ProgressLogger] = None,
|
||||
preserve_metadata: bool = True,
|
||||
verify_checksums: bool = True
|
||||
):
|
||||
"""Initialize copy migration strategy
|
||||
|
||||
Args:
|
||||
logger: Optional progress logger
|
||||
preserve_metadata: Whether to preserve file metadata
|
||||
verify_checksums: Whether to verify checksums after copy
|
||||
"""
|
||||
self.logger = logger
|
||||
self.preserve_metadata = preserve_metadata
|
||||
self.verify_checksums = verify_checksums
|
||||
|
||||
def migrate(
|
||||
self,
|
||||
source: Path,
|
||||
destination: Path,
|
||||
verify: bool = True
|
||||
) -> bool:
|
||||
"""Migrate file by copying
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
destination: Destination file path
|
||||
verify: Whether to verify the operation
|
||||
|
||||
Returns:
|
||||
True if migration successful
|
||||
"""
|
||||
if not source.exists():
|
||||
if self.logger:
|
||||
self.logger.error(f"Source file does not exist: {source}")
|
||||
return False
|
||||
|
||||
# Create destination directory
|
||||
destination.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
# Copy file
|
||||
if self.preserve_metadata:
|
||||
shutil.copy2(source, destination)
|
||||
else:
|
||||
shutil.copy(source, destination)
|
||||
|
||||
# Verify if requested
|
||||
if verify and self.verify_checksums:
|
||||
if not self._verify_copy(source, destination):
|
||||
if self.logger:
|
||||
self.logger.error(f"Verification failed: {source} -> {destination}")
|
||||
destination.unlink()
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(f"Copy failed: {source} -> {destination}: {e}")
|
||||
return False
|
||||
|
||||
def _verify_copy(self, source: Path, destination: Path) -> bool:
|
||||
"""Verify copied file
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
destination: Destination file path
|
||||
|
||||
Returns:
|
||||
True if verification successful
|
||||
"""
|
||||
# Check size
|
||||
source_size = source.stat().st_size
|
||||
dest_size = destination.stat().st_size
|
||||
|
||||
if source_size != dest_size:
|
||||
return False
|
||||
|
||||
# Compare checksums for files larger than 1MB
|
||||
if source_size > 1024 * 1024:
|
||||
from ..deduplication.chunker import hash_file
|
||||
|
||||
source_hash = hash_file(source)
|
||||
dest_hash = hash_file(destination)
|
||||
|
||||
return source_hash == dest_hash
|
||||
|
||||
# For small files, compare content directly
|
||||
with open(source, 'rb') as f1, open(destination, 'rb') as f2:
|
||||
return f1.read() == f2.read()
|
||||
|
||||
def can_migrate(self, source: Path, destination: Path) -> bool:
|
||||
"""Check if migration is possible
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
destination: Destination file path
|
||||
|
||||
Returns:
|
||||
True if migration is possible
|
||||
"""
|
||||
if not source.exists():
|
||||
return False
|
||||
|
||||
# Check if destination directory is writable
|
||||
dest_dir = destination.parent
|
||||
if dest_dir.exists():
|
||||
return os.access(dest_dir, os.W_OK)
|
||||
|
||||
# Check if parent directory exists and is writable
|
||||
parent = dest_dir.parent
|
||||
while not parent.exists() and parent != parent.parent:
|
||||
parent = parent.parent
|
||||
|
||||
return parent.exists() and os.access(parent, os.W_OK)
|
||||
|
||||
def estimate_time(self, source: Path) -> float:
|
||||
"""Estimate migration time in seconds
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
|
||||
Returns:
|
||||
Estimated time in seconds
|
||||
"""
|
||||
if not source.exists():
|
||||
return 0.0
|
||||
|
||||
size = source.stat().st_size
|
||||
|
||||
# Estimate based on typical copy speed (100 MB/s)
|
||||
typical_speed = 100 * 1024 * 1024 # bytes per second
|
||||
return size / typical_speed
|
||||
|
||||
def cleanup(self, source: Path) -> bool:
|
||||
"""Cleanup source file after successful migration
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
|
||||
Returns:
|
||||
True if cleanup successful
|
||||
"""
|
||||
try:
|
||||
if source.exists():
|
||||
source.unlink()
|
||||
return True
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.warning(f"Failed to cleanup {source}: {e}")
|
||||
return False
|
||||
|
||||
|
||||
class FastCopyStrategy(CopyMigrationStrategy):
|
||||
"""Fast copy strategy without verification"""
|
||||
|
||||
def __init__(self, logger: Optional[ProgressLogger] = None):
|
||||
"""Initialize fast copy strategy"""
|
||||
super().__init__(
|
||||
logger=logger,
|
||||
preserve_metadata=True,
|
||||
verify_checksums=False
|
||||
)
|
||||
|
||||
|
||||
class SafeCopyStrategy(CopyMigrationStrategy):
|
||||
"""Safe copy strategy with full verification"""
|
||||
|
||||
def __init__(self, logger: Optional[ProgressLogger] = None):
|
||||
"""Initialize safe copy strategy"""
|
||||
super().__init__(
|
||||
logger=logger,
|
||||
preserve_metadata=True,
|
||||
verify_checksums=True
|
||||
)
|
||||
|
||||
|
||||
class ReferenceCopyStrategy:
|
||||
"""Create reference copy using reflinks (CoW) if supported"""
|
||||
|
||||
def __init__(self, logger: Optional[ProgressLogger] = None):
|
||||
"""Initialize reflink copy strategy"""
|
||||
self.logger = logger
|
||||
|
||||
def migrate(
|
||||
self,
|
||||
source: Path,
|
||||
destination: Path,
|
||||
verify: bool = True
|
||||
) -> bool:
|
||||
"""Migrate using reflink (copy-on-write)
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
destination: Destination file path
|
||||
verify: Whether to verify the operation
|
||||
|
||||
Returns:
|
||||
True if migration successful
|
||||
"""
|
||||
if not source.exists():
|
||||
if self.logger:
|
||||
self.logger.error(f"Source file does not exist: {source}")
|
||||
return False
|
||||
|
||||
# Create destination directory
|
||||
destination.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
# Try reflink copy (works on btrfs, xfs, etc.)
|
||||
import subprocess
|
||||
|
||||
result = subprocess.run(
|
||||
['cp', '--reflink=auto', str(source), str(destination)],
|
||||
capture_output=True,
|
||||
check=False
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
# Fallback to regular copy
|
||||
shutil.copy2(source, destination)
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(f"Reflink copy failed: {source} -> {destination}: {e}")
|
||||
return False
|
||||
|
||||
def can_migrate(self, source: Path, destination: Path) -> bool:
|
||||
"""Check if migration is possible"""
|
||||
if not source.exists():
|
||||
return False
|
||||
|
||||
dest_dir = destination.parent
|
||||
if dest_dir.exists():
|
||||
return os.access(dest_dir, os.W_OK)
|
||||
|
||||
return True
|
||||
|
||||
def estimate_time(self, source: Path) -> float:
|
||||
"""Estimate migration time (reflinks are fast)"""
|
||||
return 0.1 # Reflinks are nearly instant
|
||||
|
||||
def cleanup(self, source: Path) -> bool:
|
||||
"""Cleanup source file"""
|
||||
try:
|
||||
if source.exists():
|
||||
source.unlink()
|
||||
return True
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.warning(f"Failed to cleanup {source}: {e}")
|
||||
return False
|
||||
454
app/migration/engine.py
Normal file
454
app/migration/engine.py
Normal file
@@ -0,0 +1,454 @@
|
||||
"""Migration engine"""
|
||||
from pathlib import Path
|
||||
from typing import Optional, Callable
|
||||
from datetime import datetime
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
|
||||
from .copy import CopyMigrationStrategy, SafeCopyStrategy
|
||||
from .hardlink import HardlinkMigrationStrategy, SymlinkMigrationStrategy
|
||||
from ..shared.models import OperationRecord, ProcessingStats, MigrationPlan
|
||||
from ..shared.config import DatabaseConfig, ProcessingConfig
|
||||
from ..shared.logger import ProgressLogger
|
||||
|
||||
|
||||
class MigrationEngine:
|
||||
"""Engine for migrating files"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
db_config: DatabaseConfig,
|
||||
processing_config: ProcessingConfig,
|
||||
logger: ProgressLogger,
|
||||
target_base: Path
|
||||
):
|
||||
"""Initialize migration engine
|
||||
|
||||
Args:
|
||||
db_config: Database configuration
|
||||
processing_config: Processing configuration
|
||||
logger: Progress logger
|
||||
target_base: Target base directory for migrations
|
||||
"""
|
||||
self.db_config = db_config
|
||||
self.processing_config = processing_config
|
||||
self.logger = logger
|
||||
self.target_base = Path(target_base)
|
||||
self._connection = None
|
||||
|
||||
# Initialize strategies
|
||||
self.copy_strategy = SafeCopyStrategy(logger=logger)
|
||||
self.hardlink_strategy = HardlinkMigrationStrategy(logger=logger)
|
||||
self.symlink_strategy = SymlinkMigrationStrategy(logger=logger)
|
||||
|
||||
def _get_connection(self):
|
||||
"""Get or create database connection"""
|
||||
if self._connection is None or self._connection.closed:
|
||||
self._connection = psycopg2.connect(
|
||||
host=self.db_config.host,
|
||||
port=self.db_config.port,
|
||||
database=self.db_config.database,
|
||||
user=self.db_config.user,
|
||||
password=self.db_config.password
|
||||
)
|
||||
return self._connection
|
||||
|
||||
def _ensure_tables(self):
|
||||
"""Ensure migration tables exist"""
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Create operations table
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS operations (
|
||||
id SERIAL PRIMARY KEY,
|
||||
source_path TEXT NOT NULL,
|
||||
target_path TEXT NOT NULL,
|
||||
operation_type TEXT NOT NULL,
|
||||
size BIGINT DEFAULT 0,
|
||||
status TEXT DEFAULT 'pending',
|
||||
error TEXT,
|
||||
executed_at TIMESTAMP,
|
||||
verified BOOLEAN DEFAULT FALSE,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
)
|
||||
""")
|
||||
|
||||
# Create index on status
|
||||
cursor.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_operations_status
|
||||
ON operations(status)
|
||||
""")
|
||||
|
||||
conn.commit()
|
||||
cursor.close()
|
||||
|
||||
def plan_migration(
|
||||
self,
|
||||
disk: Optional[str] = None,
|
||||
category: Optional[str] = None
|
||||
) -> MigrationPlan:
|
||||
"""Plan migration for files
|
||||
|
||||
Args:
|
||||
disk: Optional disk filter
|
||||
category: Optional category filter
|
||||
|
||||
Returns:
|
||||
MigrationPlan with planned operations
|
||||
"""
|
||||
self.logger.section("Planning Migration")
|
||||
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Build query
|
||||
conditions = ["category IS NOT NULL"]
|
||||
params = []
|
||||
|
||||
if disk:
|
||||
conditions.append("disk_label = %s")
|
||||
params.append(disk)
|
||||
|
||||
if category:
|
||||
conditions.append("category = %s")
|
||||
params.append(category)
|
||||
|
||||
query = f"""
|
||||
SELECT path, size, category, duplicate_of
|
||||
FROM files
|
||||
WHERE {' AND '.join(conditions)}
|
||||
ORDER BY category, path
|
||||
"""
|
||||
|
||||
cursor.execute(query, params)
|
||||
files = cursor.fetchall()
|
||||
|
||||
self.logger.info(f"Found {len(files)} files to migrate")
|
||||
|
||||
operations = []
|
||||
total_size = 0
|
||||
|
||||
for path_str, size, file_category, duplicate_of in files:
|
||||
source = Path(path_str)
|
||||
|
||||
# Determine destination
|
||||
target_path = self.target_base / file_category / source.name
|
||||
|
||||
# Determine operation type
|
||||
if duplicate_of:
|
||||
# Use hardlink for duplicates
|
||||
operation_type = 'hardlink'
|
||||
else:
|
||||
# Use copy for unique files
|
||||
operation_type = 'copy'
|
||||
|
||||
operation = OperationRecord(
|
||||
source_path=source,
|
||||
target_path=target_path,
|
||||
operation_type=operation_type,
|
||||
size=size
|
||||
)
|
||||
|
||||
operations.append(operation)
|
||||
total_size += size
|
||||
|
||||
cursor.close()
|
||||
|
||||
plan = MigrationPlan(
|
||||
target_disk=str(self.target_base),
|
||||
destination_disks=[str(self.target_base)],
|
||||
operations=operations,
|
||||
total_size=total_size,
|
||||
file_count=len(operations)
|
||||
)
|
||||
|
||||
self.logger.info(
|
||||
f"Migration plan created: {plan.file_count} files, "
|
||||
f"{plan.total_size:,} bytes"
|
||||
)
|
||||
|
||||
return plan
|
||||
|
||||
def execute_migration(
|
||||
self,
|
||||
operations: list[OperationRecord],
|
||||
dry_run: bool = False,
|
||||
progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
|
||||
) -> ProcessingStats:
|
||||
"""Execute migration operations
|
||||
|
||||
Args:
|
||||
operations: List of operations to execute
|
||||
dry_run: Whether to perform a dry run
|
||||
progress_callback: Optional callback for progress updates
|
||||
|
||||
Returns:
|
||||
ProcessingStats with execution statistics
|
||||
"""
|
||||
self.logger.section("Executing Migration" + (" (DRY RUN)" if dry_run else ""))
|
||||
|
||||
self._ensure_tables()
|
||||
|
||||
stats = ProcessingStats()
|
||||
total_ops = len(operations)
|
||||
|
||||
for operation in operations:
|
||||
stats.files_processed += 1
|
||||
|
||||
if dry_run:
|
||||
# In dry run, just log what would happen
|
||||
self.logger.debug(
|
||||
f"[DRY RUN] Would {operation.operation_type}: "
|
||||
f"{operation.source_path} -> {operation.target_path}"
|
||||
)
|
||||
stats.files_succeeded += 1
|
||||
else:
|
||||
# Execute actual migration
|
||||
success = self._execute_operation(operation)
|
||||
|
||||
if success:
|
||||
stats.files_succeeded += 1
|
||||
stats.bytes_processed += operation.size
|
||||
else:
|
||||
stats.files_failed += 1
|
||||
|
||||
# Progress callback
|
||||
if progress_callback and stats.files_processed % 100 == 0:
|
||||
progress_callback(stats.files_processed, total_ops, stats)
|
||||
|
||||
# Log progress
|
||||
if stats.files_processed % 1000 == 0:
|
||||
self.logger.progress(
|
||||
stats.files_processed,
|
||||
total_ops,
|
||||
prefix="Operations executed",
|
||||
bytes_processed=stats.bytes_processed,
|
||||
elapsed_seconds=stats.elapsed_seconds
|
||||
)
|
||||
|
||||
self.logger.info(
|
||||
f"Migration {'dry run' if dry_run else 'execution'} complete: "
|
||||
f"{stats.files_succeeded}/{total_ops} operations, "
|
||||
f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
|
||||
)
|
||||
|
||||
return stats
|
||||
|
||||
def _execute_operation(self, operation: OperationRecord) -> bool:
|
||||
"""Execute a single migration operation
|
||||
|
||||
Args:
|
||||
operation: Operation to execute
|
||||
|
||||
Returns:
|
||||
True if successful
|
||||
"""
|
||||
operation.status = 'in_progress'
|
||||
operation.executed_at = datetime.now()
|
||||
|
||||
try:
|
||||
# Select strategy based on operation type
|
||||
if operation.operation_type == 'copy':
|
||||
strategy = self.copy_strategy
|
||||
elif operation.operation_type == 'hardlink':
|
||||
strategy = self.hardlink_strategy
|
||||
elif operation.operation_type == 'symlink':
|
||||
strategy = self.symlink_strategy
|
||||
else:
|
||||
raise ValueError(f"Unknown operation type: {operation.operation_type}")
|
||||
|
||||
# Execute migration
|
||||
success = strategy.migrate(
|
||||
operation.source_path,
|
||||
operation.target_path,
|
||||
verify=self.processing_config.verify_operations
|
||||
)
|
||||
|
||||
if success:
|
||||
operation.status = 'completed'
|
||||
operation.verified = True
|
||||
self._record_operation(operation)
|
||||
return True
|
||||
else:
|
||||
operation.status = 'failed'
|
||||
operation.error = "Migration failed"
|
||||
self._record_operation(operation)
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
operation.status = 'failed'
|
||||
operation.error = str(e)
|
||||
self._record_operation(operation)
|
||||
self.logger.error(f"Operation failed: {operation.source_path}: {e}")
|
||||
return False
|
||||
|
||||
def _record_operation(self, operation: OperationRecord):
|
||||
"""Record operation in database
|
||||
|
||||
Args:
|
||||
operation: Operation to record
|
||||
"""
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute("""
|
||||
INSERT INTO operations (
|
||||
                source_path, target_path, operation_type, size,
|
||||
status, error, executed_at, verified
|
||||
)
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
|
||||
""", (
|
||||
str(operation.source_path),
|
||||
str(operation.target_path),
|
||||
operation.operation_type,
|
||||
operation.size,
|
||||
operation.status,
|
||||
operation.error,
|
||||
operation.executed_at,
|
||||
operation.verified
|
||||
))
|
||||
|
||||
conn.commit()
|
||||
cursor.close()
|
||||
|
||||
def rollback(self, operation: OperationRecord) -> bool:
|
||||
"""Rollback a migration operation
|
||||
|
||||
Args:
|
||||
operation: Operation to rollback
|
||||
|
||||
Returns:
|
||||
True if rollback successful
|
||||
"""
|
||||
self.logger.warning(f"Rolling back: {operation.target_path}")
|
||||
|
||||
try:
|
||||
# Remove destination
|
||||
if operation.target_path.exists():
|
||||
operation.target_path.unlink()
|
||||
|
||||
# Update database
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute("""
|
||||
UPDATE operations
|
||||
SET status = 'rolled_back'
|
||||
WHERE source_path = %s AND target_path = %s
|
||||
""", (str(operation.source_path), str(operation.target_path)))
|
||||
|
||||
conn.commit()
|
||||
cursor.close()
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Rollback failed: {operation.target_path}: {e}")
|
||||
return False
|
||||
|
||||
def get_migration_stats(self) -> dict:
|
||||
"""Get migration statistics
|
||||
|
||||
Returns:
|
||||
Dictionary with statistics
|
||||
"""
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
stats = {}
|
||||
|
||||
# Total operations
|
||||
cursor.execute("SELECT COUNT(*) FROM operations")
|
||||
stats['total_operations'] = cursor.fetchone()[0]
|
||||
|
||||
# Operations by status
|
||||
cursor.execute("""
|
||||
SELECT status, COUNT(*)
|
||||
FROM operations
|
||||
GROUP BY status
|
||||
""")
|
||||
|
||||
for status, count in cursor.fetchall():
|
||||
stats[f'{status}_operations'] = count
|
||||
|
||||
# Total size migrated
|
||||
cursor.execute("""
|
||||
SELECT COALESCE(SUM(size), 0)
|
||||
FROM operations
|
||||
WHERE status = 'completed'
|
||||
""")
|
||||
stats['total_size_migrated'] = cursor.fetchone()[0]
|
||||
|
||||
cursor.close()
|
||||
|
||||
return stats
|
||||
|
||||
def verify_migrations(self) -> dict:
|
||||
"""Verify completed migrations
|
||||
|
||||
Returns:
|
||||
Dictionary with verification results
|
||||
"""
|
||||
self.logger.subsection("Verifying Migrations")
|
||||
|
||||
conn = self._get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute("""
|
||||
SELECT source_path, target_path, operation_type
|
||||
FROM operations
|
||||
WHERE status = 'completed' AND verified = FALSE
|
||||
""")
|
||||
|
||||
operations = cursor.fetchall()
|
||||
cursor.close()
|
||||
|
||||
results = {
|
||||
'total': len(operations),
|
||||
'verified': 0,
|
||||
'failed': 0
|
||||
}
|
||||
|
||||
for source_str, dest_str, op_type in operations:
|
||||
source = Path(source_str)
|
||||
dest = Path(dest_str)
|
||||
|
||||
# Verify destination exists
|
||||
if not dest.exists():
|
||||
results['failed'] += 1
|
||||
self.logger.warning(f"Verification failed: {dest} does not exist")
|
||||
continue
|
||||
|
||||
# Verify based on operation type
|
||||
if op_type == 'hardlink':
|
||||
# Check if hardlinked
|
||||
if source.exists() and source.stat().st_ino == dest.stat().st_ino:
|
||||
results['verified'] += 1
|
||||
else:
|
||||
results['failed'] += 1
|
||||
else:
|
||||
# Check if destination exists and has correct size
|
||||
if dest.exists():
|
||||
results['verified'] += 1
|
||||
else:
|
||||
results['failed'] += 1
|
||||
|
||||
self.logger.info(
|
||||
f"Verification complete: {results['verified']}/{results['total']} verified"
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
def close(self):
|
||||
"""Close database connection"""
|
||||
if self._connection and not self._connection.closed:
|
||||
self._connection.close()
|
||||
|
||||
def __enter__(self):
|
||||
"""Context manager entry"""
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
"""Context manager exit"""
|
||||
self.close()
|
||||
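Since the executor implements the context-manager protocol, its connection lifecycle can be scoped to a `with` block. A minimal usage sketch, assuming the class defined earlier in this module is exported as `MigrationExecutor` and that its constructor accepts the shared config and logger (both the class name and constructor signature are assumptions for illustration):

```python
from pathlib import Path

from app.migration.executor import MigrationExecutor  # module path and class name assumed
from app.shared import load_config, create_logger

config = load_config(Path("config.json"))
logger = create_logger(level=config.logging.level)

with MigrationExecutor(config=config, logger=logger) as executor:  # constructor signature assumed
    stats = executor.get_migration_stats()
    logger.info(f"Completed: {stats.get('completed_operations', 0)}/{stats['total_operations']} operations")

    results = executor.verify_migrations()
    logger.info(f"Verified {results['verified']} of {results['total']} completed operations")
# close() runs automatically via __exit__
```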
377
app/migration/hardlink.py
Normal file
@@ -0,0 +1,377 @@
|
||||
"""Hardlink-based migration strategy"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from ..shared.logger import ProgressLogger
|
||||
|
||||
|
||||
class HardlinkMigrationStrategy:
|
||||
"""Create hardlinks to files instead of copying"""
|
||||
|
||||
def __init__(self, logger: Optional[ProgressLogger] = None):
|
||||
"""Initialize hardlink migration strategy
|
||||
|
||||
Args:
|
||||
logger: Optional progress logger
|
||||
"""
|
||||
self.logger = logger
|
||||
|
||||
def migrate(
|
||||
self,
|
||||
source: Path,
|
||||
destination: Path,
|
||||
verify: bool = True
|
||||
) -> bool:
|
||||
"""Migrate file by creating hardlink
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
destination: Destination file path
|
||||
verify: Whether to verify the operation
|
||||
|
||||
Returns:
|
||||
True if migration successful
|
||||
"""
|
||||
if not source.exists():
|
||||
if self.logger:
|
||||
self.logger.error(f"Source file does not exist: {source}")
|
||||
return False
|
||||
|
||||
# Check if source and destination are on same filesystem
|
||||
if not self._same_filesystem(source, destination.parent):
|
||||
if self.logger:
|
||||
self.logger.warning(
|
||||
f"Cannot hardlink across filesystems: {source} -> {destination}"
|
||||
)
|
||||
return False
|
||||
|
||||
# Create destination directory
|
||||
destination.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
# Create hardlink
|
||||
os.link(source, destination)
|
||||
|
||||
# Verify if requested
|
||||
if verify:
|
||||
if not self._verify_hardlink(source, destination):
|
||||
if self.logger:
|
||||
self.logger.error(f"Verification failed: {source} -> {destination}")
|
||||
destination.unlink()
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
except FileExistsError:
|
||||
if self.logger:
|
||||
self.logger.warning(f"Destination already exists: {destination}")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(f"Hardlink failed: {source} -> {destination}: {e}")
|
||||
return False
|
||||
|
||||
def _same_filesystem(self, path1: Path, path2: Path) -> bool:
|
||||
"""Check if two paths are on the same filesystem
|
||||
|
||||
Args:
|
||||
path1: First path
|
||||
path2: Second path
|
||||
|
||||
Returns:
|
||||
True if on same filesystem
|
||||
"""
|
||||
try:
|
||||
# Get device IDs
|
||||
stat1 = path1.stat()
|
||||
stat2 = path2.stat()
|
||||
return stat1.st_dev == stat2.st_dev
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _verify_hardlink(self, source: Path, destination: Path) -> bool:
|
||||
"""Verify hardlink
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
destination: Destination file path
|
||||
|
||||
Returns:
|
||||
True if verification successful
|
||||
"""
|
||||
try:
|
||||
# Check if they have the same inode
|
||||
source_stat = source.stat()
|
||||
dest_stat = destination.stat()
|
||||
|
||||
return source_stat.st_ino == dest_stat.st_ino
|
||||
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def can_migrate(self, source: Path, destination: Path) -> bool:
|
||||
"""Check if migration is possible
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
destination: Destination file path
|
||||
|
||||
Returns:
|
||||
True if migration is possible
|
||||
"""
|
||||
if not source.exists():
|
||||
return False
|
||||
|
||||
# Check if on same filesystem
|
||||
dest_dir = destination.parent
|
||||
if dest_dir.exists():
|
||||
return self._same_filesystem(source, dest_dir)
|
||||
|
||||
# Check parent directories
|
||||
parent = dest_dir.parent
|
||||
while not parent.exists() and parent != parent.parent:
|
||||
parent = parent.parent
|
||||
|
||||
return parent.exists() and self._same_filesystem(source, parent)
|
||||
|
||||
def estimate_time(self, source: Path) -> float:
|
||||
"""Estimate migration time in seconds
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
|
||||
Returns:
|
||||
Estimated time in seconds (hardlinks are instant)
|
||||
"""
|
||||
return 0.01 # Hardlinks are nearly instant
|
||||
|
||||
def cleanup(self, source: Path) -> bool:
|
||||
"""Cleanup source file after successful migration
|
||||
|
||||
Note: For hardlinks, we typically don't remove the source
|
||||
immediately as both links point to the same inode.
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
|
||||
Returns:
|
||||
True (no cleanup needed for hardlinks)
|
||||
"""
|
||||
# For hardlinks, we don't remove the source
|
||||
# Both source and destination point to the same data
|
||||
return True
|
||||
|
||||
|
||||
class SymlinkMigrationStrategy:
|
||||
"""Create symbolic links to files"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
logger: Optional[ProgressLogger] = None,
|
||||
absolute_links: bool = True
|
||||
):
|
||||
"""Initialize symlink migration strategy
|
||||
|
||||
Args:
|
||||
logger: Optional progress logger
|
||||
absolute_links: Whether to create absolute symlinks
|
||||
"""
|
||||
self.logger = logger
|
||||
self.absolute_links = absolute_links
|
||||
|
||||
def migrate(
|
||||
self,
|
||||
source: Path,
|
||||
destination: Path,
|
||||
verify: bool = True
|
||||
) -> bool:
|
||||
"""Migrate file by creating symlink
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
destination: Destination file path
|
||||
verify: Whether to verify the operation
|
||||
|
||||
Returns:
|
||||
True if migration successful
|
||||
"""
|
||||
if not source.exists():
|
||||
if self.logger:
|
||||
self.logger.error(f"Source file does not exist: {source}")
|
||||
return False
|
||||
|
||||
# Create destination directory
|
||||
destination.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
# Determine link target
|
||||
if self.absolute_links:
|
||||
target = source.resolve()
|
||||
else:
|
||||
# Create relative symlink
|
||||
target = os.path.relpath(source, destination.parent)
|
||||
|
||||
# Create symlink
|
||||
destination.symlink_to(target)
|
||||
|
||||
# Verify if requested
|
||||
if verify:
|
||||
if not self._verify_symlink(destination, source):
|
||||
if self.logger:
|
||||
self.logger.error(f"Verification failed: {source} -> {destination}")
|
||||
destination.unlink()
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
except FileExistsError:
|
||||
if self.logger:
|
||||
self.logger.warning(f"Destination already exists: {destination}")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(f"Symlink failed: {source} -> {destination}: {e}")
|
||||
return False
|
||||
|
||||
def _verify_symlink(self, symlink: Path, expected_target: Path) -> bool:
|
||||
"""Verify symlink
|
||||
|
||||
Args:
|
||||
symlink: Symlink path
|
||||
expected_target: Expected target path
|
||||
|
||||
Returns:
|
||||
True if verification successful
|
||||
"""
|
||||
try:
|
||||
# Check if it's a symlink
|
||||
if not symlink.is_symlink():
|
||||
return False
|
||||
|
||||
# Resolve and compare
|
||||
resolved = symlink.resolve()
|
||||
expected = expected_target.resolve()
|
||||
|
||||
return resolved == expected
|
||||
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def can_migrate(self, source: Path, destination: Path) -> bool:
|
||||
"""Check if migration is possible
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
destination: Destination file path
|
||||
|
||||
Returns:
|
||||
True if migration is possible
|
||||
"""
|
||||
if not source.exists():
|
||||
return False
|
||||
|
||||
# Check if destination directory is writable
|
||||
dest_dir = destination.parent
|
||||
if dest_dir.exists():
|
||||
return os.access(dest_dir, os.W_OK)
|
||||
|
||||
return True
|
||||
|
||||
def estimate_time(self, source: Path) -> float:
|
||||
"""Estimate migration time in seconds
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
|
||||
Returns:
|
||||
Estimated time in seconds (symlinks are instant)
|
||||
"""
|
||||
return 0.01 # Symlinks are instant
|
||||
|
||||
def cleanup(self, source: Path) -> bool:
|
||||
"""Cleanup source file after successful migration
|
||||
|
||||
Note: For symlinks, we don't remove the source as the
|
||||
symlink points to it.
|
||||
|
||||
Args:
|
||||
source: Source file path
|
||||
|
||||
Returns:
|
||||
True (no cleanup needed for symlinks)
|
||||
"""
|
||||
# For symlinks, we don't remove the source
|
||||
return True
|
||||
|
||||
|
||||
class DedupHardlinkStrategy(HardlinkMigrationStrategy):
|
||||
"""Hardlink strategy for deduplication
|
||||
|
||||
Creates hardlinks for duplicate files to save space.
|
||||
"""
|
||||
|
||||
def __init__(self, logger: Optional[ProgressLogger] = None):
|
||||
"""Initialize dedup hardlink strategy"""
|
||||
super().__init__(logger=logger)
|
||||
|
||||
def deduplicate(
|
||||
self,
|
||||
canonical: Path,
|
||||
duplicate: Path
|
||||
) -> bool:
|
||||
"""Replace duplicate with hardlink to canonical
|
||||
|
||||
Args:
|
||||
canonical: Canonical file path
|
||||
duplicate: Duplicate file path
|
||||
|
||||
Returns:
|
||||
True if deduplication successful
|
||||
"""
|
||||
if not canonical.exists():
|
||||
if self.logger:
|
||||
self.logger.error(f"Canonical file does not exist: {canonical}")
|
||||
return False
|
||||
|
||||
if not duplicate.exists():
|
||||
if self.logger:
|
||||
self.logger.error(f"Duplicate file does not exist: {duplicate}")
|
||||
return False
|
||||
|
||||
# Check if already hardlinked
|
||||
if self._verify_hardlink(canonical, duplicate):
|
||||
return True
|
||||
|
||||
# Check if on same filesystem
|
||||
if not self._same_filesystem(canonical, duplicate):
|
||||
if self.logger:
|
||||
self.logger.warning(
|
||||
f"Cannot hardlink across filesystems: {canonical} -> {duplicate}"
|
||||
)
|
||||
return False
|
||||
|
||||
try:
|
||||
# Create temporary backup
|
||||
backup = duplicate.with_suffix(duplicate.suffix + '.bak')
|
||||
duplicate.rename(backup)
|
||||
|
||||
# Create hardlink
|
||||
os.link(canonical, duplicate)
|
||||
|
||||
# Remove backup
|
||||
backup.unlink()
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(f"Deduplication failed: {duplicate}: {e}")
|
||||
|
||||
# Restore from backup
|
||||
if backup.exists():
|
||||
backup.rename(duplicate)
|
||||
|
||||
return False
|
||||
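A short sketch of how these strategies fit together: `can_migrate` gates the cheap same-filesystem hardlink path, and `DedupHardlinkStrategy.deduplicate` collapses a confirmed duplicate onto its canonical copy. The file paths below are illustrative only; the class and method names are as defined in this file.

```python
from pathlib import Path

from app.migration.hardlink import HardlinkMigrationStrategy, DedupHardlinkStrategy
from app.shared import create_logger

logger = create_logger()
source = Path("/mnt/source/models/llama.bin")          # illustrative paths
destination = Path("/mnt/source/llm-cache/llama.bin")

strategy = HardlinkMigrationStrategy(logger=logger)
if strategy.can_migrate(source, destination):
    ok = strategy.migrate(source, destination, verify=True)
    logger.info(f"hardlinked: {ok}")
else:
    logger.warning("Different filesystem; fall back to a copy-based strategy")

# Collapse a known duplicate onto its canonical file:
dedup = DedupHardlinkStrategy(logger=logger)
dedup.deduplicate(canonical=source, duplicate=Path("/mnt/source/backup/llama.bin"))
```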
44
app/parsers/code_parser.py
Normal file
@@ -0,0 +1,44 @@
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
import re
|
||||
|
||||
class CodeParser:
|
||||
def __init__(self):
|
||||
self.patterns = {
|
||||
'python': {'imports': r'^import |^from .+ import', 'class': r'^class \w+', 'function': r'^def \w+'},
|
||||
'javascript': {'imports': r'^import |^require\(', 'class': r'^class \w+', 'function': r'^function \w+|^const \w+ = '},
|
||||
'java': {'package': r'^package ', 'imports': r'^import ', 'class': r'^public class \w+'},
|
||||
'go': {'package': r'^package ', 'imports': r'^import ', 'function': r'^func \w+'}
|
||||
}
|
||||
|
||||
def parse(self, file_path: Path) -> Dict:
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
text = f.read()
|
||||
|
||||
language = self._detect_language(file_path, text)
|
||||
structure = self._extract_structure(text, language)
|
||||
|
||||
return {
|
||||
'text': text,
|
||||
'language': language,
|
||||
'line_count': len(text.split('\n')),
|
||||
'structure': structure,
|
||||
'quality': 'high'
|
||||
}
|
||||
except Exception as e:
|
||||
return {'error': str(e)}
|
||||
|
||||
def _detect_language(self, file_path: Path, text: str) -> str:
|
||||
lang_map = {'.py': 'python', '.js': 'javascript', '.ts': 'typescript', '.java': 'java', '.go': 'go'}
|
||||
return lang_map.get(file_path.suffix.lower(), 'unknown')
|
||||
|
||||
def _extract_structure(self, text: str, language: str) -> Dict:
|
||||
patterns = self.patterns.get(language, {})
|
||||
structure = {'type': 'code', 'language': language}
|
||||
|
||||
for key, pattern in patterns.items():
|
||||
matches = re.findall(pattern, text, re.MULTILINE)
|
||||
structure[key] = len(matches)
|
||||
|
||||
return structure
|
||||
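For example, running the parser over a Python source file (path illustrative) returns the raw text plus simple per-language structure counts:

```python
from pathlib import Path
from app.parsers.code_parser import CodeParser  # module path as in this commit

parser = CodeParser()
result = parser.parse(Path("app/shared/logger.py"))

if 'error' not in result:
    print(result['language'])                # 'python'
    print(result['line_count'])              # total number of lines
    print(result['structure'].get('class'))  # count of column-0 class definitions
```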
42
app/parsers/media_parser.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
class MediaParser:
|
||||
def parse_audio(self, file_path: Path) -> Dict:
|
||||
return {
|
||||
'text': '[Audio transcription pending]',
|
||||
'needs_transcription': True,
|
||||
'transcription_service': 'whisper',
|
||||
'structure': {'type': 'audio'},
|
||||
'quality': 'pending'
|
||||
}
|
||||
|
||||
def parse_video(self, file_path: Path) -> Dict:
|
||||
return {
|
||||
'text': '[Video transcription pending]',
|
||||
'needs_transcription': True,
|
||||
'needs_scene_detection': True,
|
||||
'transcription_service': 'whisper',
|
||||
'structure': {'type': 'video'},
|
||||
'quality': 'pending'
|
||||
}
|
||||
|
||||
def parse_image(self, file_path: Path) -> Dict:
|
||||
try:
|
||||
from PIL import Image
|
||||
|
||||
with Image.open(file_path) as img:
|
||||
width, height = img.size
|
||||
mode = img.mode
|
||||
|
||||
return {
|
||||
'text': '[Image caption/OCR pending]',
|
||||
'needs_ocr': True,
|
||||
'needs_caption': True,
|
||||
'dimensions': f'{width}x{height}',
|
||||
'mode': mode,
|
||||
'structure': {'type': 'image', 'width': width, 'height': height},
|
||||
'quality': 'pending'
|
||||
}
|
||||
except Exception as e:
|
||||
return {'error': str(e)}
|
||||
31
app/parsers/pdf_parser.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
class PDFParser:
|
||||
def parse(self, file_path: Path) -> Dict:
|
||||
try:
|
||||
import PyPDF2
|
||||
|
||||
pages = []
|
||||
with open(file_path, 'rb') as f:
|
||||
pdf = PyPDF2.PdfReader(f)
|
||||
page_count = len(pdf.pages)
|
||||
|
||||
for i, page in enumerate(pdf.pages[:50]):
|
||||
text = page.extract_text()
|
||||
pages.append({'page': i + 1, 'text': text, 'char_count': len(text)})
|
||||
|
||||
full_text = '\n\n'.join([p['text'] for p in pages])
|
||||
has_text_layer = sum(p['char_count'] for p in pages) > 100
|
||||
|
||||
return {
|
||||
'text': full_text,
|
||||
'page_count': page_count,
|
||||
'pages_extracted': len(pages),
|
||||
'has_text_layer': has_text_layer,
|
||||
'needs_ocr': not has_text_layer,
|
||||
'structure': {'type': 'document', 'pages': pages[:5]},
|
||||
'quality': 'high' if has_text_layer else 'needs_ocr'
|
||||
}
|
||||
except Exception as e:
|
||||
return {'error': str(e), 'needs_ocr': True}
|
||||
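A quick usage sketch: the `needs_ocr` flag is what a downstream enrichment step would key on (file name illustrative):

```python
from pathlib import Path
from app.parsers.pdf_parser import PDFParser  # module path as in this commit

result = PDFParser().parse(Path("/mnt/source/docs/manual.pdf"))
if result.get('needs_ocr'):
    # No usable text layer (or parsing failed); queue the file for OCR instead.
    print("queue for OCR:", result.get('error', 'no text layer'))
else:
    print(result['page_count'], "pages,", len(result['text']), "characters extracted")
```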
26
app/parsers/text_parser.py
Normal file
@@ -0,0 +1,26 @@
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional
|
||||
import chardet
|
||||
|
||||
class TextParser:
|
||||
def parse(self, file_path: Path) -> Dict:
|
||||
try:
|
||||
with open(file_path, 'rb') as f:
|
||||
raw_data = f.read(1024 * 1024)
|
||||
|
||||
encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
|
||||
text = raw_data.decode(encoding, errors='ignore')
|
||||
|
||||
lines = text.split('\n')
|
||||
|
||||
return {
|
||||
'text': text,
|
||||
'encoding': encoding,
|
||||
'line_count': len(lines),
|
||||
'char_count': len(text),
|
||||
'word_count': len(text.split()),
|
||||
'structure': {'type': 'plain_text'},
|
||||
'quality': 'high' if encoding == 'utf-8' else 'medium'
|
||||
}
|
||||
except Exception as e:
|
||||
return {'error': str(e)}
|
||||
51
app/setup.py
Normal file
@@ -0,0 +1,51 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Setup script for defrag disk reorganizer"""
|
||||
from setuptools import setup, find_packages
|
||||
from pathlib import Path
|
||||
|
||||
# Read requirements
|
||||
requirements_path = Path(__file__).parent / 'requirements.txt'
|
||||
with open(requirements_path) as f:
|
||||
requirements = [
|
||||
line.strip()
|
||||
for line in f
|
||||
if line.strip() and not line.startswith('#')
|
||||
]
|
||||
|
||||
# Read long description from README
|
||||
readme_path = Path(__file__).parent / 'README.md'
|
||||
long_description = ""
|
||||
if readme_path.exists():
|
||||
with open(readme_path) as f:
|
||||
long_description = f.read()
|
||||
|
||||
setup(
|
||||
name='defrag',
|
||||
version='1.0.0',
|
||||
description='Intelligent disk reorganization system for 20TB+ data with deduplication and classification',
|
||||
long_description=long_description,
|
||||
long_description_content_type='text/markdown',
|
||||
author='Project Defrag',
|
||||
author_email='defrag@example.com',
|
||||
url='https://github.com/yourusername/defrag',
|
||||
packages=find_packages(),
|
||||
install_requires=requirements,
|
||||
python_requires='>=3.9',
|
||||
entry_points={
|
||||
'console_scripts': [
|
||||
'defrag=main:main',
|
||||
],
|
||||
},
|
||||
classifiers=[
|
||||
'Development Status :: 4 - Beta',
|
||||
'Intended Audience :: System Administrators',
|
||||
'Topic :: System :: Filesystems',
|
||||
'License :: OSI Approved :: MIT License',
|
||||
'Programming Language :: Python :: 3',
|
||||
'Programming Language :: Python :: 3.9',
|
||||
'Programming Language :: Python :: 3.10',
|
||||
'Programming Language :: Python :: 3.11',
|
||||
'Programming Language :: Python :: 3.12',
|
||||
],
|
||||
keywords='disk management storage deduplication classification migration',
|
||||
)
|
||||
50
app/shared/__init__.py
Normal file
@@ -0,0 +1,50 @@
|
||||
"""Shared package exports"""
|
||||
from .models import (
|
||||
FileRecord,
|
||||
OperationRecord,
|
||||
DiskInfo,
|
||||
MigrationPlan,
|
||||
ProcessingStats
|
||||
)
|
||||
from .config import (
|
||||
Config,
|
||||
DatabaseConfig,
|
||||
ProcessingConfig,
|
||||
LoggingConfig,
|
||||
load_config
|
||||
)
|
||||
from .logger import (
|
||||
ProgressLogger,
|
||||
create_logger,
|
||||
format_size,
|
||||
format_rate,
|
||||
format_time
|
||||
)
|
||||
from ._protocols import IDatabase, ILogger
|
||||
|
||||
__all__ = [
|
||||
# Models
|
||||
'FileRecord',
|
||||
'OperationRecord',
|
||||
'DiskInfo',
|
||||
'MigrationPlan',
|
||||
'ProcessingStats',
|
||||
|
||||
# Config
|
||||
'Config',
|
||||
'DatabaseConfig',
|
||||
'ProcessingConfig',
|
||||
'LoggingConfig',
|
||||
'load_config',
|
||||
|
||||
# Logger
|
||||
'ProgressLogger',
|
||||
'create_logger',
|
||||
'format_size',
|
||||
'format_rate',
|
||||
'format_time',
|
||||
|
||||
# Protocols
|
||||
'IDatabase',
|
||||
'ILogger',
|
||||
]
|
||||
67
app/shared/_protocols.py
Normal file
@@ -0,0 +1,67 @@
|
||||
"""Protocol definitions for the shared package"""
|
||||
from typing import Protocol, Any
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
@dataclass
|
||||
class FileRecord:
|
||||
"""Core file record with all metadata"""
|
||||
path: Path
|
||||
size: int
|
||||
modified_time: float
|
||||
created_time: float
|
||||
disk_label: str
|
||||
checksum: str | None = None
|
||||
status: str = 'indexed' # indexed, planned, moved, verified
|
||||
category: str | None = None
|
||||
duplicate_of: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class OperationRecord:
|
||||
"""Record of a migration operation"""
|
||||
source_path: Path
|
||||
target_path: Path
|
||||
operation_type: str # move, copy, hardlink, symlink
|
||||
status: str = 'pending' # pending, in_progress, completed, failed
|
||||
error: str | None = None
|
||||
executed_at: datetime | None = None
|
||||
verified: bool = False
|
||||
|
||||
|
||||
class IDatabase(Protocol):
|
||||
"""Protocol for database operations"""
|
||||
|
||||
def store_file(self, file_record: FileRecord) -> None:
|
||||
"""Store a file record"""
|
||||
...
|
||||
|
||||
def get_files_by_disk(self, disk: str) -> list[FileRecord]:
|
||||
"""Get all files on a specific disk"""
|
||||
...
|
||||
|
||||
def store_operation(self, operation: OperationRecord) -> None:
|
||||
"""Store an operation record"""
|
||||
...
|
||||
|
||||
def get_pending_operations(self) -> list[OperationRecord]:
|
||||
"""Get all pending operations"""
|
||||
...
|
||||
|
||||
|
||||
class ILogger(Protocol):
|
||||
"""Protocol for logging operations"""
|
||||
|
||||
def info(self, message: str) -> None:
|
||||
...
|
||||
|
||||
def warning(self, message: str) -> None:
|
||||
...
|
||||
|
||||
def error(self, message: str) -> None:
|
||||
...
|
||||
|
||||
def debug(self, message: str) -> None:
|
||||
...
|
||||
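Because `IDatabase` and `ILogger` are `typing.Protocol` classes, any object with matching method signatures satisfies them structurally, with no inheritance required. A minimal in-memory stand-in that could back unit tests, written against the protocol as defined above:

```python
from app.shared._protocols import IDatabase, FileRecord, OperationRecord  # as defined in this commit


class InMemoryDatabase:
    """Structural implementation of IDatabase for tests; no PostgreSQL needed."""

    def __init__(self) -> None:
        self._files: list[FileRecord] = []
        self._operations: list[OperationRecord] = []

    def store_file(self, file_record: FileRecord) -> None:
        self._files.append(file_record)

    def get_files_by_disk(self, disk: str) -> list[FileRecord]:
        return [f for f in self._files if f.disk_label == disk]

    def store_operation(self, operation: OperationRecord) -> None:
        self._operations.append(operation)

    def get_pending_operations(self) -> list[OperationRecord]:
        return [op for op in self._operations if op.status == 'pending']


db: IDatabase = InMemoryDatabase()  # type-checks because the methods line up
```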
110
app/shared/config.py
Normal file
@@ -0,0 +1,110 @@
|
||||
"""Configuration management for disk reorganizer"""
|
||||
import json
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass, asdict
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
|
||||
class DatabaseConfig:
|
||||
"""Database connection configuration"""
|
||||
host: str = '192.168.1.159'
|
||||
port: int = 5432
|
||||
database: str = 'disk_reorganizer_db'
|
||||
user: str = 'disk_reorg_user'
|
||||
password: str = 'heel-goed-wachtwoord'
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert to dictionary"""
|
||||
return asdict(self)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ProcessingConfig:
|
||||
"""Processing behavior configuration"""
|
||||
batch_size: int = 1000
|
||||
commit_interval: int = 100
|
||||
parallel_workers: int = 4
|
||||
chunk_size: int = 8192
|
||||
hash_algorithm: str = 'sha256'
|
||||
verify_operations: bool = True
|
||||
preserve_timestamps: bool = True
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert to dictionary"""
|
||||
return asdict(self)
|
||||
|
||||
|
||||
@dataclass
|
||||
class LoggingConfig:
|
||||
"""Logging configuration"""
|
||||
level: str = 'INFO'
|
||||
log_file: str = 'disk_reorganizer.log'
|
||||
console_output: bool = True
|
||||
file_output: bool = True
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert to dictionary"""
|
||||
return asdict(self)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Config:
|
||||
"""Main configuration container"""
|
||||
database: Optional[DatabaseConfig] = None
processing: Optional[ProcessingConfig] = None
logging: Optional[LoggingConfig] = None
|
||||
|
||||
def __post_init__(self):
|
||||
"""Initialize nested configs with defaults if not provided"""
|
||||
if self.database is None:
|
||||
self.database = DatabaseConfig()
|
||||
if self.processing is None:
|
||||
self.processing = ProcessingConfig()
|
||||
if self.logging is None:
|
||||
self.logging = LoggingConfig()
|
||||
|
||||
@classmethod
|
||||
def from_file(cls, config_path: Path) -> 'Config':
|
||||
"""Load configuration from JSON file"""
|
||||
if not config_path.exists():
|
||||
return cls()
|
||||
|
||||
with open(config_path, 'r') as f:
|
||||
data = json.load(f)
|
||||
|
||||
return cls(
|
||||
database=DatabaseConfig(**data.get('database', {})),
|
||||
processing=ProcessingConfig(**data.get('processing', {})),
|
||||
logging=LoggingConfig(**data.get('logging', {}))
|
||||
)
|
||||
|
||||
def to_file(self, config_path: Path) -> None:
|
||||
"""Save configuration to JSON file"""
|
||||
data = {
|
||||
'database': self.database.to_dict(),
|
||||
'processing': self.processing.to_dict(),
|
||||
'logging': self.logging.to_dict()
|
||||
}
|
||||
|
||||
with open(config_path, 'w') as f:
|
||||
json.dump(data, f, indent=2)
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert to dictionary"""
|
||||
return {
|
||||
'database': self.database.to_dict(),
|
||||
'processing': self.processing.to_dict(),
|
||||
'logging': self.logging.to_dict()
|
||||
}
|
||||
|
||||
|
||||
def load_config(config_path: Optional[Path] = None) -> Config:
|
||||
"""Load configuration from file or return default"""
|
||||
if config_path is None:
|
||||
config_path = Path('config.json')
|
||||
|
||||
if config_path.exists():
|
||||
return Config.from_file(config_path)
|
||||
|
||||
return Config()
|
||||
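A config.json that matches these dataclasses only needs the sections it wants to override; anything omitted falls back to the defaults above. A sketch of round-tripping a config (values illustrative):

```python
from pathlib import Path
from app.shared.config import Config, ProcessingConfig, load_config  # as defined in this commit

# Override only the processing section; database and logging keep their defaults.
cfg = Config(processing=ProcessingConfig(parallel_workers=8, hash_algorithm='sha256'))
cfg.to_file(Path("config.json"))

reloaded = load_config(Path("config.json"))
assert reloaded.processing.parallel_workers == 8
print(reloaded.database.host)  # default host from DatabaseConfig
```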
217
app/shared/logger.py
Normal file
@@ -0,0 +1,217 @@
|
||||
"""Dynamic progress logger with formatting utilities"""
|
||||
import sys
|
||||
import logging
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def format_size(bytes_size: int) -> str:
|
||||
"""Format bytes to human-readable size string
|
||||
|
||||
Args:
|
||||
bytes_size: Size in bytes
|
||||
|
||||
Returns:
|
||||
Human-readable size string (e.g., "1.5 GB", "234.5 MB")
|
||||
"""
|
||||
for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']:
|
||||
if bytes_size < 1024.0:
|
||||
return f"{bytes_size:.1f} {unit}"
|
||||
bytes_size /= 1024.0
|
||||
return f"{bytes_size:.1f} EB"
|
||||
|
||||
|
||||
def format_rate(bytes_per_second: float) -> str:
|
||||
"""Format transfer rate to human-readable string
|
||||
|
||||
Args:
|
||||
bytes_per_second: Transfer rate in bytes per second
|
||||
|
||||
Returns:
|
||||
Human-readable rate string (e.g., "125.3 MB/s")
|
||||
"""
|
||||
return f"{format_size(int(bytes_per_second))}/s"
|
||||
|
||||
|
||||
def format_time(seconds: float) -> str:
|
||||
"""Format seconds to human-readable time string
|
||||
|
||||
Args:
|
||||
seconds: Time in seconds
|
||||
|
||||
Returns:
|
||||
Human-readable time string (e.g., "2h 34m 12s", "45m 23s", "12s")
|
||||
"""
|
||||
if seconds < 60:
|
||||
return f"{int(seconds)}s"
|
||||
elif seconds < 3600:
|
||||
minutes = int(seconds // 60)
|
||||
secs = int(seconds % 60)
|
||||
return f"{minutes}m {secs}s"
|
||||
else:
|
||||
hours = int(seconds // 3600)
|
||||
minutes = int((seconds % 3600) // 60)
|
||||
secs = int(seconds % 60)
|
||||
return f"{hours}h {minutes}m {secs}s"
|
||||
|
||||
|
||||
class ProgressLogger:
|
||||
"""Dynamic progress logger with real-time statistics"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
name: str = "defrag",
|
||||
level: int = logging.INFO,
|
||||
log_file: Optional[Path] = None,
|
||||
console_output: bool = True
|
||||
):
|
||||
"""Initialize progress logger
|
||||
|
||||
Args:
|
||||
name: Logger name
|
||||
level: Logging level
|
||||
log_file: Optional log file path
|
||||
console_output: Whether to output to console
|
||||
"""
|
||||
self.logger = logging.getLogger(name)
|
||||
self.logger.setLevel(level)
|
||||
self.logger.handlers.clear()
|
||||
|
||||
# Create formatter
|
||||
formatter = logging.Formatter(
|
||||
'%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S'
|
||||
)
|
||||
|
||||
# Add console handler
|
||||
if console_output:
|
||||
console_handler = logging.StreamHandler(sys.stdout)
|
||||
console_handler.setLevel(level)
|
||||
console_handler.setFormatter(formatter)
|
||||
self.logger.addHandler(console_handler)
|
||||
|
||||
# Add file handler
|
||||
if log_file:
|
||||
log_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
file_handler = logging.FileHandler(log_file)
|
||||
file_handler.setLevel(level)
|
||||
file_handler.setFormatter(formatter)
|
||||
self.logger.addHandler(file_handler)
|
||||
|
||||
self._last_progress_line = ""
|
||||
|
||||
def info(self, message: str) -> None:
|
||||
"""Log info message"""
|
||||
self.logger.info(message)
|
||||
|
||||
def warning(self, message: str) -> None:
|
||||
"""Log warning message"""
|
||||
self.logger.warning(message)
|
||||
|
||||
def error(self, message: str) -> None:
|
||||
"""Log error message"""
|
||||
self.logger.error(message)
|
||||
|
||||
def debug(self, message: str) -> None:
|
||||
"""Log debug message"""
|
||||
self.logger.debug(message)
|
||||
|
||||
def critical(self, message: str) -> None:
|
||||
"""Log critical message"""
|
||||
self.logger.critical(message)
|
||||
|
||||
def progress(
|
||||
self,
|
||||
current: int,
|
||||
total: int,
|
||||
prefix: str = "",
|
||||
suffix: str = "",
|
||||
bytes_processed: Optional[int] = None,
|
||||
elapsed_seconds: Optional[float] = None
|
||||
) -> None:
|
||||
"""Log progress with dynamic statistics
|
||||
|
||||
Args:
|
||||
current: Current progress count
|
||||
total: Total count
|
||||
prefix: Prefix message
|
||||
suffix: Suffix message
|
||||
bytes_processed: Optional bytes processed for rate calculation
|
||||
elapsed_seconds: Optional elapsed time for rate calculation
|
||||
"""
|
||||
if total == 0:
|
||||
percent = 0.0
|
||||
else:
|
||||
percent = (current / total) * 100
|
||||
|
||||
progress_msg = f"{prefix} [{current}/{total}] {percent:.1f}%"
|
||||
|
||||
if bytes_processed is not None and elapsed_seconds is not None and elapsed_seconds > 0:
|
||||
rate = bytes_processed / elapsed_seconds
|
||||
progress_msg += f" | {format_size(bytes_processed)} @ {format_rate(rate)}"
|
||||
|
||||
# Estimate time remaining
|
||||
if current > 0:
|
||||
estimated_total_seconds = (elapsed_seconds / current) * total
|
||||
remaining_seconds = estimated_total_seconds - elapsed_seconds
|
||||
progress_msg += f" | ETA: {format_time(remaining_seconds)}"
|
||||
|
||||
if suffix:
|
||||
progress_msg += f" | {suffix}"
|
||||
|
||||
self.info(progress_msg)
|
||||
|
||||
def section(self, title: str) -> None:
|
||||
"""Log section header
|
||||
|
||||
Args:
|
||||
title: Section title
|
||||
"""
|
||||
separator = "=" * 60
|
||||
self.info(separator)
|
||||
self.info(f" {title}")
|
||||
self.info(separator)
|
||||
|
||||
def subsection(self, title: str) -> None:
|
||||
"""Log subsection header
|
||||
|
||||
Args:
|
||||
title: Subsection title
|
||||
"""
|
||||
self.info(f"\n--- {title} ---")
|
||||
|
||||
|
||||
def create_logger(
|
||||
name: str = "defrag",
|
||||
level: str = "INFO",
|
||||
log_file: Optional[Path] = None,
|
||||
console_output: bool = True
|
||||
) -> ProgressLogger:
|
||||
"""Create and configure a progress logger
|
||||
|
||||
Args:
|
||||
name: Logger name
|
||||
level: Logging level as string
|
||||
log_file: Optional log file path
|
||||
console_output: Whether to output to console
|
||||
|
||||
Returns:
|
||||
Configured ProgressLogger instance
|
||||
"""
|
||||
level_map = {
|
||||
'DEBUG': logging.DEBUG,
|
||||
'INFO': logging.INFO,
|
||||
'WARNING': logging.WARNING,
|
||||
'ERROR': logging.ERROR,
|
||||
'CRITICAL': logging.CRITICAL
|
||||
}
|
||||
|
||||
log_level = level_map.get(level.upper(), logging.INFO)
|
||||
|
||||
return ProgressLogger(
|
||||
name=name,
|
||||
level=log_level,
|
||||
log_file=log_file,
|
||||
console_output=console_output
|
||||
)
|
||||
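A usage sketch showing how the progress call combines counts, throughput and ETA (numbers and log path illustrative):

```python
import time
from pathlib import Path
from app.shared.logger import create_logger  # as defined in this commit

logger = create_logger(level="INFO", log_file=Path("logs/index.log"))
logger.section("Indexing SMT")

start = time.time()
total_files, processed_bytes = 10_000, 0
for i in range(1, total_files + 1):
    processed_bytes += 4096            # stand-in for real per-file work
    if i % 1000 == 0:
        logger.progress(
            current=i,
            total=total_files,
            prefix="Indexing",
            bytes_processed=processed_bytes,
            elapsed_seconds=time.time() - start,
        )
```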
127
app/shared/models.py
Normal file
@@ -0,0 +1,127 @@
|
||||
"""Data models for the disk reorganizer"""
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
|
||||
class FileRecord:
|
||||
"""Core file record with all metadata"""
|
||||
path: Path
|
||||
size: int
|
||||
modified_time: float
|
||||
created_time: float
|
||||
disk_label: str
|
||||
checksum: Optional[str] = None
|
||||
status: str = 'indexed' # indexed, planned, moved, verified
|
||||
category: Optional[str] = None
|
||||
duplicate_of: Optional[str] = None
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert to dictionary for serialization"""
|
||||
return {
|
||||
'path': str(self.path),
|
||||
'size': self.size,
|
||||
'modified_time': self.modified_time,
|
||||
'created_time': self.created_time,
|
||||
'disk_label': self.disk_label,
|
||||
'checksum': self.checksum,
|
||||
'status': self.status,
|
||||
'category': self.category,
|
||||
'duplicate_of': self.duplicate_of
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class OperationRecord:
|
||||
"""Record of a migration operation"""
|
||||
source_path: Path
|
||||
target_path: Path
|
||||
operation_type: str # move, copy, hardlink, symlink
|
||||
size: int = 0
|
||||
status: str = 'pending' # pending, in_progress, completed, failed
|
||||
error: Optional[str] = None
|
||||
executed_at: Optional[datetime] = None
|
||||
verified: bool = False
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert to dictionary for serialization"""
|
||||
return {
|
||||
'source_path': str(self.source_path),
|
||||
'target_path': str(self.target_path),
|
||||
'operation_type': self.operation_type,
|
||||
'size': self.size,
|
||||
'status': self.status,
|
||||
'error': self.error,
|
||||
'executed_at': self.executed_at.isoformat() if self.executed_at else None,
|
||||
'verified': self.verified
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class DiskInfo:
|
||||
"""Information about a disk/volume"""
|
||||
name: str
|
||||
device: str
|
||||
mount_point: Path
|
||||
total_size: int
|
||||
used_size: int
|
||||
free_size: int
|
||||
fs_type: str
|
||||
|
||||
@property
|
||||
def usage_percent(self) -> float:
|
||||
"""Calculate usage percentage"""
|
||||
if self.total_size == 0:
|
||||
return 0.0
|
||||
return (self.used_size / self.total_size) * 100
|
||||
|
||||
|
||||
@dataclass
|
||||
class MigrationPlan:
|
||||
"""Complete migration plan"""
|
||||
target_disk: str
|
||||
destination_disks: list[str]
|
||||
operations: list[OperationRecord]
|
||||
total_size: int
|
||||
file_count: int
|
||||
created_at: datetime = field(default_factory=datetime.now)
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert to dictionary for serialization"""
|
||||
return {
|
||||
'target_disk': self.target_disk,
|
||||
'destination_disks': self.destination_disks,
|
||||
'operations': [op.to_dict() for op in self.operations],
|
||||
'total_size': self.total_size,
|
||||
'file_count': self.file_count,
|
||||
'created_at': self.created_at.isoformat()
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class ProcessingStats:
|
||||
"""Statistics for processing operations"""
|
||||
files_processed: int = 0
|
||||
bytes_processed: int = 0
|
||||
files_succeeded: int = 0
|
||||
files_failed: int = 0
|
||||
start_time: datetime = field(default_factory=datetime.now)
|
||||
|
||||
@property
|
||||
def elapsed_seconds(self) -> float:
|
||||
"""Calculate elapsed time in seconds"""
|
||||
return (datetime.now() - self.start_time).total_seconds()
|
||||
|
||||
@property
|
||||
def files_per_second(self) -> float:
|
||||
"""Calculate processing rate"""
|
||||
elapsed = self.elapsed_seconds
|
||||
return self.files_processed / elapsed if elapsed > 0 else 0.0
|
||||
|
||||
@property
|
||||
def bytes_per_second(self) -> float:
|
||||
"""Calculate throughput"""
|
||||
elapsed = self.elapsed_seconds
|
||||
return self.bytes_processed / elapsed if elapsed > 0 else 0.0
|
||||
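For instance, `ProcessingStats` is just accumulated counters plus derived rates, which pairs naturally with the progress logger (values illustrative):

```python
from app.shared.models import ProcessingStats  # as defined in this commit

stats = ProcessingStats()
for size in (1_048_576, 524_288, 2_097_152):   # pretend we processed three files
    stats.files_processed += 1
    stats.files_succeeded += 1
    stats.bytes_processed += size

# Rates are derived from start_time, so real runs report meaningful throughput.
print(f"{stats.files_per_second:.1f} files/s, {stats.bytes_per_second / 1_048_576:.1f} MiB/s")
```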
0
app/tests/__init__.py
Normal file
9
defrag.iml
Normal file
@@ -0,0 +1,9 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager" inherit-compiler-output="true">
|
||||
<exclude-output />
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
||||
20
docker-compose.override.yml
Normal file
@@ -0,0 +1,20 @@
|
||||
services:
|
||||
app:
|
||||
environment:
|
||||
- LOG_LEVEL=DEBUG
|
||||
- PYTHONPATH=/app
|
||||
volumes:
|
||||
- .:/app
|
||||
- /var/run/docker.sock:/var/run/docker.sock
|
||||
ports:
|
||||
- "8000:8000"
|
||||
command: uvicorn main:app --host 0.0.0.0 --port 8000 --reload
|
||||
|
||||
postgres:
|
||||
environment:
|
||||
- POSTGRES_LOG_STATEMENT=all
|
||||
ports:
|
||||
- "5433:5432" # Different port to avoid conflict with host PostgreSQL
|
||||
|
||||
redis:
|
||||
command: redis-server --appendonly yes --loglevel verbose
|
||||
276
docker-compose.yml
Normal file
@@ -0,0 +1,276 @@
|
||||
services:
|
||||
# PostgreSQL Database
|
||||
postgres:
|
||||
image: postgres:15-alpine
|
||||
container_name: project_defrag_db
|
||||
environment:
|
||||
POSTGRES_USER: disk_reorg_user
|
||||
POSTGRES_PASSWORD: heel-goed-wachtwoord
|
||||
POSTGRES_DB: disk_reorganizer_db
|
||||
POSTGRES_INITDB_ARGS: "--encoding=UTF8 --locale=C"
|
||||
volumes:
|
||||
- postgres_data:/var/lib/postgresql/data
|
||||
- ./sql/init.sql:/docker-entrypoint-initdb.d/init.sql
|
||||
- ./sql/migrations:/docker-entrypoint-initdb.d/migrations
|
||||
ports:
|
||||
- "5432:5432"
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -U disk_reorg_user -d disk_reorganizer_db"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
networks:
|
||||
- defrag-network
|
||||
|
||||
# Redis for deduplication hash store (optional)
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
container_name: project_defrag_redis
|
||||
command: redis-server --appendonly yes
|
||||
volumes:
|
||||
- redis_data:/data
|
||||
ports:
|
||||
- "6379:6379"
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "ping"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
networks:
|
||||
- defrag-network
|
||||
|
||||
# Application Service
|
||||
app:
|
||||
build: .
|
||||
container_name: project_defrag_app
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
environment:
|
||||
# Database Configuration
|
||||
DB_HOST: postgres
|
||||
DB_PORT: 5432
|
||||
DB_NAME: disk_reorganizer_db
|
||||
DB_USER: disk_reorg_user
|
||||
DB_PASSWORD: heel-goed-wachtwoord
|
||||
|
||||
# Redis Configuration
|
||||
REDIS_HOST: redis
|
||||
REDIS_PORT: 6379
|
||||
|
||||
# Application Configuration
|
||||
LOG_LEVEL: INFO
|
||||
MAX_WORKERS: 4
|
||||
CHUNK_SIZE_KB: 64
|
||||
|
||||
# Mount points (set these when running specific commands)
|
||||
SOURCE_MOUNT: /mnt/source
|
||||
TARGET_MOUNT: /mnt/target
|
||||
volumes:
|
||||
# Mount host directories for file operations
|
||||
- ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source:ro
|
||||
- ${HOST_TARGET_PATH:-/mnt/target}:/mnt/target
|
||||
|
||||
# Mount for configuration and plans
|
||||
- ./config:/app/config
|
||||
- ./plans:/app/plans
|
||||
- ./logs:/app/logs
|
||||
|
||||
# Bind mount for development (optional)
|
||||
- .:/app
|
||||
networks:
|
||||
- defrag-network
|
||||
profiles:
|
||||
- full-cycle
|
||||
- development
|
||||
# Uncomment for development with hot reload
|
||||
# command: watchmedo auto-restart --pattern="*.py" --recursive -- python app/main.py
|
||||
|
||||
# Single command services for specific operations
|
||||
index:
|
||||
build: .
|
||||
container_name: defrag_index
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
environment:
|
||||
DB_HOST: postgres
|
||||
DB_PORT: 5432
|
||||
DB_NAME: disk_reorganizer_db
|
||||
DB_USER: disk_reorg_user
|
||||
DB_PASSWORD: heel-goed-wachtwoord
|
||||
volumes:
|
||||
- ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source:ro
|
||||
- ./config:/app/config
|
||||
- ./logs:/app/logs
|
||||
command: ["python", "app/main.py", "index", "/media/mike/SMT", "SMT"]
|
||||
profiles:
|
||||
- index-only
|
||||
networks:
|
||||
- defrag-network
|
||||
|
||||
plan:
|
||||
build: .
|
||||
container_name: defrag_plan
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
environment:
|
||||
DB_HOST: postgres
|
||||
DB_PORT: 5432
|
||||
DB_NAME: disk_reorganizer_db
|
||||
DB_USER: disk_reorg_user
|
||||
DB_PASSWORD: heel-goed-wachtwoord
|
||||
volumes:
|
||||
- ./config:/app/config
|
||||
- ./plans:/app/plans
|
||||
- ./logs:/app/logs
|
||||
command: ["python", "app/main.py", "plan", "/media/mike/SMT", "SMT"]
|
||||
profiles:
|
||||
- plan-only
|
||||
networks:
|
||||
- defrag-network
|
||||
|
||||
execute:
|
||||
build: .
|
||||
container_name: defrag_execute
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
environment:
|
||||
DB_HOST: postgres
|
||||
DB_PORT: 5432
|
||||
DB_NAME: disk_reorganizer_db
|
||||
DB_USER: disk_reorg_user
|
||||
DB_PASSWORD: heel-goed-wachtwoord
|
||||
volumes:
|
||||
- ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source
|
||||
- ${HOST_TARGET_PATH:-/mnt/target}:/mnt/target
|
||||
- ./plans:/app/plans
|
||||
- ./config:/app/config
|
||||
- ./logs:/app/logs
|
||||
command: ["python", "app/main.py", "execute", "/app/plans/plan.json"]
|
||||
profiles:
|
||||
- execute-only
|
||||
networks:
|
||||
- defrag-network
|
||||
|
||||
dry-run:
|
||||
build: .
|
||||
container_name: defrag_dry_run
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
environment:
|
||||
DB_HOST: postgres
|
||||
DB_PORT: 5432
|
||||
DB_NAME: disk_reorganizer_db
|
||||
DB_USER: disk_reorg_user
|
||||
DB_PASSWORD: heel-goed-wachtwoord
|
||||
volumes:
|
||||
- ./plans:/app/plans
|
||||
- ./config:/app/config
|
||||
- ./logs:/app/logs
|
||||
command: ["python", "app/main.py", "execute", "/app/plans/plan.json", "--dry-run"]
|
||||
profiles:
|
||||
- dry-run-only
|
||||
networks:
|
||||
- defrag-network
|
||||
|
||||
report:
|
||||
build: .
|
||||
container_name: defrag_report
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
environment:
|
||||
DB_HOST: postgres
|
||||
DB_PORT: 5432
|
||||
DB_NAME: disk_reorganizer_db
|
||||
DB_USER: disk_reorg_user
|
||||
DB_PASSWORD: heel-goed-wachtwoord
|
||||
volumes:
|
||||
- ./reports:/app/reports
|
||||
- ./logs:/app/logs
|
||||
command: ["python", "app/main.py", "report"]
|
||||
profiles:
|
||||
- report-only
|
||||
networks:
|
||||
- defrag-network
|
||||
|
||||
# Monitoring and Admin Services
|
||||
pgadmin:
|
||||
image: dpage/pgadmin4:latest
|
||||
container_name: defrag_pgadmin
|
||||
environment:
|
||||
PGADMIN_DEFAULT_EMAIL: admin@defrag.local
|
||||
PGADMIN_DEFAULT_PASSWORD: admin123
|
||||
volumes:
|
||||
- pgadmin_data:/var/lib/pgadmin
|
||||
ports:
|
||||
- "5050:80"
|
||||
depends_on:
|
||||
- postgres
|
||||
profiles:
|
||||
- monitoring
|
||||
networks:
|
||||
- defrag-network
|
||||
|
||||
redis-commander:
|
||||
image: rediscommander/redis-commander:latest
|
||||
container_name: defrag_redis_commander
|
||||
environment:
|
||||
REDIS_HOSTS: local:redis:6379
|
||||
ports:
|
||||
- "8081:8081"
|
||||
depends_on:
|
||||
- redis
|
||||
profiles:
|
||||
- monitoring
|
||||
networks:
|
||||
- defrag-network
|
||||
|
||||
flyway:
|
||||
image: flyway/flyway:latest
|
||||
container_name: flyway
|
||||
volumes:
|
||||
- ./sql/migration:/flyway/sql:ro
|
||||
environment:
|
||||
FLYWAY_URL: jdbc:postgresql://192.168.1.159:5432/disk_reorganizer_db
|
||||
FLYWAY_USER: disk_reorg_user
|
||||
FLYWAY_PASSWORD: heel-goed-wachtwoord
|
||||
FLYWAY_SCHEMAS: public
|
||||
FLYWAY_LOCATIONS: filesystem:./sql
|
||||
FLYWAY_CONNECT_RETRIES: "60"
|
||||
command: migrate
|
||||
restart: "no"
|
||||
|
||||
pg_backup:
|
||||
image: postgres:16
|
||||
container_name: pg_backup
|
||||
environment:
|
||||
PGPASSWORD: heel-goed-wachtwoord
|
||||
volumes:
|
||||
- ./:/backup
|
||||
command:
|
||||
- bash
|
||||
- -lc
|
||||
- >
|
||||
pg_dump -h 192.168.1.159 -p 5432 -U disk_reorg_user -d disk_reorganizer_db
|
||||
--format=custom --no-owner --no-privileges
|
||||
-f /backup/backup_$(date +%F_%H%M)_disk_reorganizer_db.dump
|
||||
restart: "no"
|
||||
|
||||
networks:
|
||||
defrag-network:
|
||||
driver: bridge
|
||||
|
||||
volumes:
|
||||
postgres_data:
|
||||
driver: local
|
||||
redis_data:
|
||||
driver: local
|
||||
pgadmin_data:
|
||||
driver: local
|
||||
7
flyway.conf
Normal file
@@ -0,0 +1,7 @@
|
||||
flyway.url=jdbc:postgresql://192.168.1.159:5432/disk_reorganizer_db
|
||||
flyway.user=disk_reorg_user
|
||||
flyway.password=heel-goed-wachtwoord
|
||||
|
||||
flyway.locations=filesystem:sql/migration
|
||||
flyway.schemas=public
|
||||
|
||||
74
pyproject.toml
Normal file
@@ -0,0 +1,74 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=65.0", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "defrag"
|
||||
version = "1.0.0"
|
||||
description = "Intelligent disk reorganization system for 20TB+ data"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.9"
|
||||
license = {text = "MIT"}
|
||||
authors = [
|
||||
{name = "Project Defrag"}
|
||||
]
|
||||
keywords = ["disk", "storage", "deduplication", "classification", "migration"]
|
||||
classifiers = [
|
||||
"Development Status :: 4 - Beta",
|
||||
"Intended Audience :: System Administrators",
|
||||
"Topic :: System :: Filesystems",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
]
|
||||
|
||||
dependencies = [
|
||||
"psycopg2-binary>=2.9.0",
|
||||
"psutil>=5.9.0",
|
||||
"pandas>=1.5.0",
|
||||
"pyarrow>=10.0.0",
|
||||
"python-magic>=0.4.27",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
redis = ["redis>=4.5.0"]
|
||||
ml = ["scikit-learn>=1.2.0", "numpy>=1.24.0"]
|
||||
dev = [
|
||||
"pytest>=7.2.0",
|
||||
"pytest-cov>=4.0.0",
|
||||
"black>=23.0.0",
|
||||
"mypy>=1.0.0",
|
||||
"flake8>=6.0.0",
|
||||
]
|
||||
all = [
|
||||
"redis>=4.5.0",
|
||||
"scikit-learn>=1.2.0",
|
||||
"numpy>=1.24.0",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
defrag = "main:main"
|
||||
|
||||
[tool.black]
|
||||
line-length = 100
|
||||
target-version = ['py39', 'py310', 'py311', 'py312']
|
||||
include = '\.pyi?$'
|
||||
|
||||
[tool.mypy]
|
||||
python_version = "3.9"
|
||||
warn_return_any = true
|
||||
warn_unused_configs = true
|
||||
disallow_untyped_defs = false
|
||||
disallow_incomplete_defs = false
|
||||
check_untyped_defs = true
|
||||
no_implicit_optional = true
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
testpaths = ["tests"]
|
||||
python_files = ["test_*.py"]
|
||||
python_classes = ["Test*"]
|
||||
python_functions = ["test_*"]
|
||||
addopts = "-v --cov=. --cov-report=html --cov-report=term"
|
||||
41
requirements.txt
Normal file
@@ -0,0 +1,41 @@
|
||||
# PostgreSQL database adapter for Python
# Alternative: psycopg2>=2.9.9 (requires PostgreSQL development libraries);
# psycopg2-binary is used for easier installation without compilation.
psycopg2-binary>=2.9.9

# Core dependencies
psutil>=5.9.0

# Data processing
pandas>=1.5.0
pyarrow>=10.0.0

# File type and encoding detection
python-magic>=0.4.27
chardet

# Optional/feature dependencies
redis>=4.5.0          # For RedisHashStore (optional)
scikit-learn>=1.2.0   # For MLClassifier (optional)
numpy>=1.24.0         # For MLClassifier (optional)

# Development dependencies
pytest>=7.2.0
pytest-cov>=4.0.0
black>=23.0.0
mypy>=1.0.0
flake8>=6.0.0
|
||||
51
setup.sh
Normal file
@@ -0,0 +1,51 @@
|
||||
#!/bin/bash
|
||||
# setup.sh - Complete Docker setup for Project Defrag
|
||||
|
||||
set -e
|
||||
|
||||
echo "🚀 Setting up Project Defrag with Docker..."
|
||||
|
||||
# 1. Create necessary directories
|
||||
echo "📁 Creating directories..."
|
||||
mkdir -p {config,plans,logs,reports,sql/migrations}
|
||||
|
||||
# 2. Copy environment file
|
||||
if [ ! -f .env ]; then
|
||||
echo "⚙️ Creating .env file from template..."
|
||||
cp .env.example .env
|
||||
echo "⚠️ Please edit .env file with your configuration!"
|
||||
fi
|
||||
|
||||
# 3. Build the Docker image
|
||||
echo "🐳 Building Docker image..."
|
||||
docker compose build app
|
||||
|
||||
# 4. Start the database
|
||||
#echo "🗄️ Starting PostgreSQL database..."
|
||||
#docker-compose up -d postgres
|
||||
|
||||
# 5. Wait for database to be ready
|
||||
#echo "⏳ Waiting for database to be ready..."
|
||||
#sleep 10
|
||||
|
||||
# 6. Run database initialization
|
||||
#echo "📊 Initializing database..."
|
||||
#docker-compose exec -T postgres psql -U disk_reorg_user -d disk_reorganizer_db -f /docker-entrypoint-initdb.d/init.sql
|
||||
|
||||
# 7. Start optional services
|
||||
echo "🔧 Starting monitoring services..."
|
||||
docker compose --profile monitoring up -d
|
||||
|
||||
echo "✅ Setup complete!"
|
||||
echo ""
|
||||
echo "📋 Available commands:"
|
||||
echo " docker compose up -d # Start all services"
|
||||
echo " docker compose --profile index-only up index # Run index only"
|
||||
echo " docker compose --profile plan-only up plan # Generate plan"
|
||||
echo " docker compose --profile dry-run-only up dry-run # Dry run"
|
||||
echo " docker compose --profile execute-only up execute # Execute migration"
|
||||
echo " docker compose --profile report-only up report # Generate report"
|
||||
echo ""
|
||||
echo "🌐 Access monitoring:"
|
||||
echo " - PostgreSQL Admin: http://localhost:5050"
|
||||
echo " - Redis Commander: http://localhost:8081"
|
||||
61
sql/legacy_setup.sql
Normal file
@@ -0,0 +1,61 @@
|
||||
-- PostgreSQL Database Setup Script for Disk Reorganizer
|
||||
-- Database: disk_reorganizer_db
|
||||
-- User: disk_reorg_user
|
||||
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
|
||||
CREATE EXTENSION IF NOT EXISTS "pgcrypto";
|
||||
|
||||
-- Create the database (run as superuser: auction)
|
||||
CREATE DATABASE disk_reorganizer_db
|
||||
WITH
|
||||
ENCODING = 'UTF8'
|
||||
LC_COLLATE = 'en_US.UTF-8'
|
||||
LC_CTYPE = 'en_US.UTF-8'
|
||||
TEMPLATE = template0;
|
||||
|
||||
-- Connect to the new database
|
||||
\c disk_reorganizer_db
|
||||
|
||||
-- Create the user
|
||||
CREATE USER disk_reorg_user WITH PASSWORD 'heel-goed-wachtwoord';
|
||||
|
||||
-- Create files table
|
||||
|
||||
-- Create index on disk column for faster queries
|
||||
|
||||
|
||||
-- Grant privileges to disk_reorg_user
|
||||
GRANT CONNECT ON DATABASE disk_reorganizer_db TO disk_reorg_user;
|
||||
GRANT USAGE ON SCHEMA public TO disk_reorg_user;
|
||||
GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO disk_reorg_user;
|
||||
GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO disk_reorg_user;
|
||||
|
||||
-- future tables/sequences created by your owner role (pick the role that creates them)
|
||||
ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public
|
||||
GRANT ALL PRIVILEGES ON TABLES TO disk_reorg_user;
|
||||
|
||||
ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public
|
||||
GRANT ALL PRIVILEGES ON SEQUENCES TO disk_reorg_user;
|
||||
|
||||
-- Create function to update updated_at timestamp
|
||||
CREATE OR REPLACE FUNCTION update_updated_at_column()
|
||||
RETURNS TRIGGER AS
|
||||
$$
|
||||
BEGIN
|
||||
NEW.updated_at = CURRENT_TIMESTAMP;
|
||||
RETURN NEW;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- Create trigger for files table
|
||||
CREATE TRIGGER update_files_updated_at
|
||||
BEFORE UPDATE
|
||||
ON files
|
||||
FOR EACH ROW
|
||||
EXECUTE FUNCTION update_updated_at_column();
|
||||
|
||||
-- Display success message
|
||||
\echo 'Database setup completed successfully!'
|
||||
\echo 'Database: disk_reorganizer_db'
|
||||
\echo 'User: disk_reorg_user'
|
||||
\echo 'Tables created: files, operations'
|
||||
\echo 'Indexes and triggers created'
|
||||
188
sql/migration/V001__init.sql
Normal file
@@ -0,0 +1,188 @@
|
||||
-- sql/init.sql
|
||||
-- Initialize PostgreSQL database for Project Defrag
|
||||
|
||||
-- Enable useful extensions
|
||||
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
|
||||
CREATE EXTENSION IF NOT EXISTS "pgcrypto";
|
||||
-- future tables/sequences created by your owner role (pick the role that creates them)
|
||||
ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public
|
||||
GRANT ALL PRIVILEGES ON TABLES TO disk_reorg_user;
|
||||
|
||||
ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public
|
||||
GRANT ALL PRIVILEGES ON SEQUENCES TO disk_reorg_user;
|
||||
ALTER DATABASE disk_reorganizer_db OWNER TO disk_reorg_user;
|
||||
-- Files table
|
||||
CREATE TABLE IF NOT EXISTS files
|
||||
(
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
path TEXT NOT NULL,
|
||||
size BIGINT NOT NULL,
|
||||
modified_time TIMESTAMP WITH TIME ZONE,
|
||||
created_time TIMESTAMP WITH TIME ZONE,
|
||||
file_hash VARCHAR(64), -- SHA-256 hash
|
||||
checksum VARCHAR(64), -- Alias for file_hash (legacy compatibility)
|
||||
category VARCHAR(50),
|
||||
disk_label VARCHAR(50),
|
||||
last_verified TIMESTAMP WITH TIME ZONE,
|
||||
status VARCHAR(20) DEFAULT 'indexed',
|
||||
duplicate_of TEXT, -- Path to canonical file if this is a duplicate
|
||||
|
||||
-- Metadata
|
||||
metadata JSONB DEFAULT '{}',
|
||||
|
||||
-- Audit fields
|
||||
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
-- Constraints
|
||||
CONSTRAINT unique_file_path UNIQUE (path)
|
||||
);
|
||||
|
||||
-- Operations table (audit log)
|
||||
CREATE TABLE IF NOT EXISTS operations
|
||||
(
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
operation_type VARCHAR(50) NOT NULL,
|
||||
source_path TEXT,
|
||||
target_path TEXT,
|
||||
status VARCHAR(20) NOT NULL,
|
||||
|
||||
-- Legacy compatibility fields
|
||||
executed INTEGER DEFAULT 0,
size BIGINT DEFAULT 0,          -- per-operation byte count, summed by get_migration_stats()
verified BOOLEAN DEFAULT FALSE, -- boolean to match the executor's "verified = FALSE" queries
|
||||
error TEXT,
|
||||
|
||||
-- File reference
|
||||
file_id UUID REFERENCES files (id) ON DELETE SET NULL,
|
||||
|
||||
-- Performance metrics
|
||||
duration_ms INTEGER,
|
||||
bytes_processed BIGINT,
|
||||
|
||||
-- Error information
|
||||
error_message TEXT,
|
||||
error_details JSONB,
|
||||
|
||||
-- Context
|
||||
session_id VARCHAR(100),
|
||||
user_agent TEXT,
|
||||
|
||||
-- Audit fields
|
||||
started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
||||
completed_at TIMESTAMP WITH TIME ZONE,
|
||||
executed_at TIMESTAMP WITH TIME ZONE,
|
||||
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
-- Deduplication hash store
|
||||
CREATE TABLE IF NOT EXISTS deduplication_store
|
||||
(
|
||||
hash VARCHAR(64) PRIMARY KEY,
|
||||
canonical_path TEXT NOT NULL,
|
||||
reference_count INTEGER DEFAULT 1,
|
||||
first_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
||||
last_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
-- Migration plan table
|
||||
CREATE TABLE IF NOT EXISTS migration_plans
|
||||
(
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
name VARCHAR(100) NOT NULL,
|
||||
source_disk VARCHAR(50) NOT NULL,
|
||||
target_disk VARCHAR(50) NOT NULL,
|
||||
plan_json JSONB NOT NULL,
|
||||
|
||||
-- Statistics
|
||||
total_files INTEGER DEFAULT 0,
|
||||
total_size BIGINT DEFAULT 0,
|
||||
estimated_duration INTEGER, -- in seconds
|
||||
|
||||
-- Status
|
||||
status VARCHAR(20) DEFAULT 'draft',
|
||||
|
||||
-- Audit
|
||||
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
||||
executed_at TIMESTAMP WITH TIME ZONE,
|
||||
completed_at TIMESTAMP WITH TIME ZONE
|
||||
);
|
||||
|
||||
-- Indexes for performance
|
||||
CREATE INDEX IF NOT EXISTS idx_files_path ON files (path);
|
||||
CREATE INDEX IF NOT EXISTS idx_files_hash ON files (file_hash);
|
||||
CREATE INDEX IF NOT EXISTS idx_files_disk ON files (disk_label);
|
||||
CREATE INDEX IF NOT EXISTS idx_files_category ON files (category);
|
||||
CREATE INDEX IF NOT EXISTS idx_files_status ON files (status);
|
||||
create index on files (checksum);
|
||||
create index on files (checksum, path);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_operations_status ON operations (status);
|
||||
CREATE INDEX IF NOT EXISTS idx_operations_created ON operations (created_at);
|
||||
CREATE INDEX IF NOT EXISTS idx_operations_file_id ON operations (file_id);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_dedup_canonical ON deduplication_store (canonical_path);

-- Functions for updating timestamps
CREATE OR REPLACE FUNCTION update_updated_at_column()
    RETURNS TRIGGER AS
$$
BEGIN
    NEW.updated_at = CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Triggers for automatic updated_at
-- Dropped first so the script stays re-runnable, consistent with the IF NOT EXISTS usage above.
DROP TRIGGER IF EXISTS update_files_updated_at ON files;
CREATE TRIGGER update_files_updated_at
    BEFORE UPDATE
    ON files
    FOR EACH ROW
EXECUTE FUNCTION update_updated_at_column();
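
-- The same function can maintain updated_at on any other table that carries
-- that column, e.g. the folders table added in a later migration (V003).
-- Commented sketch only; not executed here:
-- CREATE TRIGGER update_folders_updated_at
--     BEFORE UPDATE
--     ON folders
--     FOR EACH ROW
-- EXECUTE FUNCTION update_updated_at_column();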

-- View for operational dashboard
CREATE OR REPLACE VIEW operational_dashboard AS
SELECT o.status,
       COUNT(*)               as operation_count,
       SUM(o.bytes_processed) as total_bytes,
       AVG(o.duration_ms)     as avg_duration_ms,
       MIN(o.started_at)      as earliest_operation,
       MAX(o.completed_at)    as latest_operation
FROM operations o
WHERE o.started_at > CURRENT_TIMESTAMP - INTERVAL '24 hours'
GROUP BY o.status;

-- View for disk usage statistics
CREATE OR REPLACE VIEW disk_usage_stats AS
SELECT disk_label,
       COUNT(*)           as file_count,
       SUM(size)          as total_size,
       AVG(size)          as avg_file_size,
       MIN(created_time)  as oldest_file,
       MAX(modified_time) as newest_file
FROM files
GROUP BY disk_label;
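
-- Illustrative usage (commented out; not executed by this migration): the two
-- views above can back ad-hoc monitoring queries such as these.
-- SELECT * FROM operational_dashboard ORDER BY operation_count DESC;
-- SELECT disk_label,
--        file_count,
--        pg_size_pretty(total_size)            AS total,
--        pg_size_pretty(avg_file_size::bigint) AS avg_size
-- FROM disk_usage_stats
-- ORDER BY total_size DESC;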

-- Insert default configuration
INSERT INTO migration_plans (name, source_disk, target_disk, plan_json, status)
VALUES ('Default Migration Plan',
        'disk_d',
        'disk_e',
        '{"strategy": "hardlink", "verify_copies": true, "preserve_timestamps": true}'::jsonb,
        'draft')
ON CONFLICT DO NOTHING;

-- Create read-only user for monitoring
-- NOTE: 'monitor_password' is a placeholder; set a real secret per deployment.
DO
$$
    BEGIN
        IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'monitor_user') THEN
            CREATE USER monitor_user WITH PASSWORD 'monitor_password';
        END IF;
    END
$$;

GRANT CONNECT ON DATABASE disk_reorganizer_db TO monitor_user;
GRANT USAGE ON SCHEMA public TO monitor_user;
GRANT SELECT ON ALL TABLES IN SCHEMA public TO monitor_user;
GRANT SELECT ON operational_dashboard TO monitor_user;
GRANT SELECT ON disk_usage_stats TO monitor_user;
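
-- Illustrative usage (comment only): connecting as the monitoring user from a
-- shell; host, port and the real password are deployment-specific.
--   PGPASSWORD=monitor_password psql -h localhost -U monitor_user -d disk_reorganizer_db \
--     -c 'SELECT * FROM operational_dashboard;'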
11
sql/migration/V002__add_extracted_text.sql
Normal file
11
sql/migration/V002__add_extracted_text.sql
Normal file
@@ -0,0 +1,11 @@
-- Add extracted text and enrichment columns
ALTER TABLE files ADD COLUMN IF NOT EXISTS extracted_text TEXT;
ALTER TABLE files ADD COLUMN IF NOT EXISTS text_quality VARCHAR(20);
ALTER TABLE files ADD COLUMN IF NOT EXISTS enrichment JSONB;

-- Add indexes for text search
CREATE INDEX IF NOT EXISTS idx_files_extracted_text ON files USING gin(to_tsvector('english', extracted_text));
CREATE INDEX IF NOT EXISTS idx_files_enrichment ON files USING gin(enrichment);

-- Add full text search capability
CREATE INDEX IF NOT EXISTS idx_files_fts ON files USING gin(to_tsvector('english', COALESCE(extracted_text, '')));
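
-- Illustrative usage (commented out; not executed by this migration): a full
-- text search over extracted_text that matches the idx_files_fts expression
-- above so the GIN index can be used. The search term is a placeholder.
-- SELECT path, text_quality
-- FROM files
-- WHERE to_tsvector('english', COALESCE(extracted_text, ''))
--           @@ plainto_tsquery('english', 'backup strategy')
-- ORDER BY size DESC
-- LIMIT 20;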
41
sql/migration/V003__add_folder_support.sql
Normal file
41
sql/migration/V003__add_folder_support.sql
Normal file
@@ -0,0 +1,41 @@
CREATE TABLE IF NOT EXISTS folders
(
    id                  UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    path                TEXT NOT NULL UNIQUE,
    parent_path         TEXT,
    disk_label          VARCHAR(50),

    file_count          INT DEFAULT 0,
    total_size          BIGINT DEFAULT 0,

    project_type        VARCHAR(50),
    intent              TEXT,
    summary             TEXT,

    has_readme          BOOLEAN DEFAULT FALSE,
    has_git             BOOLEAN DEFAULT FALSE,
    has_manifest        BOOLEAN DEFAULT FALSE,
    manifest_types      TEXT[],
    dominant_file_types JSONB,

    structure           JSONB,

    created_at          TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at          TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_folders_path ON folders (path);
CREATE INDEX IF NOT EXISTS idx_folders_parent ON folders (parent_path);
CREATE INDEX IF NOT EXISTS idx_folders_disk ON folders (disk_label);
CREATE INDEX IF NOT EXISTS idx_folders_project_type ON folders (project_type);
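
-- Illustrative usage (commented out; not executed by this migration): largest
-- Git-backed project folders per disk, using the columns defined above.
-- SELECT disk_label, path, project_type, pg_size_pretty(total_size) AS size
-- FROM folders
-- WHERE has_git
-- ORDER BY total_size DESC
-- LIMIT 25;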

CREATE TABLE IF NOT EXISTS processing_checkpoints
(
    task_name           VARCHAR(100) PRIMARY KEY,
    last_processed_id   TEXT,
    last_processed_path TEXT,
    processed_count     INT DEFAULT 0,
    total_count         INT,
    started_at          TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at          TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);
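
-- Illustrative usage (commented out; not executed by this migration): how a
-- long-running task might save a resumable checkpoint. The task name, path
-- and counts are placeholders.
-- INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, total_count)
-- VALUES ('folder_enrichment', '/mnt/disk_d/projects/example', 1200, 50000)
-- ON CONFLICT (task_name) DO UPDATE
--     SET last_processed_path = EXCLUDED.last_processed_path,
--         processed_count     = EXCLUDED.processed_count,
--         updated_at          = CURRENT_TIMESTAMP;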