This commit is contained in:
mike
2025-12-13 11:56:06 +01:00
commit 2b2c575385
57 changed files with 6505 additions and 0 deletions

18
.aiignore Normal file

@@ -0,0 +1,18 @@
.DS_Store
*.log
*.tmp
dist/
build/
out/
.idea
node_modules/
.vscode/
.git
.github
scripts
.pytest_cache/
__pycache__
.aiignore
*.iml
.env
.bundle.md

44
.gitignore vendored Normal file

@@ -0,0 +1,44 @@
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
*.sqlite3
*.db
*.log
coverage.xml
*.coverage
.coverage
.coverage.*
.cache
nosetests.xml
pytest.xml
htmlcov/
.tox/
.pytest_cache/
.mypy_cache/
.pyre/
.idea
*.iml
backup_*.dump

340
ARCHITECTURE.md Normal file

@@ -0,0 +1,340 @@
# Data Reorganization Architecture: "Project Defrag"
## Executive Summary
This document outlines the architecture for reorganizing 20TB of backup data across multiple NVMe drives and servers. The solution implements intelligent deduplication, systematic categorization, and optimized storage patterns for enhanced performance and maintainability.
## System Architecture Overview
```mermaid
graph TB
subgraph "Source Environment"
A["Local Machine<br/>8x NVMe + 1 HDD<br/>~10TB"]
B["Server Machine<br/>Mixed Storage<br/>~10TB"]
end
subgraph "Processing Layer"
C["Discovery Engine"]
D["Classification Engine"]
E["Deduplication Engine"]
F["Migration Engine"]
end
subgraph "Target Architecture"
G["App Volumes"]
H["Gitea Repository"]
I["Build Cache (.maven, pycache)"]
J["Artifactories"]
K["Databases"]
L["Backups"]
M["LLM Model Cache"]
N["Git Infrastructure"]
end
A --> C
B --> C
C --> D
D --> E
E --> F
F --> G
F --> H
F --> I
F --> J
F --> K
F --> L
F --> M
F --> N
```
## Data Flow Architecture
### Phase 1: Discovery & Assessment
```mermaid
sequenceDiagram
participant D as Discovery Engine
participant FS as File System Scanner
participant DB as Metadata Database
participant API as System APIs
D->>FS: Scan directory structures
FS->>FS: Identify file types, sizes, dates
FS->>DB: Store file metadata
D->>API: Query system information
API->>DB: Store system context
DB->>D: Return analysis summary
```
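A minimal sketch of what this discovery pass could look like, assuming a throwaway SQLite metadata store and a plain `os.scandir` walk (table and column names here are illustrative, not the project's actual schema):
```python
import os
import sqlite3

def discover(root: str, db_path: str = "metadata.db") -> None:
    """Walk a directory tree and record basic file metadata (illustrative sketch)."""
    conn = sqlite3.connect(db_path)
    conn.execute(
        "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime REAL)"
    )
    stack = [root]
    while stack:
        current = stack.pop()
        try:
            with os.scandir(current) as entries:
                for entry in entries:
                    if entry.is_dir(follow_symlinks=False):
                        stack.append(entry.path)
                    elif entry.is_file(follow_symlinks=False):
                        st = entry.stat(follow_symlinks=False)
                        conn.execute(
                            "INSERT OR REPLACE INTO files VALUES (?, ?, ?)",
                            (entry.path, st.st_size, st.st_mtime),
                        )
        except PermissionError:
            continue  # unreadable directories are skipped, not fatal
    conn.commit()
    conn.close()
```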
### Phase 2: Classification & Deduplication
```mermaid
sequenceDiagram
participant C as Classifier
participant DH as Deduplication Hash
participant CDB as Canonical DB
participant MAP as Mapping Store
C->>C: Analyze file signatures
C->>DH: Generate content hashes
DH->>CDB: Check for duplicates
CDB->>DH: Return canonical reference
DH->>MAP: Store deduplication map
C->>C: Apply categorization rules
```
## Target Directory Structure
```
/mnt/organized/
├── apps/
│ ├── volumes/
│ │ ├── docker-volumes/
│ │ ├── app-data/
│ │ └── user-profiles/
│ └── runtime/
├── development/
│ ├── gitea/
│ │ ├── repositories/
│ │ ├── lfs-objects/
│ │ └── avatars/
│ ├── git-infrastructure/
│ │ ├── hooks/
│ │ ├── templates/
│ │ └── config/
│ └── build-tools/
│ ├── .maven/repository/
│ ├── gradle-cache/
│ └── sbt-cache/
├── artifacts/
│ ├── java/
│ │ ├── maven-central-cache/
│ │ ├── jfrog-artifactory/
│ │ └── gradle-build-cache/
│ ├── python/
│ │ ├── pypi-cache/
│ │ ├── wheelhouse/
│ │ └── pip-cache/
│ ├── node/
│ │ ├── npm-registry/
│ │ ├── yarn-cache/
│ │ └── pnpm-store/
│ └── go/
│ ├── goproxy-cache/
│ ├── module-cache/
│ └── sumdb-cache/
├── cache/
│ ├── llm-models/
│ │ ├── hugging-face/
│ │ ├── openai-cache/
│ │ └── local-llm/
│ ├── pycache/
│ ├── node_modules-archive/
│ └── browser-cache/
├── databases/
│ ├── postgresql/
│ ├── mysql/
│ ├── mongodb/
│ └── redis/
├── backups/
│ ├── system/
│ ├── application/
│ ├── database/
│ └── archive/
└── temp/
├── processing/
├── staging/
└── cleanup/
```
## Technology Stack Recommendation
### Primary Language: **Python 3.11+**
**Rationale:**
- Excellent file system handling capabilities
- Rich ecosystem for data processing (pandas, pyarrow)
- Built-in multiprocessing for I/O operations
- Superior hash library support for deduplication
- Cross-platform compatibility
### Key Libraries:
```python
# Core processing
import asyncio
import hashlib
import multiprocessing as mp
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
# Data handling
import pandas as pd
import pyarrow as pa
import sqlite3
import json
# File analysis
import magic # python-magic
import mimetypes
import filetype
# System integration
import psutil
import shutil
import os
```
## Deduplication Strategy
### Algorithm Selection: **Variable-Size Chunking with Rabin Fingerprinting**
```python
class AdvancedDeduplication:
def __init__(self, avg_chunk_size=8192):
self.chunker = RabinChunker(avg_chunk_size)
self.hash_store = HashStore()
def deduplicate_file(self, file_path):
chunks = self.chunker.chunk_file(file_path)
file_hash = self.compute_file_hash(chunks)
if self.hash_store.exists(file_hash):
return self.create_reference(file_hash)
else:
self.store_canonical(file_path, file_hash)
return file_hash
```
### Performance Optimization:
- **Parallel Processing**: Utilize all CPU cores for hashing
- **Memory Mapping**: For large files (>100MB)
- **Incremental Hashing**: Process files in streams
- **Cache Layer**: Redis for frequently accessed hashes
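A minimal sketch of the streaming vs. memory-mapping decision above (the 100 MB threshold follows the bullet list; chunk size is illustrative):
```python
import hashlib
import mmap
from pathlib import Path

LARGE_FILE_THRESHOLD = 100 * 1024 * 1024  # switch to mmap above ~100 MB

def hash_path(path: Path, chunk_size: int = 1 << 20) -> str:
    """Stream small files; memory-map large ones to avoid extra buffering."""
    hasher = hashlib.sha256()
    size = path.stat().st_size
    with open(path, "rb") as f:
        if size > LARGE_FILE_THRESHOLD:
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mapped:
                for offset in range(0, size, chunk_size):
                    hasher.update(mapped[offset:offset + chunk_size])
        else:
            for block in iter(lambda: f.read(chunk_size), b""):
                hasher.update(block)
    return hasher.hexdigest()
```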
## Classification Engine
### Rule-Based Classification System:
```yaml
classification_rules:
build_artifacts:
patterns:
- "**/target/**"
- "**/build/**"
- "**/dist/**"
- "**/node_modules/**"
action: categorize_as_build_cache
development_tools:
patterns:
- "**/.maven/**"
- "**/.gradle/**"
- "**/.npm/**"
- "**/.cache/**"
action: categorize_as_tool_cache
repositories:
patterns:
- "**/.git/**"
- "**/repositories/**"
- "**/gitea/**"
action: categorize_as_vcs
database_files:
patterns:
- "**/*.db"
- "**/*.sqlite"
- "**/postgresql/**"
- "**/mysql/**"
action: categorize_as_database
model_files:
patterns:
- "**/*.bin"
- "**/*.onnx"
- "**/models/**"
- "**/llm*/**"
action: categorize_as_ai_model
```
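A sketch of how such a rule file could be evaluated; it assumes the rules are stored as YAML on disk and uses `fnmatch`, the same matching primitive the repository's `RuleBasedClassifier` relies on (PyYAML is an assumed dependency here):
```python
import fnmatch
from pathlib import Path

import yaml  # PyYAML, assumed available for this sketch

def classify(path: Path, rules_file: str = "classification_rules.yaml") -> str | None:
    """Return the action of the first rule whose pattern matches the path."""
    with open(rules_file) as f:
        rules = yaml.safe_load(f)["classification_rules"]
    path_str = str(path)
    for rule in rules.values():
        if any(fnmatch.fnmatch(path_str, pattern) for pattern in rule["patterns"]):
            return rule["action"]
    return None
```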
## Performance Considerations
### NVMe Optimization Strategies:
1. **Parallel I/O Operations**
- Queue depth optimization (32-64 operations)
- Async I/O with io_uring where available
- Multi-threaded directory traversal
2. **Memory Management**
- Streaming processing for large files
- Memory-mapped file access
- Buffer pool for frequent operations
3. **CPU Optimization**
- SIMD instructions for hashing (AVX2/NEON)
- Process pool for parallel processing
- NUMA-aware memory allocation
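As one possible shape of the multi-threaded traversal mentioned above (a sketch; queue-depth tuning and io_uring live outside the standard library and are omitted, and the worker count is illustrative):
```python
import os
from concurrent.futures import ThreadPoolExecutor

def scan_tree(root: str, workers: int = 8) -> list[str]:
    """Breadth-first directory scan, one thread-pool batch per tree level."""
    files: list[str] = []

    def scan_dir(directory: str) -> list[str]:
        subdirs: list[str] = []
        try:
            with os.scandir(directory) as entries:
                for entry in entries:
                    if entry.is_dir(follow_symlinks=False):
                        subdirs.append(entry.path)
                    else:
                        files.append(entry.path)  # list.append is thread-safe under the GIL
        except PermissionError:
            pass
        return subdirs

    pending = [root]
    with ThreadPoolExecutor(max_workers=workers) as pool:
        while pending:
            pending = [d for subdirs in pool.map(scan_dir, pending) for d in subdirs]
    return files
```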
## Migration Strategy
### Three-Phase Approach:
```mermaid
graph LR
A[Phase 1: Analysis] --> B[Phase 2: Staging]
B --> C[Phase 3: Migration]
A --> A1[Discovery Scan]
A --> A2[Deduplication Analysis]
A --> A3[Space Calculation]
B --> B1[Create Target Structure]
B --> B2[Hard Link Staging]
B --> B3[Validation Check]
C --> C1[Atomic Move Operations]
C --> C2[Symlink Updates]
C --> C3[Cleanup Verification]
```
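One way the hard-link staging and atomic promotion steps could be realized (a sketch: `os.link` only works within a single filesystem, so a copy fallback is included, and validation here is reduced to a size check):
```python
import os
import shutil
from pathlib import Path

def stage_and_migrate(source: Path, target: Path, dry_run: bool = True) -> None:
    """Hard-link into a staging path, validate, then promote atomically."""
    staging = target.parent / (target.name + ".staging")
    if dry_run:
        print(f"[dry-run] {source} -> {target}")
        return
    staging.parent.mkdir(parents=True, exist_ok=True)
    try:
        os.link(source, staging)       # no data copied; same filesystem only
    except OSError:
        shutil.copy2(source, staging)  # cross-device fallback: real copy
    if staging.stat().st_size != source.stat().st_size:
        staging.unlink()
        raise RuntimeError(f"Validation failed for {source}")
    os.replace(staging, target)        # atomic rename within a filesystem
```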
## Monitoring & Validation
### Key Metrics:
- **Processing Rate**: Files/second, GB/hour
- **Deduplication Ratio**: Original vs. Final size
- **Error Rate**: Failed operations percentage
- **Resource Usage**: CPU, Memory, I/O utilization
### Validation Checks:
- File integrity verification (hash comparison)
- Directory structure validation
- Symlink resolution testing
- Permission preservation audit
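The hash-comparison check, for instance, can stay this small (sketch):
```python
import hashlib
from pathlib import Path

def verify_copy(source: Path, target: Path, chunk_size: int = 1 << 20) -> bool:
    """Confirm a migrated file by comparing SHA-256 digests of both copies."""
    def digest(path: Path) -> str:
        hasher = hashlib.sha256()
        with open(path, "rb") as f:
            for block in iter(lambda: f.read(chunk_size), b""):
                hasher.update(block)
        return hasher.hexdigest()

    return digest(source) == digest(target)
```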
## Risk Mitigation
### Safety Measures:
1. **Read-First Approach**: Never modify source until validation
2. **Incremental Processing**: Process in small batches
3. **Backup Verification**: Ensure backup integrity before operations
4. **Rollback Capability**: Maintain reverse mapping for recovery
5. **Dry-Run Mode**: Preview all operations before execution
## Implementation Timeline
### Phase 1: Tool Development (2-3 weeks)
- Core discovery engine
- Classification system
- Basic deduplication
- Testing framework
### Phase 2: Staging & Validation (1-2 weeks)
- Target structure creation
- Sample data processing
- Performance optimization
- Safety verification
### Phase 3: Production Migration (2-4 weeks)
- Full data processing
- Continuous monitoring
- Issue resolution
- Final validation
This architecture provides a robust, scalable solution for the data reorganization effort while maintaining data integrity and optimizing for the NVMe storage infrastructure.

38
Dockerfile Normal file

@@ -0,0 +1,38 @@
# Dockerfile for Project Defrag with PostgreSQL integration
FROM python:3.11-slim
# Install system dependencies
RUN apt-get update && apt-get install -y \
gcc \
g++ \
libpq-dev \
postgresql-client \
&& rm -rf /var/lib/apt/lists/*
# Set working directory
WORKDIR /app
# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PYTHONPATH=/app
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY . .
# Create non-root user
RUN useradd -m -u 1000 appuser && \
chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
CMD python -c "import psycopg2; psycopg2.connect(dbname='${POSTGRES_DB:-disk_reorganizer_db}', user='${POSTGRES_USER:-disk_reorg_user}', password='${POSTGRES_PASSWORD}', host='${DB_HOST:-db}', port='${DB_PORT:-5432}')" || exit 1
# Default command (can be overridden in docker-compose)
CMD ["python", "app/main.py", "--help"]

114
README.md Normal file

@@ -0,0 +1,114 @@
# Disk Reorganizer — Architectural Summary
## Core Outcome
Migration from **SQLite → PostgreSQL** completed.
System is now **network-capable**, **auditable**, **scalable**, and offers **real-time operational telemetry**.
---
## Architecture
### Database Layer (PostgreSQL)
* Central DB: `disk_reorganizer_db`
* User: `disk_reorg_user`
* Tables: `files`, `operations`
* Features: indexes, triggers, conflict-upserts, audit fields
* Deployment: SQL + Windows/Linux setup scripts
### Application Layer
* Python driver migrated to **psycopg2**
* Unified DB config + connection pooling
* Refactored CRUD + batch commits
* Robust error handling + transactional execution
### Operational Layer
* **Dynamic in-screen logging** during indexing + migration
* File/sec, GB processed, ETA, success/error counters
* Clean single-line, non-spamming UI updates
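A sketch of the single-line update pattern (field layout is illustrative; the repository's `ProgressLogger` may format these differently):
```python
import sys
import time

def report_progress(done: int, total: int, started_at: float) -> None:
    """Overwrite one status line instead of appending a new log line per file."""
    elapsed = time.time() - started_at
    rate = done / elapsed if elapsed > 0 else 0.0
    eta = (total - done) / rate if rate > 0 else float("inf")
    sys.stdout.write(f"\r{done}/{total} files | {rate:,.0f} files/s | ETA {eta:,.0f}s")
    sys.stdout.flush()
```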
---
## Workflow
1. **Setup**
```json
{
"host": "192.168.1.159",
"port": 5432,
"database": "disk_reorganizer_db",
"user": "disk_reorg_user",
"password": "heel-goed-wachtwoord"
}
```
```bash
./setup_database.sh # or setup_database.bat
pip install -r requirements.txt
```
2. **Index**
```bash
python app/main.py index "D:\\" disk_d
```
3. **Plan**
```bash
python app/main.py plan disk_d disk_e
```
4. **Dry-Run**
```bash
python app/main.py execute plan.json --dry-run
```
5. **Execute**
```bash
python app/main.py execute plan.json
```
6. **Report**
```bash
python app/main.py report
```
---
## Guarantees
* No destructive actions by default
* Originals preserved
* Every action logged in DB
* Error-resilient, continues safely
* Suitable for millions of file records
---
## Failure Points to Check
* PostgreSQL reachable on 5432
* Correct credentials
* Disk permissions
* Python + psycopg2 installed
---
## Essence
A lean, safe, high-visibility disk migration tool running on a proper relational backbone, engineered for clarity, scale, and operational certainty.


@@ -0,0 +1,63 @@
from pathlib import Path
from typing import Dict, Set, List
from collections import Counter
class FolderAnalyzer:
def __init__(self):
self.manifest_files = {'java': ['pom.xml', 'build.gradle', 'build.gradle.kts'], 'javascript': ['package.json', 'yarn.lock', 'package-lock.json'], 'python': ['pyproject.toml', 'setup.py', 'requirements.txt', 'Pipfile'], 'go': ['go.mod', 'go.sum'], 'rust': ['Cargo.toml', 'Cargo.lock'], 'docker': ['Dockerfile', 'docker-compose.yml', 'docker-compose.yaml'], 'k8s': ['helm', 'kustomization.yaml', 'deployment.yaml']}
self.intent_keywords = {'infrastructure': ['infra', 'deploy', 'k8s', 'docker', 'terraform', 'ansible'], 'application': ['app', 'service', 'api', 'server', 'client'], 'data': ['data', 'dataset', 'models', 'training', 'ml'], 'documentation': ['docs', 'documentation', 'wiki', 'readme'], 'testing': ['test', 'tests', 'spec', 'e2e', 'integration'], 'build': ['build', 'dist', 'target', 'out', 'bin'], 'config': ['config', 'conf', 'settings', 'env']}
def analyze_folder(self, folder_path: Path, files: List[Dict]) -> Dict:
files_list = [Path(f['path']) for f in files]
has_readme = any(('readme' in f.name.lower() for f in files_list))
has_git = any(('.git' in str(f) for f in files_list))
manifest_types = self._detect_manifests(files_list)
has_manifest = len(manifest_types) > 0
file_types = Counter((f.suffix.lower() for f in files_list if f.suffix))
dominant_types = dict(file_types.most_common(10))
intent = self._infer_intent(folder_path.name.lower(), files_list)
project_type = self._infer_project_type(manifest_types, dominant_types)
structure = {'depth': len(folder_path.parts), 'has_src': any(('src' in str(f) for f in files_list[:20])), 'has_tests': any(('test' in str(f) for f in files_list[:20])), 'has_docs': any(('doc' in str(f) for f in files_list[:20]))}
return {'has_readme': has_readme, 'has_git': has_git, 'has_manifest': has_manifest, 'manifest_types': manifest_types, 'dominant_file_types': dominant_types, 'project_type': project_type, 'intent': intent, 'structure': structure}
def _detect_manifests(self, files: List[Path]) -> List[str]:
detected = []
file_names = {f.name for f in files}
for tech, manifests in self.manifest_files.items():
if any((m in file_names for m in manifests)):
detected.append(tech)
return detected
def _infer_intent(self, folder_name: str, files: List[Path]) -> str:
file_str = ' '.join((str(f) for f in files[:50]))
for intent, keywords in self.intent_keywords.items():
if any((kw in folder_name or kw in file_str.lower() for kw in keywords)):
return intent
return 'unknown'
def _infer_project_type(self, manifests: List[str], file_types: Dict) -> str:
if manifests:
return manifests[0]
if '.py' in file_types and file_types.get('.py', 0) > 5:
return 'python'
if '.js' in file_types or '.ts' in file_types:
return 'javascript'
if '.java' in file_types:
return 'java'
if '.go' in file_types:
return 'go'
return 'mixed'
def generate_summary(self, folder_analysis: Dict, readme_text: str=None) -> str:
parts = []
if folder_analysis.get('project_type'):
parts.append(f"{folder_analysis['project_type']} project")
if folder_analysis.get('intent'):
parts.append(f"for {folder_analysis['intent']}")
if folder_analysis.get('manifest_types'):
parts.append(f"using {', '.join(folder_analysis['manifest_types'])}")
if readme_text:
first_para = readme_text.split('\n\n')[0][:200]
parts.append(f'Description: {first_para}')
return ' '.join(parts) if parts else 'Mixed content folder'


@@ -0,0 +1,2 @@
from .classifier import FileClassifier
__all__ = ['FileClassifier']


@@ -0,0 +1,30 @@
from typing import Protocol, Optional
from pathlib import Path
from dataclasses import dataclass
@dataclass
class ClassificationRule:
name: str
category: str
patterns: list[str]
priority: int = 0
description: str = ''
class IClassifier(Protocol):
def classify(self, path: Path, file_type: Optional[str]=None) -> Optional[str]:
...
def get_category_rules(self, category: str) -> list[ClassificationRule]:
...
class IRuleEngine(Protocol):
def add_rule(self, rule: ClassificationRule) -> None:
...
def remove_rule(self, rule_name: str) -> None:
...
def match_path(self, path: Path) -> Optional[str]:
...


@@ -0,0 +1,74 @@
from pathlib import Path
from typing import List, Set, Dict, Tuple
import re
class FileClassifier:
def __init__(self):
self.build_patterns = {'node_modules', '__pycache__', '.pytest_cache', 'target', 'build', 'dist', '.gradle', 'bin', 'obj', '.next', '.nuxt', 'vendor', '.venv', 'venv', 'site-packages', 'bower_components', 'jspm_packages'}
self.artifact_patterns = {'java': {'.jar', '.war', '.ear', '.class'}, 'python': {'.pyc', '.pyo', '.whl', '.egg'}, 'node': {'node_modules'}, 'go': {'vendor', 'pkg'}, 'rust': {'target'}, 'docker': {'.dockerignore', 'Dockerfile'}}
self.category_keywords = {'apps': {'app', 'application', 'service', 'api', 'server', 'client'}, 'infra': {'infrastructure', 'devops', 'docker', 'kubernetes', 'terraform', 'ansible', 'gitea', 'jenkins'}, 'dev': {'project', 'workspace', 'repo', 'src', 'code', 'dev'}, 'cache': {'cache', 'temp', 'tmp', '.cache'}, 'databases': {'postgres', 'mysql', 'redis', 'mongo', 'db', 'database'}, 'backups': {'backup', 'bak', 'snapshot', 'archive'}, 'user': {'documents', 'pictures', 'videos', 'downloads', 'desktop', 'music'}, 'artifacts': {'build', 'dist', 'release', 'output'}, 'temp': {'tmp', 'temp', 'staging', 'processing'}}
self.media_extensions = {'video': {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv', '.webm'}, 'audio': {'.mp3', '.flac', '.wav', '.ogg', '.m4a', '.aac'}, 'image': {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'}, 'document': {'.pdf', '.doc', '.docx', '.txt', '.md', '.odt'}, 'spreadsheet': {'.xls', '.xlsx', '.csv', '.ods'}, 'presentation': {'.ppt', '.pptx', '.odp'}}
self.code_extensions = {'.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h', '.cs', '.rb', '.php', '.swift', '.kt', '.scala', '.clj', '.r'}
def classify_path(self, path: str, size: int=0) -> Tuple[Set[str], str, bool]:
p = Path(path)
labels = set()
primary_category = 'misc'
is_build_artifact = False
parts = p.parts
name_lower = p.name.lower()
for part in parts:
part_lower = part.lower()
if part_lower in self.build_patterns:
is_build_artifact = True
labels.add('build-artifact')
break
if is_build_artifact:
for artifact_type, patterns in self.artifact_patterns.items():
if any((part.lower() in patterns for part in parts)) or p.suffix in patterns:
primary_category = f'artifacts/{artifact_type}'
labels.add('artifact')
return (labels, primary_category, is_build_artifact)
if '.git' in parts:
labels.add('vcs')
primary_category = 'infra/git-infrastructure'
return (labels, primary_category, False)
for category, keywords in self.category_keywords.items():
if any((kw in name_lower or any((kw in part.lower() for part in parts)) for kw in keywords)):
labels.add(category)
primary_category = category
break
for media_type, extensions in self.media_extensions.items():
if p.suffix.lower() in extensions:
labels.add(media_type)
labels.add('media')
primary_category = f'user/{media_type}'
break
if p.suffix.lower() in self.code_extensions:
labels.add('code')
if primary_category == 'misc':
primary_category = 'dev'
if size > 100 * 1024 * 1024:
labels.add('large-file')
if any((kw in name_lower for kw in ['test', 'spec', 'mock'])):
labels.add('test')
if any((kw in name_lower for kw in ['config', 'settings', 'env'])):
labels.add('config')
return (labels, primary_category, is_build_artifact)
def suggest_target_path(self, source_path: str, category: str, labels: Set[str]) -> str:
p = Path(source_path)
if 'build-artifact' in labels:
return f'trash/build-artifacts/{source_path}'
if category.startswith('artifacts/'):
artifact_type = category.split('/')[-1]
return f'artifacts/{artifact_type}/{p.name}'
if category.startswith('user/'):
media_type = category.split('/')[-1]
return f'user/{media_type}/{p.name}'
parts = [part for part in p.parts if part not in self.build_patterns]
if len(parts) > 3:
project_name = parts[0] if parts else 'misc'
return f"{category}/{project_name}/{'/'.join(parts[1:])}"
return f'{category}/{source_path}'


@@ -0,0 +1,148 @@
from pathlib import Path
from typing import Optional, Callable
import psycopg2
from .rules import RuleBasedClassifier
from .ml import create_ml_classifier, DummyMLClassifier
from ..shared.models import ProcessingStats
from ..shared.config import DatabaseConfig
from ..shared.logger import ProgressLogger
class ClassificationEngine:
def __init__(self, db_config: DatabaseConfig, logger: ProgressLogger, use_ml: bool=False):
self.db_config = db_config
self.logger = logger
self.rule_classifier = RuleBasedClassifier()
self.ml_classifier = create_ml_classifier() if use_ml else None
self.use_ml = use_ml and (not isinstance(self.ml_classifier, DummyMLClassifier))
self._connection = None
def _get_connection(self):
if self._connection is None or self._connection.closed:
self._connection = psycopg2.connect(host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password)
return self._connection
def classify_all(self, disk: Optional[str]=None, batch_size: int=1000, progress_callback: Optional[Callable[[int, int, ProcessingStats], None]]=None) -> ProcessingStats:
self.logger.section('Starting Classification')
conn = self._get_connection()
cursor = conn.cursor()
if disk:
cursor.execute('\n SELECT path, checksum\n FROM files\n WHERE disk_label = %s AND category IS NULL\n ', (disk,))
else:
cursor.execute('\n SELECT path, checksum\n FROM files\n WHERE category IS NULL\n ')
files_to_classify = cursor.fetchall()
total_files = len(files_to_classify)
self.logger.info(f'Found {total_files} files to classify')
stats = ProcessingStats()
batch = []
for path_str, checksum in files_to_classify:
path = Path(path_str)
category = self.rule_classifier.classify(path)
if category is None and self.use_ml and self.ml_classifier:
category = self.ml_classifier.classify(path)
if category is None:
category = 'temp/processing'
batch.append((category, str(path)))
stats.files_processed += 1
if len(batch) >= batch_size:
self._update_categories(cursor, batch)
conn.commit()
batch.clear()
if progress_callback:
progress_callback(stats.files_processed, total_files, stats)
if stats.files_processed % (batch_size * 10) == 0:
self.logger.progress(stats.files_processed, total_files, prefix='Files classified', elapsed_seconds=stats.elapsed_seconds)
if batch:
self._update_categories(cursor, batch)
conn.commit()
stats.files_succeeded = stats.files_processed
cursor.close()
self.logger.info(f'Classification complete: {stats.files_processed} files in {stats.elapsed_seconds:.1f}s')
return stats
def _update_categories(self, cursor, batch: list[tuple[str, str]]):
from psycopg2.extras import execute_batch
query = '\n UPDATE files\n SET category = %s\n WHERE path = %s\n '
execute_batch(cursor, query, batch)
def classify_path(self, path: Path) -> Optional[str]:
category = self.rule_classifier.classify(path)
if category is None and self.use_ml and self.ml_classifier:
category = self.ml_classifier.classify(path)
return category
def get_category_stats(self) -> dict[str, dict]:
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute('\n SELECT\n category,\n COUNT(*) as file_count,\n SUM(size) as total_size\n FROM files\n WHERE category IS NOT NULL\n GROUP BY category\n ORDER BY total_size DESC\n ')
stats = {}
for category, file_count, total_size in cursor.fetchall():
stats[category] = {'file_count': file_count, 'total_size': total_size}
cursor.close()
return stats
def get_uncategorized_count(self) -> int:
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute('SELECT COUNT(*) FROM files WHERE category IS NULL')
count = cursor.fetchone()[0]
cursor.close()
return count
def reclassify_category(self, old_category: str, new_category: str) -> int:
self.logger.info(f'Reclassifying {old_category} -> {new_category}')
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute('\n UPDATE files\n SET category = %s\n WHERE category = %s\n ', (new_category, old_category))
count = cursor.rowcount
conn.commit()
cursor.close()
self.logger.info(f'Reclassified {count} files')
return count
def train_ml_classifier(self, min_samples: int=10) -> bool:
if not self.use_ml or self.ml_classifier is None:
self.logger.warning('ML classifier not available')
return False
self.logger.subsection('Training ML Classifier')
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute('\n SELECT path, category\n FROM files\n WHERE category IS NOT NULL\n ')
training_data = [(Path(path), category) for path, category in cursor.fetchall()]
cursor.close()
if not training_data:
self.logger.warning('No training data available')
return False
category_counts = {}
for _, category in training_data:
category_counts[category] = category_counts.get(category, 0) + 1
filtered_data = [(path, category) for path, category in training_data if category_counts[category] >= min_samples]
if not filtered_data:
self.logger.warning(f'No categories with >= {min_samples} samples')
return False
self.logger.info(f'Training with {len(filtered_data)} samples')
try:
self.ml_classifier.train(filtered_data)
self.logger.info('ML classifier trained successfully')
return True
except Exception as e:
self.logger.error(f'Failed to train ML classifier: {e}')
return False
def get_all_categories(self) -> list[str]:
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute('\n SELECT DISTINCT category\n FROM files\n WHERE category IS NOT NULL\n ORDER BY category\n ')
categories = [row[0] for row in cursor.fetchall()]
cursor.close()
return categories
def close(self):
if self._connection and (not self._connection.closed):
self._connection.close()
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()

127
app/classification/ml.py Normal file

@@ -0,0 +1,127 @@
from pathlib import Path
from typing import Optional, List, Tuple
import pickle
try:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
SKLEARN_AVAILABLE = True
except ImportError:
SKLEARN_AVAILABLE = False
class MLClassifier:
def __init__(self):
if not SKLEARN_AVAILABLE:
raise ImportError('scikit-learn is required for ML classification. Install with: pip install scikit-learn')
self.model: Optional[Pipeline] = None
self.categories: List[str] = []
self._is_trained = False
def _extract_features(self, path: Path) -> str:
parts = path.parts
extension = path.suffix
filename = path.name
features = []
features.extend(parts)
if extension:
features.append(f'ext:{extension}')
name_parts = filename.replace('-', ' ').replace('_', ' ').replace('.', ' ').split()
features.extend([f'name:{part}' for part in name_parts])
return ' '.join(features)
def train(self, training_data: List[Tuple[Path, str]]) -> None:
if not training_data:
raise ValueError('Training data cannot be empty')
X = [self._extract_features(path) for path, _ in training_data]
y = [category for _, category in training_data]
self.categories = sorted(set(y))
self.model = Pipeline([('tfidf', TfidfVectorizer(max_features=1000, ngram_range=(1, 2), min_df=1)), ('classifier', MultinomialNB())])
self.model.fit(X, y)
self._is_trained = True
def classify(self, path: Path, file_type: Optional[str]=None) -> Optional[str]:
if not self._is_trained or self.model is None:
return None
features = self._extract_features(path)
try:
prediction = self.model.predict([features])[0]
return prediction
except Exception:
return None
def predict_proba(self, path: Path) -> dict[str, float]:
if not self._is_trained or self.model is None:
return {}
features = self._extract_features(path)
try:
probabilities = self.model.predict_proba([features])[0]
return {category: float(prob) for category, prob in zip(self.categories, probabilities)}
except Exception:
return {}
def save_model(self, model_path: Path) -> None:
if not self._is_trained:
raise ValueError('Cannot save untrained model')
model_data = {'model': self.model, 'categories': self.categories, 'is_trained': self._is_trained}
with open(model_path, 'wb') as f:
pickle.dump(model_data, f)
def load_model(self, model_path: Path) -> None:
with open(model_path, 'rb') as f:
model_data = pickle.load(f)
self.model = model_data['model']
self.categories = model_data['categories']
self._is_trained = model_data['is_trained']
@property
def is_trained(self) -> bool:
return self._is_trained
class DummyMLClassifier:
def __init__(self):
pass
def train(self, training_data: List[Tuple[Path, str]]) -> None:
raise NotImplementedError('ML classification requires scikit-learn. Install with: pip install scikit-learn')
def classify(self, path: Path, file_type: Optional[str]=None) -> Optional[str]:
return None
def predict_proba(self, path: Path) -> dict[str, float]:
return {}
def save_model(self, model_path: Path) -> None:
raise NotImplementedError('ML classification not available')
def load_model(self, model_path: Path) -> None:
raise NotImplementedError('ML classification not available')
@property
def is_trained(self) -> bool:
return False
def create_ml_classifier() -> MLClassifier | DummyMLClassifier:
if SKLEARN_AVAILABLE:
return MLClassifier()
else:
return DummyMLClassifier()
def train_from_database(db_connection, min_samples_per_category: int=10) -> MLClassifier | DummyMLClassifier:
classifier = create_ml_classifier()
if isinstance(classifier, DummyMLClassifier):
return classifier
cursor = db_connection.cursor()
cursor.execute('\n SELECT path, category\n FROM files\n WHERE category IS NOT NULL\n ')
training_data = [(Path(path), category) for path, category in cursor.fetchall()]
cursor.close()
if not training_data:
return classifier
category_counts = {}
for _, category in training_data:
category_counts[category] = category_counts.get(category, 0) + 1
filtered_data = [(path, category) for path, category in training_data if category_counts[category] >= min_samples_per_category]
if filtered_data:
classifier.train(filtered_data)
return classifier


@@ -0,0 +1,60 @@
from pathlib import Path
from typing import Optional
import fnmatch
from ._protocols import ClassificationRule
class RuleBasedClassifier:
def __init__(self):
self.rules: list[ClassificationRule] = []
self._load_default_rules()
def _load_default_rules(self):
self.add_rule(ClassificationRule(name='maven_cache', category='artifacts/java/maven', patterns=['**/.m2/**', '**/.maven/**', '**/maven-central-cache/**'], priority=10, description='Maven repository and cache'))
self.add_rule(ClassificationRule(name='gradle_cache', category='artifacts/java/gradle', patterns=['**/.gradle/**', '**/gradle-cache/**', '**/gradle-build-cache/**'], priority=10, description='Gradle cache and artifacts'))
self.add_rule(ClassificationRule(name='python_cache', category='cache/pycache', patterns=['**/__pycache__/**', '**/*.pyc', '**/*.pyo'], priority=10, description='Python cache files'))
self.add_rule(ClassificationRule(name='python_artifacts', category='artifacts/python', patterns=['**/pip-cache/**', '**/pypi-cache/**', '**/wheelhouse/**'], priority=10, description='Python package artifacts'))
self.add_rule(ClassificationRule(name='node_modules', category='cache/node_modules-archive', patterns=['**/node_modules/**'], priority=10, description='Node.js modules'))
self.add_rule(ClassificationRule(name='node_cache', category='artifacts/node', patterns=['**/.npm/**', '**/npm-registry/**', '**/yarn-cache/**', '**/pnpm-store/**'], priority=10, description='Node.js package managers cache'))
self.add_rule(ClassificationRule(name='go_cache', category='artifacts/go', patterns=['**/goproxy-cache/**', '**/go/pkg/mod/**', '**/go-module-cache/**'], priority=10, description='Go module cache'))
self.add_rule(ClassificationRule(name='git_repos', category='development/git-infrastructure', patterns=['**/.git/**', '**/gitea/repositories/**'], priority=15, description='Git repositories and infrastructure'))
self.add_rule(ClassificationRule(name='gitea', category='development/gitea', patterns=['**/gitea/**'], priority=12, description='Gitea server data'))
self.add_rule(ClassificationRule(name='postgresql', category='databases/postgresql', patterns=['**/postgresql/**', '**/postgres/**', '**/*.sql'], priority=10, description='PostgreSQL databases'))
self.add_rule(ClassificationRule(name='mysql', category='databases/mysql', patterns=['**/mysql/**', '**/mariadb/**'], priority=10, description='MySQL/MariaDB databases'))
self.add_rule(ClassificationRule(name='mongodb', category='databases/mongodb', patterns=['**/mongodb/**', '**/mongo/**'], priority=10, description='MongoDB databases'))
self.add_rule(ClassificationRule(name='redis', category='databases/redis', patterns=['**/redis/**', '**/*.rdb'], priority=10, description='Redis databases'))
self.add_rule(ClassificationRule(name='sqlite', category='databases/sqlite', patterns=['**/*.db', '**/*.sqlite', '**/*.sqlite3'], priority=8, description='SQLite databases'))
self.add_rule(ClassificationRule(name='llm_models', category='cache/llm-models', patterns=['**/hugging-face/**', '**/huggingface/**', '**/.cache/huggingface/**', '**/models/**/*.bin', '**/models/**/*.onnx', '**/models/**/*.safetensors', '**/llm*/**', '**/openai-cache/**'], priority=12, description='LLM and AI model files'))
self.add_rule(ClassificationRule(name='docker_volumes', category='apps/volumes/docker-volumes', patterns=['**/docker/volumes/**', '**/var/lib/docker/volumes/**'], priority=10, description='Docker volumes'))
self.add_rule(ClassificationRule(name='app_data', category='apps/volumes/app-data', patterns=['**/app-data/**', '**/application-data/**'], priority=8, description='Application data'))
self.add_rule(ClassificationRule(name='build_output', category='development/build-tools', patterns=['**/target/**', '**/build/**', '**/dist/**', '**/out/**'], priority=5, description='Build output directories'))
self.add_rule(ClassificationRule(name='system_backups', category='backups/system', patterns=['**/backup/**', '**/backups/**', '**/*.bak', '**/*.backup'], priority=10, description='System backups'))
self.add_rule(ClassificationRule(name='database_backups', category='backups/database', patterns=['**/*.sql.gz', '**/*.dump', '**/db-backup/**'], priority=11, description='Database backups'))
self.add_rule(ClassificationRule(name='archives', category='backups/archive', patterns=['**/*.tar', '**/*.tar.gz', '**/*.tgz', '**/*.zip', '**/*.7z'], priority=5, description='Archive files'))
def add_rule(self, rule: ClassificationRule) -> None:
self.rules.append(rule)
self.rules.sort(key=lambda r: r.priority, reverse=True)
def remove_rule(self, rule_name: str) -> None:
self.rules = [r for r in self.rules if r.name != rule_name]
def match_path(self, path: Path) -> Optional[str]:
path_str = str(path)
for rule in self.rules:
for pattern in rule.patterns:
if fnmatch.fnmatch(path_str, pattern):
return rule.category
return None
def classify(self, path: Path, file_type: Optional[str]=None) -> Optional[str]:
return self.match_path(path)
def get_category_rules(self, category: str) -> list[ClassificationRule]:
return [r for r in self.rules if r.category == category]
def get_all_categories(self) -> set[str]:
return {r.category for r in self.rules}
def get_rules_by_priority(self, min_priority: int=0) -> list[ClassificationRule]:
return [r for r in self.rules if r.priority >= min_priority]

3
app/content/__init__.py Normal file

@@ -0,0 +1,3 @@
from .profiler import ContentProfiler
from .extractors import ContentExtractor
__all__ = ['ContentProfiler', 'ContentExtractor']

62
app/content/extractors.py Normal file

@@ -0,0 +1,62 @@
from pathlib import Path
from typing import Dict, Optional
import json
class ContentExtractor:
def __init__(self):
self.extractors = {'pdf_text': self._extract_pdf, 'ocr+caption': self._extract_image, 'transcribe': self._extract_audio, 'transcribe+scenes': self._extract_video, 'office_text': self._extract_document, 'read': self._extract_text, 'read+syntax': self._extract_code}
def extract(self, file_path: Path, extractor_type: str) -> Dict:
extractor = self.extractors.get(extractor_type)
if not extractor:
return {'error': f'Unknown extractor: {extractor_type}'}
try:
return extractor(file_path)
except Exception as e:
return {'error': str(e)}
def _extract_text(self, file_path: Path) -> Dict:
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read(1024 * 1024)
return {'text': content, 'char_count': len(content), 'needs_llm': False}
except Exception as e:
return {'error': str(e)}
def _extract_code(self, file_path: Path) -> Dict:
result = self._extract_text(file_path)
if 'error' not in result:
result['type'] = 'code'
result['needs_llm'] = True
return result
def _extract_pdf(self, file_path: Path) -> Dict:
try:
import PyPDF2
text_parts = []
with open(file_path, 'rb') as f:
pdf = PyPDF2.PdfReader(f)
for page in pdf.pages[:10]:
text_parts.append(page.extract_text())
text = '\n'.join(text_parts)
return {'text': text, 'pages_extracted': len(text_parts), 'needs_llm': len(text.strip()) > 100, 'type': 'document'}
except Exception as e:
return {'error': str(e), 'needs_ocr': True}
def _extract_image(self, file_path: Path) -> Dict:
return {'type': 'image', 'needs_ocr': True, 'needs_caption': True, 'needs_llm': True, 'pipeline': ['ocr', 'caption', 'embedding'], 'status': 'pending'}
def _extract_audio(self, file_path: Path) -> Dict:
return {'type': 'audio', 'needs_transcription': True, 'needs_llm': True, 'pipeline': ['transcribe', 'summarize'], 'status': 'pending'}
def _extract_video(self, file_path: Path) -> Dict:
return {'type': 'video', 'needs_transcription': True, 'needs_scene_detection': True, 'needs_llm': True, 'pipeline': ['transcribe', 'scenes', 'summarize'], 'status': 'pending'}
def _extract_document(self, file_path: Path) -> Dict:
try:
import textract
text = textract.process(str(file_path)).decode('utf-8')
return {'text': text, 'type': 'document', 'needs_llm': len(text.strip()) > 100}
except:
return {'error': 'textract failed', 'needs_llm': True}

108
app/content/profiler.py Normal file

@@ -0,0 +1,108 @@
from pathlib import Path
from typing import Dict, Optional, Tuple
import mimetypes
import magic
import json
from datetime import datetime
class ContentProfiler:
def __init__(self):
self.mime_detector = magic.Magic(mime=True)
self.kind_mapping = {'text': ['text/plain', 'text/html', 'text/css', 'text/javascript', 'text/markdown'], 'code': ['application/x-python', 'application/javascript', 'text/x-java', 'text/x-c'], 'pdf': ['application/pdf'], 'image': ['image/jpeg', 'image/png', 'image/gif', 'image/webp', 'image/svg+xml'], 'audio': ['audio/mpeg', 'audio/wav', 'audio/ogg', 'audio/flac'], 'video': ['video/mp4', 'video/x-matroska', 'video/avi', 'video/webm'], 'archive': ['application/zip', 'application/x-tar', 'application/gzip', 'application/x-7z-compressed'], 'document': ['application/msword', 'application/vnd.openxmlformats-officedocument'], 'spreadsheet': ['application/vnd.ms-excel', 'text/csv']}
self.text_exts = {'.txt', '.md', '.rst', '.log', '.json', '.xml', '.yaml', '.yml', '.toml', '.ini', '.cfg'}
self.code_exts = {'.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h', '.cs', '.rb', '.php'}
self.processable_kinds = {'text', 'code', 'pdf', 'image', 'audio', 'video', 'document'}
def profile_file(self, file_path: Path) -> Dict:
try:
stat = file_path.stat()
size = stat.st_size
mtime = datetime.fromtimestamp(stat.st_mtime)
mime_type = self._detect_mime(file_path)
kind = self._determine_kind(file_path, mime_type)
profile = {'path': str(file_path), 'size': size, 'mtime': mtime.isoformat(), 'mime': mime_type, 'kind': kind, 'processable': kind in self.processable_kinds, 'extractor': self._suggest_extractor(kind, mime_type), 'hints': self._extract_hints(file_path, kind, mime_type, size)}
return profile
except Exception as e:
return {'path': str(file_path), 'error': str(e), 'processable': False}
def _detect_mime(self, file_path: Path) -> str:
try:
return self.mime_detector.from_file(str(file_path))
except:
guess = mimetypes.guess_type(str(file_path))[0]
return guess or 'application/octet-stream'
def _determine_kind(self, file_path: Path, mime_type: str) -> str:
for kind, mimes in self.kind_mapping.items():
if any((mime in mime_type for mime in mimes)):
return kind
suffix = file_path.suffix.lower()
if suffix in self.text_exts:
return 'text'
if suffix in self.code_exts:
return 'code'
return 'unknown'
def _suggest_extractor(self, kind: str, mime_type: str) -> Optional[str]:
extractors = {'pdf': 'pdf_text', 'image': 'ocr+caption', 'audio': 'transcribe', 'video': 'transcribe+scenes', 'document': 'office_text', 'text': 'read', 'code': 'read+syntax'}
return extractors.get(kind)
def _extract_hints(self, file_path: Path, kind: str, mime_type: str, size: int) -> Dict:
hints = {}
if kind == 'text' or kind == 'code':
hints['language'] = self._guess_language(file_path)
if size < 1024 * 1024:
hints['lines'] = self._count_lines(file_path)
if kind == 'pdf':
hints['page_count'] = self._get_pdf_pages(file_path)
if kind in ['audio', 'video']:
hints['duration'] = self._get_media_duration(file_path)
if kind == 'image':
hints['has_exif'] = self._has_exif(file_path)
hints['dimensions'] = self._get_image_dimensions(file_path)
return hints
def _guess_language(self, file_path: Path) -> Optional[str]:
lang_map = {'.py': 'python', '.js': 'javascript', '.ts': 'typescript', '.java': 'java', '.go': 'go', '.rs': 'rust', '.c': 'c', '.cpp': 'cpp', '.cs': 'csharp', '.rb': 'ruby', '.php': 'php'}
return lang_map.get(file_path.suffix.lower())
def _count_lines(self, file_path: Path) -> Optional[int]:
try:
with open(file_path, 'rb') as f:
return sum((1 for _ in f))
except:
return None
def _get_pdf_pages(self, file_path: Path) -> Optional[int]:
try:
import PyPDF2
with open(file_path, 'rb') as f:
pdf = PyPDF2.PdfReader(f)
return len(pdf.pages)
except:
return None
def _get_media_duration(self, file_path: Path) -> Optional[float]:
try:
import ffmpeg
probe = ffmpeg.probe(str(file_path))
return float(probe['format']['duration'])
except:
return None
def _has_exif(self, file_path: Path) -> bool:
try:
from PIL import Image
img = Image.open(file_path)
return hasattr(img, '_getexif') and img._getexif() is not None
except:
return False
def _get_image_dimensions(self, file_path: Path) -> Optional[Tuple[int, int]]:
try:
from PIL import Image
with Image.open(file_path) as img:
return img.size
except:
return None


@@ -0,0 +1,21 @@
"""Deduplication package exports"""
from .chunker import (
RabinChunker,
SimpleChunker,
hash_chunk,
hash_file,
compute_file_signature
)
from .store import HashStore, MemoryHashStore
from .engine import DeduplicationEngine
__all__ = [
'RabinChunker',
'SimpleChunker',
'hash_chunk',
'hash_file',
'compute_file_signature',
'HashStore',
'MemoryHashStore',
'DeduplicationEngine',
]


@@ -0,0 +1,241 @@
"""Rabin fingerprint chunker for content-defined chunking"""
import hashlib
from pathlib import Path
from typing import Iterator, Optional
class RabinChunker:
"""Content-defined chunking using Rabin fingerprinting
Uses a rolling hash to identify chunk boundaries based on content,
allowing for efficient deduplication even when data is modified.
"""
def __init__(
self,
avg_chunk_size: int = 8192,
min_chunk_size: Optional[int] = None,
max_chunk_size: Optional[int] = None,
window_size: int = 48
):
"""Initialize Rabin chunker
Args:
avg_chunk_size: Target average chunk size in bytes
min_chunk_size: Minimum chunk size (default: avg_chunk_size // 4)
max_chunk_size: Maximum chunk size (default: avg_chunk_size * 8)
window_size: Rolling hash window size
"""
self.avg_chunk_size = avg_chunk_size
self.min_chunk_size = min_chunk_size or (avg_chunk_size // 4)
self.max_chunk_size = max_chunk_size or (avg_chunk_size * 8)
self.window_size = window_size
# Calculate mask for boundary detection
# For avg_chunk_size, we want boundaries at 1/avg_chunk_size probability
bits = 0
size = avg_chunk_size
while size > 1:
bits += 1
size >>= 1
self.mask = (1 << bits) - 1
# Polynomial for rolling hash (prime number)
self.poly = 0x3DA3358B4DC173
def chunk_file(self, file_path: Path, chunk_size: Optional[int] = None) -> Iterator[bytes]:
"""Chunk a file using Rabin fingerprinting
Args:
file_path: Path to file to chunk
chunk_size: If provided, use fixed-size chunking instead
Yields:
Chunk data as bytes
"""
if chunk_size:
# Use fixed-size chunking
yield from self._chunk_fixed(file_path, chunk_size)
else:
# Use content-defined chunking
yield from self._chunk_rabin(file_path)
def _chunk_fixed(self, file_path: Path, chunk_size: int) -> Iterator[bytes]:
"""Fixed-size chunking
Args:
file_path: Path to file
chunk_size: Chunk size in bytes
Yields:
Fixed-size chunks
"""
with open(file_path, 'rb') as f:
while True:
chunk = f.read(chunk_size)
if not chunk:
break
yield chunk
def _chunk_rabin(self, file_path: Path) -> Iterator[bytes]:
"""Content-defined chunking using Rabin fingerprinting
Args:
file_path: Path to file
Yields:
Variable-size chunks based on content
"""
with open(file_path, 'rb') as f:
chunk_data = bytearray()
window = bytearray()
hash_value = 0
while True:
byte = f.read(1)
if not byte:
# End of file - yield remaining data
if chunk_data:
yield bytes(chunk_data)
break
chunk_data.extend(byte)
window.extend(byte)
# Maintain window size
if len(window) > self.window_size:
window.pop(0)
# Update rolling hash
hash_value = self._rolling_hash(window)
# Check if we should create a boundary
should_break = (
len(chunk_data) >= self.min_chunk_size and
(
(hash_value & self.mask) == 0 or
len(chunk_data) >= self.max_chunk_size
)
)
if should_break:
yield bytes(chunk_data)
chunk_data = bytearray()
window = bytearray()
hash_value = 0
def _rolling_hash(self, window: bytearray) -> int:
"""Calculate rolling hash for window
Args:
window: Byte window
Returns:
Hash value
"""
hash_value = 0
for byte in window:
hash_value = ((hash_value << 1) + byte) & 0xFFFFFFFFFFFFFFFF
return hash_value
class SimpleChunker:
"""Simple fixed-size chunker for comparison"""
def __init__(self, chunk_size: int = 8192):
"""Initialize simple chunker
Args:
chunk_size: Fixed chunk size in bytes
"""
self.chunk_size = chunk_size
def chunk_file(self, file_path: Path) -> Iterator[bytes]:
"""Chunk file into fixed-size pieces
Args:
file_path: Path to file
Yields:
Fixed-size chunks
"""
with open(file_path, 'rb') as f:
while True:
chunk = f.read(self.chunk_size)
if not chunk:
break
yield chunk
def hash_chunk(chunk: bytes, algorithm: str = 'sha256') -> str:
"""Hash a chunk of data
Args:
chunk: Chunk data
algorithm: Hash algorithm (default: sha256)
Returns:
Hex digest of hash
"""
hasher = hashlib.new(algorithm)
hasher.update(chunk)
return hasher.hexdigest()
def hash_file(file_path: Path, algorithm: str = 'sha256', chunk_size: int = 65536) -> str:
"""Hash entire file
Args:
file_path: Path to file
algorithm: Hash algorithm (default: sha256)
chunk_size: Size of chunks to read
Returns:
Hex digest of file hash
"""
hasher = hashlib.new(algorithm)
with open(file_path, 'rb') as f:
while True:
chunk = f.read(chunk_size)
if not chunk:
break
hasher.update(chunk)
return hasher.hexdigest()
def compute_file_signature(
file_path: Path,
use_rabin: bool = True,
avg_chunk_size: int = 8192
) -> tuple[str, list[str]]:
"""Compute file signature with chunk hashes
Args:
file_path: Path to file
use_rabin: Whether to use Rabin chunking (vs fixed-size)
avg_chunk_size: Average chunk size for Rabin or fixed size
Returns:
Tuple of (file_hash, list of chunk hashes)
"""
if use_rabin:
chunker = RabinChunker(avg_chunk_size=avg_chunk_size)
else:
chunker = SimpleChunker(chunk_size=avg_chunk_size)
chunk_hashes = []
file_hasher = hashlib.sha256()
for chunk in chunker.chunk_file(file_path):
# Hash individual chunk
chunk_hash = hash_chunk(chunk)
chunk_hashes.append(chunk_hash)
# Update file hash
file_hasher.update(chunk)
file_hash = file_hasher.hexdigest()
return file_hash, chunk_hashes

353
app/deduplication/engine.py Normal file

@@ -0,0 +1,353 @@
"""Deduplication engine"""
from pathlib import Path
from typing import Optional, Callable
from concurrent.futures import ThreadPoolExecutor, as_completed
import psycopg2
from .chunker import compute_file_signature, hash_file
from .store import HashStore
from ..shared.models import FileRecord, ProcessingStats
from ..shared.config import DatabaseConfig, ProcessingConfig
from ..shared.logger import ProgressLogger
class DeduplicationEngine:
"""Engine for deduplicating files"""
def __init__(
self,
db_config: DatabaseConfig,
processing_config: ProcessingConfig,
logger: ProgressLogger
):
"""Initialize deduplication engine
Args:
db_config: Database configuration
processing_config: Processing configuration
logger: Progress logger
"""
self.db_config = db_config
self.processing_config = processing_config
self.logger = logger
self.hash_store = HashStore(db_config)
self._connection = None
def _get_connection(self):
"""Get or create database connection"""
if self._connection is None or self._connection.closed:
self._connection = psycopg2.connect(
host=self.db_config.host,
port=self.db_config.port,
database=self.db_config.database,
user=self.db_config.user,
password=self.db_config.password
)
return self._connection
def deduplicate_all(
self,
disk: Optional[str] = None,
use_chunks: bool = True,
progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
) -> ProcessingStats:
"""Deduplicate all files in database
Args:
disk: Optional disk filter
use_chunks: Whether to use chunk-level deduplication
progress_callback: Optional callback for progress updates
Returns:
ProcessingStats with deduplication statistics
"""
self.logger.section("Starting Deduplication")
conn = self._get_connection()
cursor = conn.cursor()
# Get files without checksums
if disk:
cursor.execute("""
SELECT path, size
FROM files
WHERE disk_label = %s AND checksum IS NULL
ORDER BY size DESC
""", (disk,))
else:
cursor.execute("""
SELECT path, size
FROM files
WHERE checksum IS NULL
ORDER BY size DESC
""")
files_to_process = cursor.fetchall()
total_files = len(files_to_process)
self.logger.info(f"Found {total_files} files to process")
stats = ProcessingStats()
# Process files with thread pool
with ThreadPoolExecutor(max_workers=self.processing_config.parallel_workers) as executor:
futures = {}
for path_str, size in files_to_process:
path = Path(path_str)
future = executor.submit(self._process_file, path, use_chunks)
futures[future] = (path, size)
# Process completed futures
for future in as_completed(futures):
path, size = futures[future]
try:
checksum, duplicate_of = future.result()
if checksum:
# Update database
cursor.execute("""
UPDATE files
SET checksum = %s, duplicate_of = %s
WHERE path = %s
""", (checksum, duplicate_of, str(path)))
stats.files_succeeded += 1
stats.bytes_processed += size
stats.files_processed += 1
# Commit periodically
if stats.files_processed % self.processing_config.commit_interval == 0:
conn.commit()
# Progress callback
if progress_callback:
progress_callback(stats.files_processed, total_files, stats)
# Log progress
self.logger.progress(
stats.files_processed,
total_files,
prefix="Files processed",
bytes_processed=stats.bytes_processed,
elapsed_seconds=stats.elapsed_seconds
)
except Exception as e:
self.logger.warning(f"Failed to process {path}: {e}")
stats.files_failed += 1
stats.files_processed += 1
# Final commit
conn.commit()
cursor.close()
self.logger.info(
f"Deduplication complete: {stats.files_succeeded}/{total_files} files, "
f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
)
return stats
def _process_file(
self,
path: Path,
use_chunks: bool
) -> tuple[Optional[str], Optional[str]]:
"""Process a single file for deduplication
Args:
path: Path to file
use_chunks: Whether to use chunk-level deduplication
Returns:
Tuple of (checksum, duplicate_of_path)
"""
if not path.exists():
return None, None
try:
if use_chunks:
# Compute file signature with chunks
checksum, chunk_hashes = compute_file_signature(
path,
use_rabin=True,
avg_chunk_size=self.processing_config.chunk_size
)
else:
# Just compute file hash
checksum = hash_file(
path,
algorithm=self.processing_config.hash_algorithm
)
chunk_hashes = None
# Check if hash exists
if self.hash_store.exists(checksum):
# Duplicate found
canonical_path = self.hash_store.get_canonical(checksum)
return checksum, canonical_path
else:
# New unique file
size = path.stat().st_size
self.hash_store.store_canonical(
checksum,
path,
size,
chunk_hashes
)
return checksum, None
except Exception as e:
self.logger.debug(f"Error processing {path}: {e}")
raise
def find_duplicates(
self,
disk: Optional[str] = None
) -> dict[str, list[str]]:
"""Find all duplicate files
Args:
disk: Optional disk filter
Returns:
Dictionary mapping canonical path to list of duplicate paths
"""
self.logger.subsection("Finding Duplicates")
conn = self._get_connection()
cursor = conn.cursor()
# Query for duplicates
if disk:
cursor.execute("""
SELECT checksum, array_agg(path ORDER BY path) as paths
FROM files
WHERE disk_label = %s AND checksum IS NOT NULL
GROUP BY checksum
HAVING COUNT(*) > 1
""", (disk,))
else:
cursor.execute("""
SELECT checksum, array_agg(path ORDER BY path) as paths
FROM files
WHERE checksum IS NOT NULL
GROUP BY checksum
HAVING COUNT(*) > 1
""")
duplicates = {}
for checksum, paths in cursor.fetchall():
canonical = paths[0]
duplicates[canonical] = paths[1:]
cursor.close()
self.logger.info(f"Found {len(duplicates)} sets of duplicates")
return duplicates
def get_deduplication_stats(self) -> dict:
"""Get deduplication statistics
Returns:
Dictionary with statistics
"""
conn = self._get_connection()
cursor = conn.cursor()
stats = {}
# Total files
cursor.execute("SELECT COUNT(*) FROM files WHERE checksum IS NOT NULL")
stats['total_files'] = cursor.fetchone()[0]
# Unique files
cursor.execute("SELECT COUNT(DISTINCT checksum) FROM files WHERE checksum IS NOT NULL")
stats['unique_files'] = cursor.fetchone()[0]
# Duplicate files
stats['duplicate_files'] = stats['total_files'] - stats['unique_files']
# Total size
cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE checksum IS NOT NULL")
stats['total_size'] = cursor.fetchone()[0]
# Unique size
cursor.execute("""
SELECT COALESCE(SUM(size), 0)
FROM (
SELECT DISTINCT ON (checksum) size
FROM files
WHERE checksum IS NOT NULL
) AS unique_files
""")
stats['unique_size'] = cursor.fetchone()[0]
# Wasted space
stats['wasted_space'] = stats['total_size'] - stats['unique_size']
# Deduplication ratio
if stats['total_size'] > 0:
stats['dedup_ratio'] = stats['unique_size'] / stats['total_size']
else:
stats['dedup_ratio'] = 1.0
# Space saved percentage
if stats['total_size'] > 0:
stats['space_saved_percent'] = (stats['wasted_space'] / stats['total_size']) * 100
else:
stats['space_saved_percent'] = 0.0
cursor.close()
return stats
def mark_canonical_files(self) -> int:
"""Mark canonical (first occurrence) files in database
Returns:
Number of canonical files marked
"""
self.logger.subsection("Marking Canonical Files")
conn = self._get_connection()
cursor = conn.cursor()
# Find first occurrence of each checksum and mark as canonical
cursor.execute("""
WITH canonical AS (
SELECT DISTINCT ON (checksum) path, checksum
FROM files
WHERE checksum IS NOT NULL
ORDER BY checksum, path
)
UPDATE files
SET duplicate_of = NULL
WHERE path IN (SELECT path FROM canonical)
""")
count = cursor.rowcount
conn.commit()
cursor.close()
self.logger.info(f"Marked {count} canonical files")
return count
def close(self):
"""Close connections"""
self.hash_store.close()
if self._connection and not self._connection.closed:
self._connection.close()
def __enter__(self):
"""Context manager entry"""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit"""
self.close()

412
app/deduplication/store.py Normal file

@@ -0,0 +1,412 @@
"""Hash store for deduplication with optional Redis support"""
from typing import Optional, Dict, Set
from pathlib import Path
import psycopg2
from psycopg2.extras import execute_batch
from ..shared.config import DatabaseConfig
class HashStore:
"""PostgreSQL-based hash store for deduplication"""
def __init__(self, db_config: DatabaseConfig):
"""Initialize hash store
Args:
db_config: Database configuration
"""
self.db_config = db_config
self._connection = None
def _get_connection(self):
"""Get or create database connection"""
if self._connection is None or self._connection.closed:
self._connection = psycopg2.connect(
host=self.db_config.host,
port=self.db_config.port,
database=self.db_config.database,
user=self.db_config.user,
password=self.db_config.password
)
return self._connection
def _ensure_tables(self):
"""Ensure hash store tables exist"""
conn = self._get_connection()
cursor = conn.cursor()
# Create hashes table for file-level deduplication
cursor.execute("""
CREATE TABLE IF NOT EXISTS file_hashes (
checksum TEXT PRIMARY KEY,
canonical_path TEXT NOT NULL,
size BIGINT NOT NULL,
first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
ref_count INTEGER DEFAULT 1
)
""")
# Create chunk hashes table for chunk-level deduplication
cursor.execute("""
CREATE TABLE IF NOT EXISTS chunk_hashes (
chunk_hash TEXT PRIMARY KEY,
size INTEGER NOT NULL,
first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
ref_count INTEGER DEFAULT 1
)
""")
# Create file-chunk mapping table
cursor.execute("""
CREATE TABLE IF NOT EXISTS file_chunks (
id SERIAL PRIMARY KEY,
file_checksum TEXT NOT NULL,
chunk_hash TEXT NOT NULL,
chunk_index INTEGER NOT NULL,
FOREIGN KEY (file_checksum) REFERENCES file_hashes(checksum),
FOREIGN KEY (chunk_hash) REFERENCES chunk_hashes(chunk_hash),
UNIQUE (file_checksum, chunk_index)
)
""")
# Create indexes
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_file_chunks_file
ON file_chunks(file_checksum)
""")
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_file_chunks_chunk
ON file_chunks(chunk_hash)
""")
conn.commit()
cursor.close()
def exists(self, checksum: str) -> bool:
"""Check if hash exists in store
Args:
checksum: File hash to check
Returns:
True if hash exists
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute(
"SELECT 1 FROM file_hashes WHERE checksum = %s LIMIT 1",
(checksum,)
)
exists = cursor.fetchone() is not None
cursor.close()
return exists
def get_canonical(self, checksum: str) -> Optional[str]:
"""Get canonical path for a hash
Args:
checksum: File hash
Returns:
Canonical file path or None if not found
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute(
"SELECT canonical_path FROM file_hashes WHERE checksum = %s",
(checksum,)
)
result = cursor.fetchone()
cursor.close()
return result[0] if result else None
def store_canonical(
self,
checksum: str,
path: Path,
size: int,
chunk_hashes: Optional[list[str]] = None
) -> None:
"""Store canonical reference for a hash
Args:
checksum: File hash
path: Canonical file path
size: File size in bytes
chunk_hashes: Optional list of chunk hashes
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
try:
# Store file hash
cursor.execute("""
INSERT INTO file_hashes (checksum, canonical_path, size)
VALUES (%s, %s, %s)
ON CONFLICT (checksum) DO UPDATE SET
ref_count = file_hashes.ref_count + 1
""", (checksum, str(path), size))
# Store chunk hashes if provided
if chunk_hashes:
# Insert chunk hashes
chunk_data = [(chunk_hash, 0) for chunk_hash in chunk_hashes]
execute_batch(cursor, """
INSERT INTO chunk_hashes (chunk_hash, size)
VALUES (%s, %s)
ON CONFLICT (chunk_hash) DO UPDATE SET
ref_count = chunk_hashes.ref_count + 1
""", chunk_data, page_size=1000)
# Create file-chunk mappings
mapping_data = [
(checksum, chunk_hash, idx)
for idx, chunk_hash in enumerate(chunk_hashes)
]
execute_batch(cursor, """
INSERT INTO file_chunks (file_checksum, chunk_hash, chunk_index)
VALUES (%s, %s, %s)
ON CONFLICT (file_checksum, chunk_index) DO NOTHING
""", mapping_data, page_size=1000)
conn.commit()
except Exception as e:
conn.rollback()
raise
finally:
cursor.close()
def get_chunk_hashes(self, checksum: str) -> list[str]:
"""Get chunk hashes for a file
Args:
checksum: File hash
Returns:
List of chunk hashes in order
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute("""
SELECT chunk_hash
FROM file_chunks
WHERE file_checksum = %s
ORDER BY chunk_index
""", (checksum,))
chunk_hashes = [row[0] for row in cursor.fetchall()]
cursor.close()
return chunk_hashes
def get_duplicates(self) -> Dict[str, list[str]]:
"""Get all duplicate file groups
Returns:
Dictionary mapping canonical path to list of duplicate paths
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
# Get all files with their hashes
cursor.execute("""
SELECT f.path, f.checksum
FROM files f
WHERE f.checksum IS NOT NULL
""")
# Group by checksum
hash_to_paths: Dict[str, list[str]] = {}
for path, checksum in cursor.fetchall():
if checksum not in hash_to_paths:
hash_to_paths[checksum] = []
hash_to_paths[checksum].append(path)
cursor.close()
# Filter to only duplicates (more than one file)
duplicates = {
paths[0]: paths[1:]
for checksum, paths in hash_to_paths.items()
if len(paths) > 1
}
return duplicates
def get_stats(self) -> Dict[str, int]:
"""Get hash store statistics
Returns:
Dictionary with statistics
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
stats = {}
# Count unique file hashes
cursor.execute("SELECT COUNT(*) FROM file_hashes")
stats['unique_files'] = cursor.fetchone()[0]
# Count unique chunk hashes
cursor.execute("SELECT COUNT(*) FROM chunk_hashes")
stats['unique_chunks'] = cursor.fetchone()[0]
# Count total references
cursor.execute("SELECT COALESCE(SUM(ref_count), 0) FROM file_hashes")
stats['total_file_refs'] = cursor.fetchone()[0]
# Count total chunk references
cursor.execute("SELECT COALESCE(SUM(ref_count), 0) FROM chunk_hashes")
stats['total_chunk_refs'] = cursor.fetchone()[0]
# Calculate deduplication ratio
if stats['total_file_refs'] > 0:
stats['dedup_ratio'] = stats['unique_files'] / stats['total_file_refs']
else:
stats['dedup_ratio'] = 1.0
cursor.close()
return stats
def find_similar_files(self, checksum: str, threshold: float = 0.8) -> list[tuple[str, float]]:
"""Find files similar to given hash based on chunk overlap
Args:
checksum: File hash to compare
threshold: Similarity threshold (0.0 to 1.0)
Returns:
List of tuples (other_checksum, similarity_score)
"""
self._ensure_tables()
conn = self._get_connection()
cursor = conn.cursor()
# Get chunks for the target file
target_chunks = set(self.get_chunk_hashes(checksum))
if not target_chunks:
cursor.close()
return []
# Find files sharing chunks
cursor.execute("""
SELECT DISTINCT fc.file_checksum
FROM file_chunks fc
WHERE fc.chunk_hash = ANY(%s)
AND fc.file_checksum != %s
""", (list(target_chunks), checksum))
similar_files = []
for (other_checksum,) in cursor.fetchall():
other_chunks = set(self.get_chunk_hashes(other_checksum))
# Calculate Jaccard similarity
intersection = len(target_chunks & other_chunks)
union = len(target_chunks | other_chunks)
if union > 0:
similarity = intersection / union
if similarity >= threshold:
similar_files.append((other_checksum, similarity))
cursor.close()
# Sort by similarity descending
similar_files.sort(key=lambda x: x[1], reverse=True)
return similar_files
def close(self):
"""Close database connection"""
if self._connection and not self._connection.closed:
self._connection.close()
def __enter__(self):
"""Context manager entry"""
self._ensure_tables()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit"""
self.close()
class MemoryHashStore:
"""In-memory hash store for testing and small datasets"""
def __init__(self):
"""Initialize in-memory hash store"""
self.hashes: Dict[str, tuple[str, int]] = {}
self.chunks: Dict[str, int] = {}
self.file_chunks: Dict[str, list[str]] = {}
def exists(self, checksum: str) -> bool:
"""Check if hash exists"""
return checksum in self.hashes
def get_canonical(self, checksum: str) -> Optional[str]:
"""Get canonical path"""
return self.hashes.get(checksum, (None, 0))[0]
def store_canonical(
self,
checksum: str,
path: Path,
size: int,
chunk_hashes: Optional[list[str]] = None
) -> None:
"""Store canonical reference"""
self.hashes[checksum] = (str(path), size)
if chunk_hashes:
self.file_chunks[checksum] = chunk_hashes
for chunk_hash in chunk_hashes:
self.chunks[chunk_hash] = self.chunks.get(chunk_hash, 0) + 1
def get_chunk_hashes(self, checksum: str) -> list[str]:
"""Get chunk hashes"""
return self.file_chunks.get(checksum, [])
def get_stats(self) -> Dict[str, int]:
"""Get statistics"""
return {
'unique_files': len(self.hashes),
'unique_chunks': len(self.chunks),
'total_file_refs': len(self.hashes),
'total_chunk_refs': sum(self.chunks.values()),
'dedup_ratio': 1.0
}
def close(self):
"""No-op for compatibility"""
pass
def __enter__(self):
"""Context manager entry"""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit"""
pass

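A minimal usage sketch of the store above, using the in-memory variant so no PostgreSQL instance is required; the import path is assumed from this commit's package layout:

```python
from pathlib import Path

from app.deduplication.store import MemoryHashStore  # path assumed from this commit

store = MemoryHashStore()

# First occurrence of a checksum becomes the canonical copy.
store.store_canonical(
    checksum="abc123",
    path=Path("/mnt/organized/docs/report.pdf"),
    size=1_048_576,
    chunk_hashes=["c1", "c2", "c3"],
)

# A later file with the same checksum is a duplicate of the canonical path.
if store.exists("abc123"):
    print("duplicate of", store.get_canonical("abc123"))

print(store.get_chunk_hashes("abc123"))  # ['c1', 'c2', 'c3']
print(store.get_stats())                 # unique file/chunk counts
```

The PostgreSQL-backed `HashStore` exposes the same calls plus `get_duplicates()` and `find_similar_files()`; only the constructor differs (it takes a `DatabaseConfig`).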
5
app/discovery/__init__.py Normal file
View File

@@ -0,0 +1,5 @@
from .scanner import FileScanner, FilteredScanner
from .system import SystemAPI
from .engine import DiscoveryEngine
from ._protocols import FileMeta, MountInfo, DiskInfo, IFileScanner, ISystemAPI
__all__ = ['FileScanner', 'FilteredScanner', 'SystemAPI', 'DiscoveryEngine', 'FileMeta', 'MountInfo', 'DiskInfo', 'IFileScanner', 'ISystemAPI']

37
app/discovery/_protocols.py Normal file
View File

@@ -0,0 +1,37 @@
from typing import Iterator, Protocol, Any
from pathlib import Path
from dataclasses import dataclass
@dataclass
class FileMeta:
path: Path
size: int
modified_time: float
created_time: float
@dataclass
class MountInfo:
device: str
mount_point: str
fs_type: str
options: str
@dataclass
class DiskInfo:
device: str
model: str
size: int
serial: str
class IFileScanner(Protocol):
def scan(self, root: Path) -> Iterator[FileMeta]:
...
class ISystemAPI(Protocol):
def query_mounts(self) -> list[MountInfo]:
...
def query_nvmes(self) -> list[DiskInfo]:
...

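Because `IFileScanner` and `ISystemAPI` are `typing.Protocol` classes, any object with matching method signatures satisfies them structurally. A small hypothetical sketch (`InMemoryScanner` is illustrative only; the import path is assumed from the package layout above):

```python
from pathlib import Path
from typing import Iterator

# Import path assumed: app/discovery/__init__.py re-exports the protocol types.
from app.discovery import FileMeta, IFileScanner


class InMemoryScanner:
    """Hypothetical scanner for tests: yields pre-baked entries."""

    def __init__(self, entries: list[tuple[str, int]]):
        self.entries = entries

    def scan(self, root: Path) -> Iterator[FileMeta]:
        for name, size in self.entries:
            yield FileMeta(path=root / name, size=size,
                           modified_time=0.0, created_time=0.0)


# No inheritance needed: the annotation documents structural conformance.
scanner: IFileScanner = InMemoryScanner([("a.txt", 10), ("b.bin", 2048)])
for meta in scanner.scan(Path("/tmp")):
    print(meta.path, meta.size)
```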
133
app/discovery/engine.py Normal file
View File

@@ -0,0 +1,133 @@
from pathlib import Path
from typing import Optional, Callable
from datetime import datetime
import psycopg2
from psycopg2.extras import execute_batch
from .scanner import FileScanner
from .system import SystemAPI
from ._protocols import FileMeta
from ..shared.models import FileRecord, DiskInfo, ProcessingStats
from ..shared.config import DatabaseConfig
from ..shared.logger import ProgressLogger
class DiscoveryEngine:
def __init__(self, db_config: DatabaseConfig, logger: ProgressLogger, batch_size: int=1000):
self.db_config = db_config
self.logger = logger
self.batch_size = batch_size
self.system_api = SystemAPI()
self._connection = None
def _get_connection(self):
if self._connection is None or self._connection.closed:
self._connection = psycopg2.connect(host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password)
return self._connection
def _ensure_tables(self):
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute("\n CREATE TABLE IF NOT EXISTS files (\n id SERIAL PRIMARY KEY,\n path TEXT NOT NULL UNIQUE,\n size BIGINT NOT NULL,\n modified_time DOUBLE PRECISION NOT NULL,\n created_time DOUBLE PRECISION NOT NULL,\n disk_label TEXT NOT NULL,\n checksum TEXT,\n status TEXT DEFAULT 'indexed',\n category TEXT,\n duplicate_of TEXT,\n discovered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n )\n ")
cursor.execute('\n CREATE INDEX IF NOT EXISTS idx_files_path ON files(path)\n ')
cursor.execute('\n CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label)\n ')
cursor.execute('\n CREATE INDEX IF NOT EXISTS idx_files_checksum ON files(checksum)\n ')
conn.commit()
cursor.close()
def discover_path(self, root: Path, scanner: Optional[FileScanner]=None, progress_callback: Optional[Callable[[int, int, ProcessingStats], None]]=None) -> ProcessingStats:
self.logger.section(f'Discovering: {root}')
self._ensure_tables()
if scanner is None:
scanner = FileScanner(error_handler=lambda e, p: self.logger.warning(f'Error scanning {p}: {e}'))
disk = self.system_api.get_disk_for_path(root)
if disk is None:
disk = str(root)
stats = ProcessingStats()
batch = []
conn = self._get_connection()
cursor = conn.cursor()
try:
for file_meta in scanner.scan(root):
record = FileRecord(path=file_meta.path, size=file_meta.size, modified_time=file_meta.modified_time, created_time=file_meta.created_time, disk_label=disk)
batch.append(record)
stats.files_processed += 1
stats.bytes_processed += record.size
if len(batch) >= self.batch_size:
self._insert_batch(cursor, batch)
conn.commit()
batch.clear()
if progress_callback:
progress_callback(stats.files_processed, 0, stats)
if stats.files_processed % (self.batch_size * 10) == 0:
self.logger.progress(stats.files_processed, stats.files_processed, prefix='Files discovered', bytes_processed=stats.bytes_processed, elapsed_seconds=stats.elapsed_seconds)
if batch:
self._insert_batch(cursor, batch)
conn.commit()
stats.files_succeeded = stats.files_processed
except Exception as e:
conn.rollback()
self.logger.error(f'Discovery failed: {e}')
raise
finally:
cursor.close()
self.logger.info(f'Discovery complete: {stats.files_processed} files, {stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s')
return stats
def _insert_batch(self, cursor, batch: list[FileRecord]):
query = '\n INSERT INTO files (path, size, modified_time, created_time, disk_label, checksum, status, category, duplicate_of)\n VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)\n ON CONFLICT (path) DO UPDATE SET\n size = EXCLUDED.size,\n modified_time = EXCLUDED.modified_time,\n updated_at = CURRENT_TIMESTAMP\n '
data = [(str(record.path), record.size, record.modified_time, record.created_time, record.disk_label, record.checksum, record.status, record.category, record.duplicate_of) for record in batch]
execute_batch(cursor, query, data, page_size=self.batch_size)
def get_disk_info(self) -> list[DiskInfo]:
self.logger.subsection('Querying disk information')
disks = []
for disk_info in self.system_api.query_nvmes():
mount_point = None
fs_type = 'unknown'
for mount in self.system_api.query_mounts():
if mount.device == disk_info.device:
mount_point = Path(mount.mount_point)
fs_type = mount.fs_type
break
if mount_point:
total, used, free = self.system_api.get_disk_usage(mount_point)
else:
total = disk_info.size
used = 0
free = disk_info.size
disk = DiskInfo(name=disk_info.device, device=disk_info.device, mount_point=mount_point or Path('/'), total_size=total, used_size=used, free_size=free, fs_type=fs_type)
disks.append(disk)
self.logger.info(f' {disk.name}: {disk.usage_percent:.1f}% used ({disk.used_size:,} / {disk.total_size:,} bytes)')
return disks
def get_file_count(self, disk: Optional[str]=None) -> int:
conn = self._get_connection()
cursor = conn.cursor()
if disk:
cursor.execute('SELECT COUNT(*) FROM files WHERE disk_label = %s', (disk,))
else:
cursor.execute('SELECT COUNT(*) FROM files')
count = cursor.fetchone()[0]
cursor.close()
return count
def get_total_size(self, disk: Optional[str]=None) -> int:
conn = self._get_connection()
cursor = conn.cursor()
if disk:
cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files WHERE disk_label = %s', (disk,))
else:
cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files')
total = cursor.fetchone()[0]
cursor.close()
return total
def close(self):
if self._connection and (not self._connection.closed):
self._connection.close()
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()

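A hedged usage sketch of `DiscoveryEngine`. `DatabaseConfig` and `ProgressLogger` live in `app.shared` and are not shown in this excerpt, so the constructor arguments below are assumptions based on how the engine uses them:

```python
from pathlib import Path

# Assumed constructors: DatabaseConfig holding the connection fields the engine
# reads, ProgressLogger taking no arguments.
from app.discovery import DiscoveryEngine
from app.shared.config import DatabaseConfig
from app.shared.logger import ProgressLogger

db = DatabaseConfig(host="localhost", port=5432, database="disk_reorganizer_db",
                    user="disk_reorg_user", password="...")
log = ProgressLogger()

with DiscoveryEngine(db, log, batch_size=1000) as engine:
    stats = engine.discover_path(Path("/media/mike/DISK1"))
    print(f"{stats.files_processed:,} files, {stats.bytes_processed:,} bytes indexed")
    print("rows in files table:", engine.get_file_count())
```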
112
app/discovery/scanner.py Normal file
View File

@@ -0,0 +1,112 @@
import os
from pathlib import Path
from typing import Iterator, Optional, Callable
from datetime import datetime
from ._protocols import FileMeta
class FileScanner:
def __init__(self, follow_symlinks: bool=False, skip_hidden: bool=True, error_handler: Optional[Callable[[Exception, Path], None]]=None):
self.follow_symlinks = follow_symlinks
self.skip_hidden = skip_hidden
self.error_handler = error_handler
self._files_scanned = 0
self._bytes_scanned = 0
self._errors = 0
def scan(self, root: Path) -> Iterator[FileMeta]:
if not root.exists():
error = FileNotFoundError(f'Path does not exist: {root}')
if self.error_handler:
self.error_handler(error, root)
else:
raise error
return
if not root.is_dir():
try:
yield self._get_file_meta(root)
except Exception as e:
self._errors += 1
if self.error_handler:
self.error_handler(e, root)
else:
raise
return
for dirpath, dirnames, filenames in os.walk(root, followlinks=self.follow_symlinks):
current_dir = Path(dirpath)
if self.skip_hidden:
dirnames[:] = [d for d in dirnames if not d.startswith('.')]
for filename in filenames:
if self.skip_hidden and filename.startswith('.'):
continue
file_path = current_dir / filename
try:
if file_path.is_symlink() and (not file_path.exists()):
continue
meta = self._get_file_meta(file_path)
self._files_scanned += 1
self._bytes_scanned += meta.size
yield meta
except PermissionError as e:
self._errors += 1
if self.error_handler:
self.error_handler(e, file_path)
continue
except Exception as e:
self._errors += 1
if self.error_handler:
self.error_handler(e, file_path)
continue
def _get_file_meta(self, path: Path) -> FileMeta:
stat = path.stat()
created_time = stat.st_ctime
if hasattr(stat, 'st_birthtime'):
created_time = stat.st_birthtime
return FileMeta(path=path, size=stat.st_size, modified_time=stat.st_mtime, created_time=created_time)
@property
def files_scanned(self) -> int:
return self._files_scanned
@property
def bytes_scanned(self) -> int:
return self._bytes_scanned
@property
def errors(self) -> int:
return self._errors
def reset_stats(self) -> None:
self._files_scanned = 0
self._bytes_scanned = 0
self._errors = 0
class FilteredScanner(FileScanner):
def __init__(self, min_size: Optional[int]=None, max_size: Optional[int]=None, extensions: Optional[list[str]]=None, exclude_patterns: Optional[list[str]]=None, **kwargs):
super().__init__(**kwargs)
self.min_size = min_size
self.max_size = max_size
self.extensions = {ext.lower() for ext in extensions} if extensions else None
self.exclude_patterns = exclude_patterns or []
def scan(self, root: Path) -> Iterator[FileMeta]:
for meta in super().scan(root):
if self.min_size is not None and meta.size < self.min_size:
continue
if self.max_size is not None and meta.size > self.max_size:
continue
if self.extensions is not None:
if meta.path.suffix.lower() not in self.extensions:
continue
if self._should_exclude(meta.path):
continue
yield meta
def _should_exclude(self, path: Path) -> bool:
path_str = str(path)
for pattern in self.exclude_patterns:
if pattern in path_str:
return True
return False

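A short usage sketch of `FilteredScanner`, combining the size, extension, and exclude-pattern filters with a non-fatal error handler (the scan root and import layout are assumptions):

```python
from pathlib import Path

from app.discovery import FilteredScanner  # import path assumed from this commit

# Scan only sizeable source/document files, skip common build output, and log
# (rather than raise) permission errors.
scanner = FilteredScanner(
    min_size=1024,                        # ignore files under 1 KiB
    extensions=[".py", ".md", ".pdf"],
    exclude_patterns=["node_modules", "__pycache__", ".git"],
    error_handler=lambda exc, path: print(f"skip {path}: {exc}"),
)

for meta in scanner.scan(Path("/media/mike/DISK1")):
    print(meta.path, meta.size)

print(scanner.files_scanned, "files,", scanner.bytes_scanned, "bytes,",
      scanner.errors, "errors")
```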
119
app/discovery/system.py Normal file
View File

@@ -0,0 +1,119 @@
import os
import subprocess
from pathlib import Path
from typing import Optional
import psutil
from ._protocols import MountInfo, DiskInfo
class SystemAPI:
def query_mounts(self) -> list[MountInfo]:
mounts = []
for partition in psutil.disk_partitions(all=False):
mount_info = MountInfo(device=partition.device, mount_point=partition.mountpoint, fs_type=partition.fstype, options=partition.opts)
mounts.append(mount_info)
return mounts
def query_nvmes(self) -> list[DiskInfo]:
disks = []
try:
result = subprocess.run(['lsblk', '-ndo', 'NAME,MODEL,SIZE,SERIAL', '-b'], capture_output=True, text=True, check=False)
if result.returncode == 0:
for line in result.stdout.strip().split('\n'):
if not line.strip():
continue
parts = line.split(maxsplit=3)
if len(parts) >= 3:
device = f'/dev/{parts[0]}'
model = parts[1] if len(parts) > 1 else 'Unknown'
size_str = parts[2] if len(parts) > 2 else '0'
serial = parts[3] if len(parts) > 3 else 'Unknown'
try:
size = int(size_str)
except ValueError:
size = 0
disk_info = DiskInfo(device=device, model=model, size=size, serial=serial)
disks.append(disk_info)
except FileNotFoundError:
pass
if not disks:
disks = self._query_disks_fallback()
return disks
def _query_disks_fallback(self) -> list[DiskInfo]:
disks = []
seen_devices = set()
for partition in psutil.disk_partitions(all=True):
device = partition.device
if not device.startswith('/dev/'):
continue
base_device = self._get_base_device(device)
if base_device in seen_devices:
continue
seen_devices.add(base_device)
try:
usage = psutil.disk_usage(partition.mountpoint)
size = usage.total
except (PermissionError, OSError):
size = 0
disk_info = DiskInfo(device=base_device, model='Unknown', size=size, serial='Unknown')
disks.append(disk_info)
return disks
def _get_base_device(self, device: str) -> str:
if 'nvme' in device:
if 'p' in device:
return device.rsplit('p', 1)[0]
return device
import re
match = re.match('(/dev/[a-z]+)', device)
if match:
return match.group(1)
return device
def get_disk_for_path(self, path: Path) -> Optional[str]:
path = path.resolve()
best_match = None
best_match_len = 0
for partition in psutil.disk_partitions():
mount_point = Path(partition.mountpoint)
try:
if path == mount_point or mount_point in path.parents:
mount_len = len(str(mount_point))
if mount_len > best_match_len:
best_match = partition.device
best_match_len = mount_len
except (ValueError, OSError):
continue
return best_match
def get_disk_usage(self, path: Path) -> tuple[int, int, int]:
try:
usage = psutil.disk_usage(str(path))
return (usage.total, usage.used, usage.free)
except (PermissionError, OSError):
return (0, 0, 0)
def get_mount_point(self, path: Path) -> Optional[Path]:
path = path.resolve()
best_match = None
best_match_len = 0
for partition in psutil.disk_partitions():
mount_point = Path(partition.mountpoint)
try:
if path == mount_point or mount_point in path.parents:
mount_len = len(str(mount_point))
if mount_len > best_match_len:
best_match = mount_point
best_match_len = mount_len
except (ValueError, OSError):
continue
return best_match
def is_same_filesystem(self, path1: Path, path2: Path) -> bool:
try:
stat1 = path1.stat()
stat2 = path2.stat()
return stat1.st_dev == stat2.st_dev
except (OSError, PermissionError):
return False

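A quick sketch of how `SystemAPI` might be exercised on a Linux host (import path assumed; disk enumeration uses `lsblk` when present and falls back to psutil otherwise):

```python
from pathlib import Path

from app.discovery import SystemAPI  # import path assumed from this commit

api = SystemAPI()

for mount in api.query_mounts():
    print(f"{mount.device} on {mount.mount_point} ({mount.fs_type})")

for disk in api.query_nvmes():  # lsblk output, psutil fallback
    print(disk.device, disk.model, disk.size, disk.serial)

root = Path("/media/mike/DISK1")  # example mount point used elsewhere in this commit
print("device:", api.get_disk_for_path(root))
print("total/used/free:", api.get_disk_usage(root))
print("same fs as /tmp:", api.is_same_filesystem(root, Path("/tmp")))
```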
59
app/enrichment/enricher.py Normal file
View File

@@ -0,0 +1,59 @@
from typing import Dict
import re
class ContentEnricher:
def __init__(self, llm_client=None):
self.llm_client = llm_client
self.pii_patterns = {
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
'ssn': r'\b\d{3}-\d{2}-\d{4}\b',
'credit_card': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b'
}
def enrich(self, text: str, use_llm: bool = False) -> Dict:
enrichment = {
'summary': self._basic_summary(text),
'word_count': len(text.split()),
'has_pii': self._detect_pii(text),
'quality': self._assess_quality(text),
'topics': self._extract_basic_topics(text)
}
if use_llm and self.llm_client:
llm_result = self.llm_client.classify_content(text)
if llm_result.get('success'):
enrichment['llm_classification'] = llm_result['text']
return enrichment
def _basic_summary(self, text: str) -> str:
sentences = re.split(r'[.!?]+', text)
return ' '.join(sentences[:3])[:200]
def _detect_pii(self, text: str) -> Dict:
detected = {}
for pii_type, pattern in self.pii_patterns.items():
matches = re.findall(pattern, text)
if matches:
detected[pii_type] = len(matches)
return detected
def _assess_quality(self, text: str) -> str:
if len(text.strip()) < 10:
return 'low'
special_char_ratio = sum(1 for c in text if not c.isalnum() and not c.isspace()) / len(text)
if special_char_ratio > 0.3:
return 'low'
return 'high' if len(text.split()) > 50 else 'medium'
def _extract_basic_topics(self, text: str) -> list:
words = re.findall(r'\b[A-Z][a-z]+\b', text)
word_freq = {}
for word in words:
if len(word) > 3:
word_freq[word] = word_freq.get(word, 0) + 1
return sorted(word_freq, key=word_freq.get, reverse=True)[:10]

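A minimal sketch of the rule-based enrichment path (`use_llm=False`, so no LLM client is needed); the import mirrors how `app/main.py` loads the class:

```python
# Import mirrors app/main.py ("from enrichment.enricher import ContentEnricher");
# adjust sys.path or use the app.* package prefix depending on where you run it.
from enrichment.enricher import ContentEnricher

enricher = ContentEnricher()
sample = (
    "Project Defrag migrates backup data across NVMe drives. "
    "Contact admin@example.com for access. "
    "The Discovery Engine feeds the Classification Engine before migration."
)

result = enricher.enrich(sample, use_llm=False)
print(result["summary"])     # first sentences, capped at 200 characters
print(result["word_count"])  # whitespace-split word count
print(result["has_pii"])     # {'email': 1} for the address above
print(result["quality"])     # 'low' / 'medium' / 'high' heuristic
print(result["topics"])      # capitalized words ranked by frequency
```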
View File

@@ -0,0 +1,54 @@
import requests
import json
from typing import Dict, Optional
class LLMClient:
def __init__(self, endpoint: str = 'http://192.168.1.74:1234', model: str = 'local'):
self.endpoint = endpoint
self.model = model
self.local_ollama = 'http://localhost:11434'
def summarize(self, text: str, max_length: int = 200) -> Dict:
prompt = f"Summarize the following in {max_length} chars or less:\n\n{text[:2000]}"
return self._query(prompt)
def extract_topics(self, text: str) -> Dict:
prompt = f"Extract 5-10 key topics/tags from this text. Return as comma-separated list:\n\n{text[:2000]}"
return self._query(prompt)
def classify_content(self, text: str) -> Dict:
prompt = f"Classify this content. Return: category, topics, has_pii (yes/no), quality (high/medium/low):\n\n{text[:1000]}"
return self._query(prompt)
def _query(self, prompt: str, use_local: bool = False) -> Dict:
try:
endpoint = self.local_ollama if use_local else self.endpoint
if use_local:
response = requests.post(
f'{endpoint}/api/generate',
json={'model': 'llama3.2', 'prompt': prompt, 'stream': False},
timeout=30
)
else:
response = requests.post(
f'{endpoint}/v1/chat/completions',
json={
'model': self.model,
'messages': [{'role': 'user', 'content': prompt}],
'max_tokens': 500
},
timeout=30
)
if response.status_code == 200:
data = response.json()
if use_local:
return {'success': True, 'text': data.get('response', '')}
else:
return {'success': True, 'text': data['choices'][0]['message']['content']}
else:
return {'success': False, 'error': f'HTTP {response.status_code}'}
except Exception as e:
return {'success': False, 'error': str(e)}

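A hedged usage sketch of `LLMClient` as defined above. The default endpoint is LAN-specific, so point it at whichever OpenAI-compatible server you actually run; every call returns a plain dict with a `success` flag:

```python
# LLMClient is defined directly above; 'http://192.168.1.74:1234' is a
# LAN-specific default, so substitute your own OpenAI-compatible endpoint.
client = LLMClient(endpoint="http://192.168.1.74:1234", model="local")

result = client.summarize("Project Defrag reorganizes 20TB of backup data "
                          "across NVMe drives with deduplication.")
if result["success"]:
    print(result["text"])
else:
    print("LLM unavailable:", result["error"])

# classify_content and extract_topics follow the same request/response shape.
print(client.classify_content("Quarterly invoice for ACME, total 4,200 EUR."))
```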
3
app/filters/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
from .gitignore import GitignoreFilter, DEFAULT_PATTERNS
__all__ = ['GitignoreFilter', 'DEFAULT_PATTERNS']

30
app/filters/gitignore.py Normal file
View File

@@ -0,0 +1,30 @@
from pathlib import Path
from typing import Optional, Set
import fnmatch
DEFAULT_PATTERNS = {
'node_modules/**', '__pycache__/**', '.git/**', 'build/**', 'dist/**',
'.cache/**', 'target/**', 'vendor/**', '.venv/**', 'venv/**',
'*.pyc', '*.pyo', '*.so', '*.dll', '*.dylib', '*.o', '*.a',
'.DS_Store', 'Thumbs.db', '.pytest_cache/**', '.tox/**',
'*.egg-info/**', '.mypy_cache/**', '.coverage', 'htmlcov/**',
'.gradle/**', 'bin/**', 'obj/**', '.vs/**', '.idea/**'
}
class GitignoreFilter:
    def __init__(self, patterns: Optional[Set[str]] = None):
self.patterns = patterns or DEFAULT_PATTERNS
def should_exclude(self, path: str) -> bool:
path_obj = Path(path)
for pattern in self.patterns:
if '**' in pattern:
clean_pattern = pattern.replace('/**', '').replace('**/', '')
if clean_pattern in path_obj.parts:
return True
elif fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(path_obj.name, pattern):
return True
return False
def filter_files(self, files: list) -> list:
return [f for f in files if not self.should_exclude(f)]

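A short sketch of `GitignoreFilter` with its default patterns; the import mirrors `app/main.py`, which adds the app directory to `sys.path` before importing:

```python
from filters import GitignoreFilter  # app/main.py imports it the same way

gi = GitignoreFilter()  # uses DEFAULT_PATTERNS

print(gi.should_exclude("project/node_modules/react/index.js"))  # True (node_modules dir)
print(gi.should_exclude("src/app.pyc"))                          # True (*.pyc)
print(gi.should_exclude("docs/ARCHITECTURE.md"))                 # False

candidates = ["a/__pycache__/mod.cpython-311.pyc", "src/main.py", ".DS_Store"]
print(gi.filter_files(candidates))  # ['src/main.py']
```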
918
app/main.py Normal file
View File

@@ -0,0 +1,918 @@
import os
import sys
from dataclasses import dataclass
import psycopg2
import shutil
import hashlib
import argparse
import json
from pathlib import Path
from typing import List, Dict, Optional
from datetime import datetime
import logging
import time
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.FileHandler('disk_reorganizer.log'), logging.StreamHandler(sys.stdout)])
logger = logging.getLogger(__name__)
@dataclass
class FileRecord:
path: str
size: int
modified_time: float
disk_label: str
checksum: Optional[str] = None
status: str = 'indexed'
class DiskReorganizer:
def __init__(self, db_config: Dict=None):
if db_config is None:
db_config = {'host': os.getenv('DB_HOST', '192.168.1.159'), 'port': int(os.getenv('DB_PORT', 5432)), 'database': os.getenv('DB_NAME', 'disk_reorganizer_db'), 'user': os.getenv('DB_USER', 'disk_reorg_user'), 'password': os.getenv('DB_PASSWORD', 'heel-goed-wachtwoord')}
self.db_config = db_config
self.init_database()
def get_connection(self):
return psycopg2.connect(**self.db_config)
def init_database(self):
try:
conn = self.get_connection()
cursor = conn.cursor()
cursor.execute("\n SELECT table_name FROM information_schema.tables\n WHERE table_schema = 'public' AND table_name IN ('files', 'operations')\n ")
tables = cursor.fetchall()
if len(tables) < 2:
logger.error('Database tables not found! Please run setup_database.sh first.')
raise Exception('Database not properly initialized. Run setup_database.sh')
cursor.close()
conn.close()
logger.info('Database connection verified successfully')
except psycopg2.Error as e:
logger.error(f'Database connection failed: {e}')
raise
def index_disk(self, disk_root: str, disk_name: str):
logger.info(f'Indexing disk: {disk_name} at {disk_root}')
disk_path = Path(disk_root)
if not disk_path.exists():
logger.error(f'Disk path {disk_root} does not exist!')
return
files_count = 0
total_size = 0
start_time = time.time()
conn = self.get_connection()
cursor = conn.cursor()
try:
for root, dirs, files in os.walk(disk_path):
dirs[:] = [d for d in dirs if not d.startswith(('$', 'System Volume Information', 'Recovery'))]
for file in files:
try:
file_path = Path(root) / file
if not file_path.is_file():
continue
stat = file_path.stat()
size = stat.st_size
mtime = datetime.fromtimestamp(stat.st_mtime)
rel_path = str(file_path.relative_to(disk_path))
cursor.execute('\n INSERT INTO files (path, size, modified_time, disk_label, checksum, status)\n VALUES (%s, %s, %s, %s, %s, %s)\n ON CONFLICT (path) DO UPDATE SET\n size = EXCLUDED.size,\n modified_time = EXCLUDED.modified_time,\n disk_label = EXCLUDED.disk_label,\n status = EXCLUDED.status\n ', (rel_path, size, mtime, disk_name, None, 'indexed'))
files_count += 1
total_size += size
if files_count % 100 == 0:
elapsed = time.time() - start_time
rate = files_count / elapsed if elapsed > 0 else 0
display_path = str(file_path)
if len(display_path) > 60:
display_path = '...' + display_path[-57:]
print(f'\rIndexing: {files_count:,} files | {self.format_size(total_size)} | {rate:.0f} files/s | {display_path}', end='', flush=True)
if files_count % 1000 == 0:
conn.commit()
except Exception as e:
conn.rollback()
logger.warning(f'\nSkipping {file_path}: {e}')
continue
conn.commit()
print()
logger.info(f'Completed indexing {disk_name}: {files_count} files, {self.format_size(total_size)}')
finally:
cursor.close()
conn.close()
def calculate_disk_usage(self) -> Dict[str, Dict]:
conn = self.get_connection()
cursor = conn.cursor()
try:
cursor.execute('\n SELECT disk_label, SUM(size) as total_size, COUNT(*) as file_count\n FROM files\n GROUP BY disk_label\n ')
usage = {}
for row in cursor.fetchall():
disk = row[0]
size = int(row[1] or 0)
count = int(row[2])
usage[disk] = {'size': size, 'count': count, 'formatted_size': self.format_size(size)}
return usage
finally:
cursor.close()
conn.close()
def plan_migration(self, target_disk: str, destination_disks: List[str]) -> Dict:
logger.info(f'Planning migration to free up {target_disk}')
usage = self.calculate_disk_usage()
if target_disk not in usage:
logger.error(f'Target disk {target_disk} not found in index!')
return {}
conn = self.get_connection()
cursor = conn.cursor()
cursor.execute('SELECT path, size, modified_time FROM files WHERE disk_label = %s ORDER BY size DESC', (target_disk,))
files_to_move = cursor.fetchall()
cursor.close()
conn.close()
target_disk_usage = usage[target_disk]['size']
logger.info(f'Need to move {len(files_to_move)} files, {self.format_size(target_disk_usage)}')
dest_availability = []
for disk in destination_disks:
if disk not in usage:
available = float('inf')
else:
available = float('inf')
dest_availability.append({'disk': disk, 'available': available, 'planned_usage': 0})
plan = {'target_disk': target_disk, 'total_size': target_disk_usage, 'file_count': len(files_to_move), 'operations': [], 'destination_disks': destination_disks}
conn = self.get_connection()
cursor = conn.cursor()
try:
for file_info in files_to_move:
rel_path, size, mtime = file_info
dest_disk = destination_disks[len(plan['operations']) % len(destination_disks)]
op = {'source_disk': target_disk, 'source_path': rel_path, 'dest_disk': dest_disk, 'target_path': rel_path, 'size': int(size)}
plan['operations'].append(op)
cursor.execute('INSERT INTO operations (source_path, target_path, operation_type, status) VALUES (%s, %s, %s, %s)', (f'{target_disk}:{rel_path}', f'{dest_disk}:{rel_path}', 'move', 'pending'))
conn.commit()
finally:
cursor.close()
conn.close()
plan_file = f"migration_plan_{target_disk}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(plan_file, 'w') as f:
json.dump(plan, f, indent=2)
logger.info(f"Plan created with {len(plan['operations'])} operations")
logger.info(f'Plan saved to {plan_file}')
return plan
def verify_operation(self, source: Path, dest: Path) -> bool:
if not dest.exists():
return False
try:
source_stat = source.stat()
dest_stat = dest.stat()
if source_stat.st_size != dest_stat.st_size:
return False
return True
except Exception as e:
logger.error(f'Verification error: {e}')
return False
@staticmethod
def file_checksum(path: Path) -> str:
hash_md5 = hashlib.md5()
with open(path, 'rb') as f:
for chunk in iter(lambda: f.read(4096), b''):
hash_md5.update(chunk)
return hash_md5.hexdigest()
def execute_migration(self, plan_file: str, dry_run: bool=True):
logger.info(f"{('DRY RUN' if dry_run else 'EXECUTING')} migration from {plan_file}")
with open(plan_file, 'r') as f:
plan = json.load(f)
operations = plan['operations']
logger.info(f'Processing {len(operations)} operations...')
success_count = 0
error_count = 0
start_time = time.time()
conn = self.get_connection()
cursor = conn.cursor()
try:
for i, op in enumerate(operations, 1):
source_disk = op['source_disk']
source_path = op['source_path']
dest_disk = op['dest_disk']
target_path = op['target_path']
source_full = Path(source_disk) / source_path
dest_full = Path(dest_disk) / target_path
elapsed = time.time() - start_time
rate = i / elapsed if elapsed > 0 else 0
eta = (len(operations) - i) / rate if rate > 0 else 0
display_path = str(source_path)
if len(display_path) > 50:
display_path = '...' + display_path[-47:]
print(f'\r[{i}/{len(operations)}] {success_count} OK, {error_count} ERR | {rate:.1f} files/s | ETA: {int(eta)}s | {display_path}', end='', flush=True)
if dry_run:
if source_full.exists():
success_count += 1
else:
logger.warning(f'\n Source does not exist: {source_full}')
error_count += 1
continue
try:
dest_full.parent.mkdir(parents=True, exist_ok=True)
if source_full.exists():
shutil.copy2(source_full, dest_full)
if self.verify_operation(source_full, dest_full):
cursor.execute("UPDATE files SET disk_label = %s, status = 'moved' WHERE path = %s AND disk_label = %s", (dest_disk, source_path, source_disk))
cursor.execute('UPDATE operations SET executed = 1, executed_at = CURRENT_TIMESTAMP WHERE source_path = %s', (f'{source_disk}:{source_path}',))
success_count += 1
else:
raise Exception('Verification failed')
else:
logger.warning(f'\n Source missing: {source_full}')
error_count += 1
except Exception as e:
logger.error(f'\n Error processing {source_path}: {e}')
cursor.execute('UPDATE operations SET error = %s WHERE source_path = %s', (str(e), f'{source_disk}:{source_path}'))
error_count += 1
if i % 10 == 0:
conn.commit()
conn.commit()
print()
finally:
cursor.close()
conn.close()
logger.info(f'Migration complete: {success_count} success, {error_count} errors')
if not dry_run and error_count == 0:
logger.info(f"✓ Disk {plan['target_disk']} is ready for Linux installation!")
logger.info(f" Remember to safely delete original files from {plan['target_disk']}")
def run_deduplication(self, disk: Optional[str]=None, use_chunks: bool=True):
logger.info(f"Starting deduplication{(' for disk ' + disk if disk else '')}")
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
conn = self.get_connection()
cursor = conn.cursor()
def hash_file_local(file_path: Path) -> str:
hasher = hashlib.sha256()
with open(file_path, 'rb') as f:
while (chunk := f.read(65536)):
hasher.update(chunk)
return hasher.hexdigest()
try:
if disk:
cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s AND checksum IS NULL ORDER BY size DESC', (disk,))
else:
cursor.execute('SELECT path, size, disk_label FROM files WHERE checksum IS NULL ORDER BY size DESC')
files_to_process = cursor.fetchall()
total = len(files_to_process)
logger.info(f'Found {total} files to hash')
processed = 0
skipped = 0
start_time = time.time()
batch = []
print(f'Phase 1: Computing checksums...')
for idx, (path_str, size, disk_label) in enumerate(files_to_process, 1):
try:
mount_point = disk_mount_map.get(disk_label, disk_label)
full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)
if not full_path.exists():
skipped += 1
if idx % 100 == 0:
elapsed = time.time() - start_time
rate = (processed + skipped) / elapsed if elapsed > 0 else 0
remaining = (total - idx) / rate if rate > 0 else 0
pct = 100 * idx / total
print(f'\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining / 60)}m{int(remaining % 60):02d}s | Skip: {skipped:,}', end='', flush=True)
continue
checksum = hash_file_local(full_path)
batch.append((checksum, path_str))
processed += 1
if len(batch) >= 1000:
try:
cursor.executemany('UPDATE files SET checksum = %s WHERE path = %s', batch)
conn.commit()
batch.clear()
except Exception as e:
conn.rollback()
batch.clear()
print(f'\nBatch update failed: {e}')
if idx % 100 == 0:
elapsed = time.time() - start_time
rate = (processed + skipped) / elapsed if elapsed > 0 else 0
remaining = (total - idx) / rate if rate > 0 else 0
pct = 100 * idx / total
print(f'\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining / 60)}m{int(remaining % 60):02d}s | Skip: {skipped:,}', end='', flush=True)
except Exception as e:
skipped += 1
if idx <= 5:
print(f'\nDebug: {full_path} - {e}')
if batch:
try:
cursor.executemany('UPDATE files SET checksum = %s WHERE path = %s', batch)
conn.commit()
except Exception as e:
conn.rollback()
print(f'\nFinal batch failed: {e}')
print()
elapsed = time.time() - start_time
logger.info(f'Phase 1 done: {processed:,} files in {int(elapsed / 60)}m{int(elapsed % 60):02d}s ({skipped:,} skipped)')
print('Phase 2: Finding duplicates...')
cursor.execute('\n UPDATE files f1 SET duplicate_of = (\n SELECT MIN(path) FROM files f2\n WHERE f2.checksum = f1.checksum AND f2.path < f1.path\n )\n WHERE checksum IS NOT NULL\n ')
conn.commit()
cursor.execute('SELECT COUNT(*) FROM files WHERE duplicate_of IS NOT NULL')
dup_count = cursor.fetchone()[0]
logger.info(f'Phase 2 done: Found {dup_count:,} duplicates')
finally:
cursor.close()
conn.close()
def plan_merge(self, sources: List[str], target: str, output_file: str, filter_system: bool=False, network_target: str=None):
logger.info(f"Planning merge: {', '.join(sources)}{target or network_target}")
if filter_system:
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from filters import GitignoreFilter
file_filter = GitignoreFilter()
logger.info('System/build file filtering enabled')
conn = self.get_connection()
cursor = conn.cursor()
try:
placeholders = ','.join(['%s'] * len(sources))
cursor.execute(f'\n SELECT path, size, checksum, disk_label, duplicate_of\n FROM files\n WHERE disk_label IN ({placeholders})\n ORDER BY size DESC\n ', tuple(sources))
files = cursor.fetchall()
total_files = len(files)
total_size = sum((int(f[1]) for f in files))
unique_files = {}
duplicate_count = 0
duplicate_size = 0
filtered_count = 0
filtered_size = 0
for path, size, checksum, disk_label, duplicate_of in files:
if filter_system and file_filter.should_exclude(path):
filtered_count += 1
filtered_size += int(size)
continue
if checksum and checksum in unique_files:
duplicate_count += 1
duplicate_size += int(size)
elif checksum:
unique_files[checksum] = (path, int(size), disk_label)
unique_count = len(unique_files)
unique_size = sum((f[1] for f in unique_files.values()))
plan = {'sources': sources, 'target': target or network_target, 'network': network_target is not None, 'total_files': total_files, 'total_size': total_size, 'unique_files': unique_count, 'unique_size': unique_size, 'duplicate_files': duplicate_count, 'duplicate_size': duplicate_size, 'filtered_files': filtered_count if filter_system else 0, 'filtered_size': filtered_size if filter_system else 0, 'space_saved': duplicate_size + (filtered_size if filter_system else 0), 'operations': []}
for checksum, (path, size, disk_label) in unique_files.items():
plan['operations'].append({'source_disk': disk_label, 'source_path': path, 'target_disk': target or network_target, 'target_path': path, 'size': size, 'checksum': checksum})
with open(output_file, 'w') as f:
json.dump(plan, f, indent=2)
logger.info(f'Merge plan saved to {output_file}')
print(f'\n=== MERGE PLAN SUMMARY ===')
print(f"Sources: {', '.join(sources)}")
print(f'Target: {target or network_target}')
print(f'Total files: {total_files:,} ({self.format_size(total_size)})')
if filter_system:
print(f'Filtered (system/build): {filtered_count:,} ({self.format_size(filtered_size)})')
print(f'Unique files: {unique_count:,} ({self.format_size(unique_size)})')
print(f'Duplicates: {duplicate_count:,} ({self.format_size(duplicate_size)})')
print(f"Total space saved: {self.format_size(plan['space_saved'])}")
print(f'Space needed on target: {self.format_size(unique_size)}')
finally:
cursor.close()
conn.close()
def generate_report(self, format='text', show_duplicates=False, preview_merge=None):
conn = self.get_connection()
cursor = conn.cursor()
try:
if preview_merge:
with open(preview_merge, 'r') as f:
plan = json.load(f)
print('\n=== MERGE PLAN PREVIEW ===')
print(f"Sources: {', '.join(plan['sources'])}")
print(f"Target: {plan['target']}")
print(f"Total files: {plan['total_files']:,} ({self.format_size(plan['total_size'])})")
print(f"Unique files: {plan['unique_files']:,} ({self.format_size(plan['unique_size'])})")
print(f"Duplicates: {plan['duplicate_files']:,} ({self.format_size(plan['duplicate_size'])})")
print(f"Space saved: {self.format_size(plan['space_saved'])}")
print(f"Space needed on target: {self.format_size(plan['unique_size'])}")
return
cursor.execute('\n SELECT status, COUNT(*), SUM(size) FROM files GROUP BY status\n ')
print('\n=== FILE MIGRATION REPORT ===')
for row in cursor.fetchall():
status, count, size = row
print(f'{status:15}: {count:6} files, {self.format_size(int(size or 0))}')
cursor.execute('\n SELECT disk_label, COUNT(*), SUM(size) FROM files GROUP BY disk_label\n ')
print('\n=== DISK USAGE ===')
for row in cursor.fetchall():
disk, count, size = row
print(f'{disk:20}: {count:6} files, {self.format_size(int(size or 0))}')
cursor.execute('\n SELECT COUNT(*), SUM(size) FROM files WHERE checksum IS NOT NULL\n ')
hashed_count, hashed_size = cursor.fetchone()
cursor.execute('\n SELECT COUNT(*), SUM(size) FROM files WHERE duplicate_of IS NOT NULL\n ')
dup_count, dup_size = cursor.fetchone()
print('\n=== DEDUPLICATION STATS ===')
print(f'Files with checksums: {hashed_count or 0:6}')
print(f'Duplicate files: {dup_count or 0:6} ({self.format_size(int(dup_size or 0))})')
if show_duplicates and dup_count:
print('\n=== DUPLICATE FILES ===')
cursor.execute('\n SELECT path, size, duplicate_of FROM files\n WHERE duplicate_of IS NOT NULL\n ORDER BY size DESC\n LIMIT 20\n ')
for path, size, dup_of in cursor.fetchall():
print(f' {path} ({self.format_size(int(size))}) → {dup_of}')
cursor.execute('\n SELECT operation_type, executed, verified, COUNT(*) FROM operations GROUP BY operation_type, executed, verified\n ')
print('\n=== OPERATIONS REPORT ===')
for row in cursor.fetchall():
op_type, executed, verified, count = row
status = 'EXECUTED' if executed else 'PENDING'
if verified:
status += '+VERIFIED'
print(f'{op_type:10} {status:15}: {count} operations')
finally:
cursor.close()
conn.close()
def profile_content(self, disk: Optional[str]=None, update_db: bool=False, limit: Optional[int]=None):
from content.profiler import ContentProfiler
profiler = ContentProfiler()
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
conn = self.get_connection()
cursor = conn.cursor()
try:
query = 'SELECT path, size, disk_label FROM files WHERE 1=1'
params = []
if disk:
query += ' AND disk_label = %s'
params.append(disk)
if limit:
query += f' LIMIT {limit}'
cursor.execute(query, params)
files = cursor.fetchall()
total = len(files)
logger.info(f'Profiling {total:,} files...')
kind_stats = {}
processable = 0
batch = []
for idx, (path, size, disk_label) in enumerate(files, 1):
mount_point = disk_mount_map.get(disk_label, disk_label)
full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
if not full_path.exists():
continue
profile = profiler.profile_file(full_path)
if 'error' not in profile:
kind = profile['kind']
if kind not in kind_stats:
kind_stats[kind] = {'count': 0, 'processable': 0}
kind_stats[kind]['count'] += 1
if profile['processable']:
kind_stats[kind]['processable'] += 1
processable += 1
if update_db:
profile_json = json.dumps(profile)
batch.append((kind, profile_json, path))
if len(batch) >= 500:
cursor.executemany("UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s", [(pj, p) for k, pj, p in batch])
conn.commit()
batch.clear()
if idx % 100 == 0:
print(f'\rProfiled: {idx:,}/{total:,}', end='', flush=True)
if update_db and batch:
cursor.executemany("UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s", [(pj, p) for k, pj, p in batch])
conn.commit()
print()
print(f'\n=== CONTENT PROFILE SUMMARY ===')
print(f'Total files: {total:,}')
print(f'Processable: {processable:,}\n')
print(f"{'Kind':<15} {'Total':<10} {'Processable':<12} {'Extractor'}")
print('-' * 60)
for kind in sorted(kind_stats.keys()):
stats = kind_stats[kind]
extractor = profiler._suggest_extractor(kind, '')
print(f"{kind:<15} {stats['count']:<10,} {stats['processable']:<12,} {extractor or 'none'}")
finally:
cursor.close()
conn.close()
def extract_content(self, kind: Optional[str]=None, limit: int=10):
from content.profiler import ContentProfiler
from content.extractors import ContentExtractor
profiler = ContentProfiler()
extractor = ContentExtractor()
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
conn = self.get_connection()
cursor = conn.cursor()
try:
query = "SELECT path, size, disk_label, metadata FROM files WHERE metadata->'profile'->>'processable' = 'true'"
params = []
if kind:
query += " AND metadata->'profile'->>'kind' = %s"
params.append(kind)
query += f' LIMIT {limit}'
cursor.execute(query, params)
files = cursor.fetchall()
print(f'\n=== EXTRACTING CONTENT ===')
print(f'Processing {len(files)} files\n')
for path, size, disk_label, metadata in files:
mount_point = disk_mount_map.get(disk_label, disk_label)
full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
if not full_path.exists():
continue
profile = metadata.get('profile', {}) if metadata else {}
extractor_type = profile.get('extractor')
if not extractor_type:
continue
print(f'Extracting: {path}')
print(f" Type: {profile.get('kind')} | Extractor: {extractor_type}")
result = extractor.extract(full_path, extractor_type)
if 'text' in result:
preview = result['text'][:200]
print(f' Preview: {preview}...')
elif 'pipeline' in result:
print(f" Pipeline: {''.join(result['pipeline'])}")
print(f" Status: {result.get('status', 'pending')}")
print()
finally:
cursor.close()
conn.close()
def parse_files(self, kind: Optional[str] = None, limit: int = 100, update_db: bool = False):
from parsers.text_parser import TextParser
from parsers.code_parser import CodeParser
from parsers.pdf_parser import PDFParser
parsers = {'text': TextParser(), 'code': CodeParser(), 'pdf': PDFParser()}
disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
conn = self.get_connection()
cursor = conn.cursor()
try:
query = "SELECT path, size, disk_label FROM files WHERE 1=1"
params = []
if kind:
suffix_map = {'text': "('.txt', '.md', '.log', '.json')", 'code': "('.py', '.js', '.java', '.go')", 'pdf': "('.pdf',)"}
if kind in suffix_map:
query += f" AND RIGHT(path, 4) IN {suffix_map[kind]} OR RIGHT(path, 3) IN {suffix_map[kind]}"
query += f" LIMIT {limit}"
cursor.execute(query, params)
files = cursor.fetchall()
print(f"\n=== PARSING FILES ===\nProcessing {len(files)} files\n")
parsed_count = 0
for path, size, disk_label in files:
mount_point = disk_mount_map.get(disk_label, disk_label)
full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
if not full_path.exists() or int(size) > 10 * 1024 * 1024:
continue
file_kind = 'pdf' if path.endswith('.pdf') else 'code' if any(path.endswith(e) for e in ['.py', '.js', '.java']) else 'text'
parser = parsers.get(file_kind)
if not parser:
continue
result = parser.parse(full_path)
if 'error' not in result:
text = result.get('text', '')
quality = result.get('quality', 'unknown')
print(f"{path[:60]} | {file_kind} | {len(text):,} chars")
if update_db and text:
cursor.execute("UPDATE files SET extracted_text = %s, text_quality = %s WHERE path = %s", (text[:50000], quality, path))
parsed_count += 1
if parsed_count % 10 == 0:
conn.commit()
if update_db:
conn.commit()
print(f"\nParsed {parsed_count} files")
finally:
cursor.close()
conn.close()
def enrich_files(self, limit: int = 10, llm_endpoint: str = None, use_local: bool = False):
from enrichment.enricher import ContentEnricher
enricher = ContentEnricher()
conn = self.get_connection()
cursor = conn.cursor()
try:
cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL LIMIT {limit}")
files = cursor.fetchall()
print(f"\n=== ENRICHING CONTENT ===\nProcessing {len(files)} files\n")
for path, text in files:
enrichment = enricher.enrich(text[:5000], use_llm=False)
print(f"{path[:60]}")
print(f" Quality: {enrichment.get('quality')} | Words: {enrichment.get('word_count'):,}")
print(f" PII: {list(enrichment.get('has_pii', {}).keys())}")
print(f" Topics: {', '.join(enrichment.get('topics', [])[:5])}\n")
cursor.execute("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", (json.dumps(enrichment), path))
conn.commit()
print(f"Enriched {len(files)} files")
finally:
cursor.close()
conn.close()
def classify_files(self, disk: Optional[str]=None, update_db: bool=False, resume: bool=True):
from classification.classifier import FileClassifier
classifier = FileClassifier()
conn = self.get_connection()
cursor = conn.cursor()
try:
task_name = f"classify_{disk or 'all'}"
skip_count = 0
if resume and update_db:
cursor.execute('SELECT last_processed_path, processed_count FROM processing_checkpoints WHERE task_name = %s', (task_name,))
checkpoint = cursor.fetchone()
if checkpoint:
last_path, skip_count = checkpoint
logger.info(f'Resuming from checkpoint: {skip_count:,} files already processed')
if disk:
cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s ORDER BY path', (disk,))
else:
cursor.execute('SELECT path, size, disk_label FROM files ORDER BY path')
files = cursor.fetchall()
total = len(files)
logger.info(f'Classifying {total:,} files...')
categories = {}
build_artifacts = 0
batch = []
processed = 0
for idx, (path, size, disk_label) in enumerate(files, 1):
if idx <= skip_count:
continue
labels, category, is_build = classifier.classify_path(path, int(size))
if is_build:
build_artifacts += 1
if category not in categories:
categories[category] = {'count': 0, 'size': 0}
categories[category]['count'] += 1
categories[category]['size'] += int(size)
if update_db:
labels_str = ','.join(labels)
batch.append((category, labels_str, path))
if len(batch) >= 1000:
cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch])
cursor.execute('''
INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at)
VALUES (%s, %s, %s, CURRENT_TIMESTAMP)
ON CONFLICT (task_name) DO UPDATE SET
last_processed_path = EXCLUDED.last_processed_path,
processed_count = EXCLUDED.processed_count,
updated_at = CURRENT_TIMESTAMP
''', (task_name, path, idx))
conn.commit()
batch.clear()
processed += 1
if idx % 1000 == 0:
print(f'\rClassified: {idx:,}/{total:,} ({100*idx/total:.1f}%)', end='', flush=True)
if update_db and batch:
cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch])
cursor.execute('''
INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at)
VALUES (%s, %s, %s, CURRENT_TIMESTAMP)
ON CONFLICT (task_name) DO UPDATE SET
last_processed_path = EXCLUDED.last_processed_path,
processed_count = EXCLUDED.processed_count,
updated_at = CURRENT_TIMESTAMP
''', (task_name, files[-1][0] if files else '', total))
conn.commit()
print()
print(f'\n=== CLASSIFICATION SUMMARY ===')
print(f'Total files: {total:,}')
print(f'Build artifacts: {build_artifacts:,}')
print(f'\nCategories:')
for category in sorted(categories.keys()):
info = categories[category]
print(f" {category:30}: {info['count']:8,} files, {self.format_size(info['size'])}")
finally:
cursor.close()
conn.close()
def analyze_folders(self, disk: Optional[str]=None, min_files: int=3):
from analysis.folder_analyzer import FolderAnalyzer
analyzer = FolderAnalyzer()
conn = self.get_connection()
cursor = conn.cursor()
try:
query = '''
SELECT DISTINCT SUBSTRING(path FROM 1 FOR POSITION('/' IN path || '/') - 1) as folder, disk_label
FROM files
WHERE 1=1
'''
params = []
if disk:
query += ' AND disk_label = %s'
params.append(disk)
cursor.execute(query, params)
potential_folders = cursor.fetchall()
logger.info(f'Found {len(potential_folders)} potential folders to analyze')
processed = 0
for folder_name, disk_label in potential_folders:
cursor.execute('''
SELECT path, size FROM files
WHERE disk_label = %s AND path LIKE %s
''', (disk_label, f'{folder_name}%'))
files = cursor.fetchall()
if len(files) < min_files:
continue
files_list = [{'path': f[0], 'size': int(f[1])} for f in files]
folder_path = Path(folder_name)
analysis = analyzer.analyze_folder(folder_path, files_list)
readme_text = None
for file_dict in files_list:
if 'readme' in file_dict['path'].lower():
readme_text = f"Found README at {file_dict['path']}"
break
summary = analyzer.generate_summary(analysis, readme_text)
cursor.execute('''
INSERT INTO folders (path, disk_label, file_count, total_size, project_type, intent, summary,
has_readme, has_git, has_manifest, manifest_types, dominant_file_types, structure)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (path) DO UPDATE SET
file_count = EXCLUDED.file_count,
total_size = EXCLUDED.total_size,
project_type = EXCLUDED.project_type,
intent = EXCLUDED.intent,
summary = EXCLUDED.summary,
has_readme = EXCLUDED.has_readme,
has_git = EXCLUDED.has_git,
has_manifest = EXCLUDED.has_manifest,
manifest_types = EXCLUDED.manifest_types,
dominant_file_types = EXCLUDED.dominant_file_types,
structure = EXCLUDED.structure,
updated_at = CURRENT_TIMESTAMP
''', (
str(folder_path), disk_label, len(files_list), sum(f['size'] for f in files_list),
analysis.get('project_type'), analysis.get('intent'), summary,
analysis.get('has_readme'), analysis.get('has_git'), analysis.get('has_manifest'),
analysis.get('manifest_types'), json.dumps(analysis.get('dominant_file_types', {})),
json.dumps(analysis.get('structure', {}))
))
processed += 1
if processed % 100 == 0:
conn.commit()
print(f'\rAnalyzed: {processed} folders', end='', flush=True)
conn.commit()
print()
logger.info(f'Completed folder analysis: {processed} folders')
cursor.execute('''
SELECT project_type, COUNT(*), SUM(file_count), SUM(total_size)
FROM folders
GROUP BY project_type
''')
print(f'\n=== FOLDER ANALYSIS SUMMARY ===')
for row in cursor.fetchall():
proj_type, count, files, size = row
print(f'{proj_type:20}: {count:6,} folders, {files:8,} files, {self.format_size(int(size or 0))}')
finally:
cursor.close()
conn.close()
def review_migration(self, category: Optional[str]=None, show_build: bool=False):
from classification.classifier import FileClassifier
classifier = FileClassifier()
conn = self.get_connection()
cursor = conn.cursor()
try:
query = 'SELECT path, size, category FROM files WHERE 1=1'
params = []
if category:
query += ' AND category = %s'
params.append(category)
if not show_build:
query += " AND (metadata->>'labels' IS NULL OR metadata->>'labels' NOT LIKE '%build-artifact%')"
query += ' ORDER BY category, size DESC LIMIT 100'
cursor.execute(query, params)
files = cursor.fetchall()
if not files:
print('No files found matching criteria')
return
print(f'\n=== MIGRATION PREVIEW ===')
print(f'Showing {len(files)} files\n')
current_category = None
for path, size, cat in files:
if cat != current_category:
current_category = cat
print(f'\n{cat}:')
labels, suggested_cat, is_build = classifier.classify_path(path, int(size))
target = classifier.suggest_target_path(path, suggested_cat, labels)
print(f' {path}')
                print(f'    → {target} ({self.format_size(int(size))})')
finally:
cursor.close()
conn.close()
@staticmethod
def format_size(size: int) -> str:
for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
if size < 1024:
return f'{size:.1f}{unit}'
size /= 1024
return f'{size:.1f}PB'
def main():
parser = argparse.ArgumentParser(description='Disk Reorganizer - Free up a disk for Linux dual-boot')
subparsers = parser.add_subparsers(dest='command', required=True)
index_parser = subparsers.add_parser('index', help='Index files on a disk')
index_parser.add_argument('disk_root', help='Root path of disk (e.g., D:\\\\)')
index_parser.add_argument('disk_name', help='Logical name for the disk')
plan_parser = subparsers.add_parser('plan', help='Create migration plan')
plan_parser.add_argument('target_disk', help='Disk to free up')
plan_parser.add_argument('dest_disks', nargs='+', help='Destination disks')
exec_parser = subparsers.add_parser('execute', help='Execute migration plan')
exec_parser.add_argument('plan_file', help='Path to plan JSON file')
exec_parser.add_argument('--dry-run', action='store_true', help='Simulate without actual file operations')
dedupe_parser = subparsers.add_parser('dedupe', help='Deduplicate files and compute checksums')
dedupe_parser.add_argument('--disk', help='Optional: Only dedupe specific disk')
dedupe_parser.add_argument('--no-chunks', action='store_true', help='Disable chunk-level deduplication')
merge_parser = subparsers.add_parser('merge', help='Plan multi-disk merge with deduplication')
merge_parser.add_argument('--sources', nargs='+', required=True, help='Source disks to merge')
merge_parser.add_argument('--target', required=True, help='Target disk')
merge_parser.add_argument('--output', default='merge_plan.json', help='Output plan file')
merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')
profile_parser = subparsers.add_parser('profile', help='Create content profiles (inventory + triage)')
profile_parser.add_argument('--disk', help='Profile specific disk')
profile_parser.add_argument('--update', action='store_true', help='Update database with profiles')
profile_parser.add_argument('--limit', type=int, help='Limit number of files')
extract_parser = subparsers.add_parser('extract', help='Extract content from files')
extract_parser.add_argument('--kind', help='Extract specific kind (pdf, image, audio, video)')
extract_parser.add_argument('--limit', type=int, default=10, help='Limit extraction batch')
parse_parser = subparsers.add_parser('parse', help='Parse files to extract text')
parse_parser.add_argument('--kind', help='Parse specific kind (text, code, pdf)')
parse_parser.add_argument('--limit', type=int, default=100, help='Limit parse batch')
parse_parser.add_argument('--update', action='store_true', help='Save extracted text to database')
enrich_parser = subparsers.add_parser('enrich', help='Enrich content with LLM analysis')
enrich_parser.add_argument('--limit', type=int, default=10, help='Limit enrichment batch')
enrich_parser.add_argument('--llm-endpoint', default='http://192.168.1.74:1234', help='LLM endpoint')
enrich_parser.add_argument('--local', action='store_true', help='Use local Ollama')
classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization')
classify_parser.add_argument('--disk', help='Classify specific disk')
classify_parser.add_argument('--update', action='store_true', help='Update database with classifications')
classify_parser.add_argument('--no-resume', action='store_true', help='Start from scratch instead of resuming')
folders_parser = subparsers.add_parser('analyze-folders', help='Analyze folder structure and infer project intent')
folders_parser.add_argument('--disk', help='Analyze specific disk')
folders_parser.add_argument('--min-files', type=int, default=3, help='Minimum files per folder')
review_parser = subparsers.add_parser('review', help='Review proposed migration structure')
review_parser.add_argument('--category', help='Review specific category')
review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts')
report_parser = subparsers.add_parser('report', help='Show current status')
report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
report_parser.add_argument('--show-duplicates', action='store_true', help='Show duplicate files')
report_parser.add_argument('--preview-merge', help='Preview merge plan from file')
args = parser.parse_args()
tool = DiskReorganizer()
if args.command == 'index':
tool.index_disk(args.disk_root, args.disk_name)
elif args.command == 'dedupe':
tool.run_deduplication(disk=args.disk, use_chunks=not args.no_chunks)
elif args.command == 'merge':
tool.plan_merge(sources=args.sources, target=args.target, output_file=args.output, filter_system=args.filter_system, network_target=args.network)
elif args.command == 'plan':
plan = tool.plan_migration(args.target_disk, args.dest_disks)
if plan:
print(f"\nPlan generated: {plan['file_count']} files, {tool.format_size(plan['total_size'])}")
print(f"Destination disks: {', '.join(plan['destination_disks'])}")
elif args.command == 'execute':
tool.execute_migration(args.plan_file, dry_run=args.dry_run)
elif args.command == 'profile':
tool.profile_content(disk=args.disk, update_db=args.update, limit=args.limit)
elif args.command == 'extract':
tool.extract_content(kind=args.kind, limit=args.limit)
elif args.command == 'parse':
tool.parse_files(kind=args.kind, limit=args.limit, update_db=args.update)
elif args.command == 'enrich':
tool.enrich_files(limit=args.limit, llm_endpoint=args.llm_endpoint, use_local=args.local)
elif args.command == 'classify':
tool.classify_files(disk=args.disk, update_db=args.update, resume=not args.no_resume)
elif args.command == 'analyze-folders':
tool.analyze_folders(disk=args.disk, min_files=args.min_files)
elif args.command == 'review':
tool.review_migration(category=args.category, show_build=args.show_build)
elif args.command == 'report':
tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)
if __name__ == '__main__':
main()

27
app/migration/__init__.py Normal file
View File

@@ -0,0 +1,27 @@
"""Migration package exports"""
from .copy import (
CopyMigrationStrategy,
FastCopyStrategy,
SafeCopyStrategy,
ReferenceCopyStrategy
)
from .hardlink import (
HardlinkMigrationStrategy,
SymlinkMigrationStrategy,
DedupHardlinkStrategy
)
from .engine import MigrationEngine
from ._protocols import IMigrationStrategy, IMigrationEngine
__all__ = [
'CopyMigrationStrategy',
'FastCopyStrategy',
'SafeCopyStrategy',
'ReferenceCopyStrategy',
'HardlinkMigrationStrategy',
'SymlinkMigrationStrategy',
'DedupHardlinkStrategy',
'MigrationEngine',
'IMigrationStrategy',
'IMigrationEngine',
]
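
A minimal usage sketch of the exports above, assuming the repository root is on `PYTHONPATH` so `app` resolves as a package; the paths are illustrative:

```python
# Sketch only: pick one of the exported strategies by the operation type used in plans.
from pathlib import Path

from app.migration import HardlinkMigrationStrategy, SafeCopyStrategy

def pick_strategy(operation_type: str):
    # 'copy' for unique files, 'hardlink' for duplicates on the same filesystem
    return {
        'copy': SafeCopyStrategy(),
        'hardlink': HardlinkMigrationStrategy(),
    }[operation_type]

src, dst = Path('/mnt/source/a.txt'), Path('/mnt/target/a.txt')
strategy = pick_strategy('copy')
if strategy.can_migrate(src, dst):
    strategy.migrate(src, dst, verify=True)
```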

107
app/migration/_protocols.py Normal file
View File

@@ -0,0 +1,107 @@
"""Protocol definitions for the migration package"""
from typing import Protocol
from pathlib import Path
from ..shared.models import OperationRecord
class IMigrationStrategy(Protocol):
"""Protocol for migration strategies"""
def migrate(
self,
source: Path,
destination: Path,
verify: bool = True
) -> bool:
"""Migrate a file from source to destination
Args:
source: Source file path
destination: Destination file path
verify: Whether to verify the operation
Returns:
True if migration successful
"""
...
def can_migrate(self, source: Path, destination: Path) -> bool:
"""Check if migration is possible
Args:
source: Source file path
destination: Destination file path
Returns:
True if migration is possible
"""
...
def estimate_time(self, source: Path) -> float:
"""Estimate migration time in seconds
Args:
source: Source file path
Returns:
Estimated time in seconds
"""
...
def cleanup(self, source: Path) -> bool:
"""Cleanup source file after successful migration
Args:
source: Source file path
Returns:
True if cleanup successful
"""
...
class IMigrationEngine(Protocol):
"""Protocol for migration engine"""
def plan_migration(
self,
disk: str,
target_base: Path
) -> list[OperationRecord]:
"""Plan migration for a disk
Args:
disk: Disk identifier
target_base: Target base directory
Returns:
List of planned operations
"""
...
def execute_migration(
self,
operations: list[OperationRecord],
dry_run: bool = False
) -> dict:
"""Execute migration operations
Args:
operations: List of operations to execute
dry_run: Whether to perform a dry run
Returns:
Dictionary with execution statistics
"""
...
def rollback(self, operation: OperationRecord) -> bool:
"""Rollback a migration operation
Args:
operation: Operation to rollback
Returns:
True if rollback successful
"""
...
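
Because `IMigrationStrategy` is a `typing.Protocol`, any class with matching method signatures satisfies it structurally; a minimal sketch (the `NoOpStrategy` name is illustrative):

```python
# Sketch only: a do-nothing strategy that structurally satisfies IMigrationStrategy.
from pathlib import Path

from app.migration import IMigrationStrategy


class NoOpStrategy:
    def migrate(self, source: Path, destination: Path, verify: bool = True) -> bool:
        return True  # pretend the file was migrated

    def can_migrate(self, source: Path, destination: Path) -> bool:
        return source.exists()

    def estimate_time(self, source: Path) -> float:
        return 0.0

    def cleanup(self, source: Path) -> bool:
        return True


# A static type checker accepts this assignment purely from the signatures above.
strategy: IMigrationStrategy = NoOpStrategy()
```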

268
app/migration/copy.py Normal file
View File

@@ -0,0 +1,268 @@
"""Copy-based migration strategy"""
import shutil
from pathlib import Path
from typing import Optional
import os
from ..shared.logger import ProgressLogger
class CopyMigrationStrategy:
"""Copy files to destination with verification"""
def __init__(
self,
logger: Optional[ProgressLogger] = None,
preserve_metadata: bool = True,
verify_checksums: bool = True
):
"""Initialize copy migration strategy
Args:
logger: Optional progress logger
preserve_metadata: Whether to preserve file metadata
verify_checksums: Whether to verify checksums after copy
"""
self.logger = logger
self.preserve_metadata = preserve_metadata
self.verify_checksums = verify_checksums
def migrate(
self,
source: Path,
destination: Path,
verify: bool = True
) -> bool:
"""Migrate file by copying
Args:
source: Source file path
destination: Destination file path
verify: Whether to verify the operation
Returns:
True if migration successful
"""
if not source.exists():
if self.logger:
self.logger.error(f"Source file does not exist: {source}")
return False
# Create destination directory
destination.parent.mkdir(parents=True, exist_ok=True)
try:
# Copy file
if self.preserve_metadata:
shutil.copy2(source, destination)
else:
shutil.copy(source, destination)
# Verify if requested
if verify and self.verify_checksums:
if not self._verify_copy(source, destination):
if self.logger:
self.logger.error(f"Verification failed: {source} -> {destination}")
destination.unlink()
return False
return True
except Exception as e:
if self.logger:
self.logger.error(f"Copy failed: {source} -> {destination}: {e}")
return False
def _verify_copy(self, source: Path, destination: Path) -> bool:
"""Verify copied file
Args:
source: Source file path
destination: Destination file path
Returns:
True if verification successful
"""
# Check size
source_size = source.stat().st_size
dest_size = destination.stat().st_size
if source_size != dest_size:
return False
# Compare checksums for files larger than 1MB
if source_size > 1024 * 1024:
from ..deduplication.chunker import hash_file
source_hash = hash_file(source)
dest_hash = hash_file(destination)
return source_hash == dest_hash
# For small files, compare content directly
with open(source, 'rb') as f1, open(destination, 'rb') as f2:
return f1.read() == f2.read()
def can_migrate(self, source: Path, destination: Path) -> bool:
"""Check if migration is possible
Args:
source: Source file path
destination: Destination file path
Returns:
True if migration is possible
"""
if not source.exists():
return False
# Check if destination directory is writable
dest_dir = destination.parent
if dest_dir.exists():
return os.access(dest_dir, os.W_OK)
# Check if parent directory exists and is writable
parent = dest_dir.parent
while not parent.exists() and parent != parent.parent:
parent = parent.parent
return parent.exists() and os.access(parent, os.W_OK)
def estimate_time(self, source: Path) -> float:
"""Estimate migration time in seconds
Args:
source: Source file path
Returns:
Estimated time in seconds
"""
if not source.exists():
return 0.0
size = source.stat().st_size
# Estimate based on typical copy speed (100 MB/s)
typical_speed = 100 * 1024 * 1024 # bytes per second
return size / typical_speed
def cleanup(self, source: Path) -> bool:
"""Cleanup source file after successful migration
Args:
source: Source file path
Returns:
True if cleanup successful
"""
try:
if source.exists():
source.unlink()
return True
except Exception as e:
if self.logger:
self.logger.warning(f"Failed to cleanup {source}: {e}")
return False
class FastCopyStrategy(CopyMigrationStrategy):
"""Fast copy strategy without verification"""
def __init__(self, logger: Optional[ProgressLogger] = None):
"""Initialize fast copy strategy"""
super().__init__(
logger=logger,
preserve_metadata=True,
verify_checksums=False
)
class SafeCopyStrategy(CopyMigrationStrategy):
"""Safe copy strategy with full verification"""
def __init__(self, logger: Optional[ProgressLogger] = None):
"""Initialize safe copy strategy"""
super().__init__(
logger=logger,
preserve_metadata=True,
verify_checksums=True
)
class ReferenceCopyStrategy:
"""Create reference copy using reflinks (CoW) if supported"""
def __init__(self, logger: Optional[ProgressLogger] = None):
"""Initialize reflink copy strategy"""
self.logger = logger
def migrate(
self,
source: Path,
destination: Path,
verify: bool = True
) -> bool:
"""Migrate using reflink (copy-on-write)
Args:
source: Source file path
destination: Destination file path
verify: Whether to verify the operation
Returns:
True if migration successful
"""
if not source.exists():
if self.logger:
self.logger.error(f"Source file does not exist: {source}")
return False
# Create destination directory
destination.parent.mkdir(parents=True, exist_ok=True)
try:
# Try reflink copy (works on btrfs, xfs, etc.)
import subprocess
result = subprocess.run(
['cp', '--reflink=auto', str(source), str(destination)],
capture_output=True,
check=False
)
if result.returncode != 0:
# Fallback to regular copy
shutil.copy2(source, destination)
return True
except Exception as e:
if self.logger:
self.logger.error(f"Reflink copy failed: {source} -> {destination}: {e}")
return False
def can_migrate(self, source: Path, destination: Path) -> bool:
"""Check if migration is possible"""
if not source.exists():
return False
dest_dir = destination.parent
if dest_dir.exists():
return os.access(dest_dir, os.W_OK)
return True
def estimate_time(self, source: Path) -> float:
"""Estimate migration time (reflinks are fast)"""
return 0.1 # Reflinks are nearly instant
def cleanup(self, source: Path) -> bool:
"""Cleanup source file"""
try:
if source.exists():
source.unlink()
return True
except Exception as e:
if self.logger:
self.logger.warning(f"Failed to cleanup {source}: {e}")
return False
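
A short sketch of the copy strategies above; the paths are illustrative and `app` is assumed importable as a package:

```python
# Sketch only: verified copy, then remove the source on success.
from pathlib import Path

from app.migration.copy import FastCopyStrategy, SafeCopyStrategy

src = Path('/mnt/source/reports/2023.pdf')
dst = Path('/mnt/target/documents/2023.pdf')

strategy = SafeCopyStrategy()    # verifies size and content/checksum after the copy
# strategy = FastCopyStrategy()  # skips checksum verification for speed

if strategy.can_migrate(src, dst) and strategy.migrate(src, dst, verify=True):
    strategy.cleanup(src)        # delete the source only after verification passed
```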

454
app/migration/engine.py Normal file
View File

@@ -0,0 +1,454 @@
"""Migration engine"""
from pathlib import Path
from typing import Optional, Callable
from datetime import datetime
import psycopg2
from psycopg2.extras import execute_batch
from .copy import CopyMigrationStrategy, SafeCopyStrategy
from .hardlink import HardlinkMigrationStrategy, SymlinkMigrationStrategy
from ..shared.models import OperationRecord, ProcessingStats, MigrationPlan
from ..shared.config import DatabaseConfig, ProcessingConfig
from ..shared.logger import ProgressLogger
class MigrationEngine:
"""Engine for migrating files"""
def __init__(
self,
db_config: DatabaseConfig,
processing_config: ProcessingConfig,
logger: ProgressLogger,
target_base: Path
):
"""Initialize migration engine
Args:
db_config: Database configuration
processing_config: Processing configuration
logger: Progress logger
target_base: Target base directory for migrations
"""
self.db_config = db_config
self.processing_config = processing_config
self.logger = logger
self.target_base = Path(target_base)
self._connection = None
# Initialize strategies
self.copy_strategy = SafeCopyStrategy(logger=logger)
self.hardlink_strategy = HardlinkMigrationStrategy(logger=logger)
self.symlink_strategy = SymlinkMigrationStrategy(logger=logger)
def _get_connection(self):
"""Get or create database connection"""
if self._connection is None or self._connection.closed:
self._connection = psycopg2.connect(
host=self.db_config.host,
port=self.db_config.port,
database=self.db_config.database,
user=self.db_config.user,
password=self.db_config.password
)
return self._connection
def _ensure_tables(self):
"""Ensure migration tables exist"""
conn = self._get_connection()
cursor = conn.cursor()
# Create operations table
cursor.execute("""
CREATE TABLE IF NOT EXISTS operations (
id SERIAL PRIMARY KEY,
source_path TEXT NOT NULL,
target_path TEXT NOT NULL,
operation_type TEXT NOT NULL,
size BIGINT DEFAULT 0,
status TEXT DEFAULT 'pending',
error TEXT,
executed_at TIMESTAMP,
verified BOOLEAN DEFAULT FALSE,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
# Create index on status
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_operations_status
ON operations(status)
""")
conn.commit()
cursor.close()
def plan_migration(
self,
disk: Optional[str] = None,
category: Optional[str] = None
) -> MigrationPlan:
"""Plan migration for files
Args:
disk: Optional disk filter
category: Optional category filter
Returns:
MigrationPlan with planned operations
"""
self.logger.section("Planning Migration")
conn = self._get_connection()
cursor = conn.cursor()
# Build query
conditions = ["category IS NOT NULL"]
params = []
if disk:
conditions.append("disk_label = %s")
params.append(disk)
if category:
conditions.append("category = %s")
params.append(category)
query = f"""
SELECT path, size, category, duplicate_of
FROM files
WHERE {' AND '.join(conditions)}
ORDER BY category, path
"""
cursor.execute(query, params)
files = cursor.fetchall()
self.logger.info(f"Found {len(files)} files to migrate")
operations = []
total_size = 0
for path_str, size, file_category, duplicate_of in files:
source = Path(path_str)
# Determine destination
target_path = self.target_base / file_category / source.name
# Determine operation type
if duplicate_of:
# Use hardlink for duplicates
operation_type = 'hardlink'
else:
# Use copy for unique files
operation_type = 'copy'
operation = OperationRecord(
source_path=source,
target_path=target_path,
operation_type=operation_type,
size=size
)
operations.append(operation)
total_size += size
cursor.close()
plan = MigrationPlan(
target_disk=str(self.target_base),
destination_disks=[str(self.target_base)],
operations=operations,
total_size=total_size,
file_count=len(operations)
)
self.logger.info(
f"Migration plan created: {plan.file_count} files, "
f"{plan.total_size:,} bytes"
)
return plan
def execute_migration(
self,
operations: list[OperationRecord],
dry_run: bool = False,
progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
) -> ProcessingStats:
"""Execute migration operations
Args:
operations: List of operations to execute
dry_run: Whether to perform a dry run
progress_callback: Optional callback for progress updates
Returns:
ProcessingStats with execution statistics
"""
self.logger.section("Executing Migration" + (" (DRY RUN)" if dry_run else ""))
self._ensure_tables()
stats = ProcessingStats()
total_ops = len(operations)
for operation in operations:
stats.files_processed += 1
if dry_run:
# In dry run, just log what would happen
self.logger.debug(
f"[DRY RUN] Would {operation.operation_type}: "
f"{operation.source_path} -> {operation.target_path}"
)
stats.files_succeeded += 1
else:
# Execute actual migration
success = self._execute_operation(operation)
if success:
stats.files_succeeded += 1
stats.bytes_processed += operation.size
else:
stats.files_failed += 1
# Progress callback
if progress_callback and stats.files_processed % 100 == 0:
progress_callback(stats.files_processed, total_ops, stats)
# Log progress
if stats.files_processed % 1000 == 0:
self.logger.progress(
stats.files_processed,
total_ops,
prefix="Operations executed",
bytes_processed=stats.bytes_processed,
elapsed_seconds=stats.elapsed_seconds
)
self.logger.info(
f"Migration {'dry run' if dry_run else 'execution'} complete: "
f"{stats.files_succeeded}/{total_ops} operations, "
f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
)
return stats
def _execute_operation(self, operation: OperationRecord) -> bool:
"""Execute a single migration operation
Args:
operation: Operation to execute
Returns:
True if successful
"""
operation.status = 'in_progress'
operation.executed_at = datetime.now()
try:
# Select strategy based on operation type
if operation.operation_type == 'copy':
strategy = self.copy_strategy
elif operation.operation_type == 'hardlink':
strategy = self.hardlink_strategy
elif operation.operation_type == 'symlink':
strategy = self.symlink_strategy
else:
raise ValueError(f"Unknown operation type: {operation.operation_type}")
# Execute migration
success = strategy.migrate(
operation.source_path,
operation.target_path,
verify=self.processing_config.verify_operations
)
if success:
operation.status = 'completed'
operation.verified = True
self._record_operation(operation)
return True
else:
operation.status = 'failed'
operation.error = "Migration failed"
self._record_operation(operation)
return False
except Exception as e:
operation.status = 'failed'
operation.error = str(e)
self._record_operation(operation)
self.logger.error(f"Operation failed: {operation.source_path}: {e}")
return False
def _record_operation(self, operation: OperationRecord):
"""Record operation in database
Args:
operation: Operation to record
"""
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute("""
INSERT INTO operations (
source_path, target_path, operation_type, size,
status, error, executed_at, verified
)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
""", (
str(operation.source_path),
str(operation.target_path),
operation.operation_type,
operation.size,
operation.status,
operation.error,
operation.executed_at,
operation.verified
))
conn.commit()
cursor.close()
def rollback(self, operation: OperationRecord) -> bool:
"""Rollback a migration operation
Args:
operation: Operation to rollback
Returns:
True if rollback successful
"""
self.logger.warning(f"Rolling back: {operation.target_path}")
try:
# Remove destination
if operation.target_path.exists():
operation.target_path.unlink()
# Update database
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute("""
UPDATE operations
SET status = 'rolled_back'
WHERE source_path = %s AND target_path = %s
""", (str(operation.source_path), str(operation.target_path)))
conn.commit()
cursor.close()
return True
except Exception as e:
self.logger.error(f"Rollback failed: {operation.target_path}: {e}")
return False
def get_migration_stats(self) -> dict:
"""Get migration statistics
Returns:
Dictionary with statistics
"""
conn = self._get_connection()
cursor = conn.cursor()
stats = {}
# Total operations
cursor.execute("SELECT COUNT(*) FROM operations")
stats['total_operations'] = cursor.fetchone()[0]
# Operations by status
cursor.execute("""
SELECT status, COUNT(*)
FROM operations
GROUP BY status
""")
for status, count in cursor.fetchall():
stats[f'{status}_operations'] = count
# Total size migrated
cursor.execute("""
SELECT COALESCE(SUM(size), 0)
FROM operations
WHERE status = 'completed'
""")
stats['total_size_migrated'] = cursor.fetchone()[0]
cursor.close()
return stats
def verify_migrations(self) -> dict:
"""Verify completed migrations
Returns:
Dictionary with verification results
"""
self.logger.subsection("Verifying Migrations")
conn = self._get_connection()
cursor = conn.cursor()
cursor.execute("""
SELECT source_path, target_path, operation_type
FROM operations
WHERE status = 'completed' AND verified = FALSE
""")
operations = cursor.fetchall()
cursor.close()
results = {
'total': len(operations),
'verified': 0,
'failed': 0
}
for source_str, dest_str, op_type in operations:
source = Path(source_str)
dest = Path(dest_str)
# Verify destination exists
if not dest.exists():
results['failed'] += 1
self.logger.warning(f"Verification failed: {dest} does not exist")
continue
# Verify based on operation type
if op_type == 'hardlink':
# Check if hardlinked
if source.exists() and source.stat().st_ino == dest.stat().st_ino:
results['verified'] += 1
else:
results['failed'] += 1
else:
# Destination existence was checked above; compare sizes when the source is still present
if not source.exists() or source.stat().st_size == dest.stat().st_size:
results['verified'] += 1
else:
results['failed'] += 1
self.logger.info(
f"Verification complete: {results['verified']}/{results['total']} verified"
)
return results
def close(self):
"""Close database connection"""
if self._connection and not self._connection.closed:
self._connection.close()
def __enter__(self):
"""Context manager entry"""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit"""
self.close()
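
A dry-run sketch of the engine above, assuming the PostgreSQL instance from the configuration is reachable and the `files` table was populated by an earlier `index` run; the disk and category filters are examples:

```python
# Sketch only: plan a migration for one disk and simulate it without touching files.
from pathlib import Path

from app.migration.engine import MigrationEngine
from app.shared.config import load_config
from app.shared.logger import create_logger

config = load_config()
logger = create_logger(level=config.logging.level)

with MigrationEngine(
    db_config=config.database,
    processing_config=config.processing,
    logger=logger,
    target_base=Path('/mnt/organized'),
) as engine:
    plan = engine.plan_migration(disk='SMT', category='documents')
    stats = engine.execute_migration(plan.operations, dry_run=True)
    logger.info(f'{stats.files_succeeded}/{plan.file_count} operations would succeed')
```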

377
app/migration/hardlink.py Normal file
View File

@@ -0,0 +1,377 @@
"""Hardlink-based migration strategy"""
import os
from pathlib import Path
from typing import Optional
from ..shared.logger import ProgressLogger
class HardlinkMigrationStrategy:
"""Create hardlinks to files instead of copying"""
def __init__(self, logger: Optional[ProgressLogger] = None):
"""Initialize hardlink migration strategy
Args:
logger: Optional progress logger
"""
self.logger = logger
def migrate(
self,
source: Path,
destination: Path,
verify: bool = True
) -> bool:
"""Migrate file by creating hardlink
Args:
source: Source file path
destination: Destination file path
verify: Whether to verify the operation
Returns:
True if migration successful
"""
if not source.exists():
if self.logger:
self.logger.error(f"Source file does not exist: {source}")
return False
# Check if source and destination are on same filesystem
if not self._same_filesystem(source, destination.parent):
if self.logger:
self.logger.warning(
f"Cannot hardlink across filesystems: {source} -> {destination}"
)
return False
# Create destination directory
destination.parent.mkdir(parents=True, exist_ok=True)
try:
# Create hardlink
os.link(source, destination)
# Verify if requested
if verify:
if not self._verify_hardlink(source, destination):
if self.logger:
self.logger.error(f"Verification failed: {source} -> {destination}")
destination.unlink()
return False
return True
except FileExistsError:
if self.logger:
self.logger.warning(f"Destination already exists: {destination}")
return False
except Exception as e:
if self.logger:
self.logger.error(f"Hardlink failed: {source} -> {destination}: {e}")
return False
def _same_filesystem(self, path1: Path, path2: Path) -> bool:
"""Check if two paths are on the same filesystem
Args:
path1: First path
path2: Second path
Returns:
True if on same filesystem
"""
try:
# Get device IDs
stat1 = path1.stat()
stat2 = path2.stat()
return stat1.st_dev == stat2.st_dev
except Exception:
return False
def _verify_hardlink(self, source: Path, destination: Path) -> bool:
"""Verify hardlink
Args:
source: Source file path
destination: Destination file path
Returns:
True if verification successful
"""
try:
# Check if they have the same inode
source_stat = source.stat()
dest_stat = destination.stat()
return source_stat.st_ino == dest_stat.st_ino
except Exception:
return False
def can_migrate(self, source: Path, destination: Path) -> bool:
"""Check if migration is possible
Args:
source: Source file path
destination: Destination file path
Returns:
True if migration is possible
"""
if not source.exists():
return False
# Check if on same filesystem
dest_dir = destination.parent
if dest_dir.exists():
return self._same_filesystem(source, dest_dir)
# Check parent directories
parent = dest_dir.parent
while not parent.exists() and parent != parent.parent:
parent = parent.parent
return parent.exists() and self._same_filesystem(source, parent)
def estimate_time(self, source: Path) -> float:
"""Estimate migration time in seconds
Args:
source: Source file path
Returns:
Estimated time in seconds (hardlinks are instant)
"""
return 0.01 # Hardlinks are nearly instant
def cleanup(self, source: Path) -> bool:
"""Cleanup source file after successful migration
Note: For hardlinks, we typically don't remove the source
immediately as both links point to the same inode.
Args:
source: Source file path
Returns:
True (no cleanup needed for hardlinks)
"""
# For hardlinks, we don't remove the source
# Both source and destination point to the same data
return True
class SymlinkMigrationStrategy:
"""Create symbolic links to files"""
def __init__(
self,
logger: Optional[ProgressLogger] = None,
absolute_links: bool = True
):
"""Initialize symlink migration strategy
Args:
logger: Optional progress logger
absolute_links: Whether to create absolute symlinks
"""
self.logger = logger
self.absolute_links = absolute_links
def migrate(
self,
source: Path,
destination: Path,
verify: bool = True
) -> bool:
"""Migrate file by creating symlink
Args:
source: Source file path
destination: Destination file path
verify: Whether to verify the operation
Returns:
True if migration successful
"""
if not source.exists():
if self.logger:
self.logger.error(f"Source file does not exist: {source}")
return False
# Create destination directory
destination.parent.mkdir(parents=True, exist_ok=True)
try:
# Determine link target
if self.absolute_links:
target = source.resolve()
else:
# Create relative symlink
target = os.path.relpath(source, destination.parent)
# Create symlink
destination.symlink_to(target)
# Verify if requested
if verify:
if not self._verify_symlink(destination, source):
if self.logger:
self.logger.error(f"Verification failed: {source} -> {destination}")
destination.unlink()
return False
return True
except FileExistsError:
if self.logger:
self.logger.warning(f"Destination already exists: {destination}")
return False
except Exception as e:
if self.logger:
self.logger.error(f"Symlink failed: {source} -> {destination}: {e}")
return False
def _verify_symlink(self, symlink: Path, expected_target: Path) -> bool:
"""Verify symlink
Args:
symlink: Symlink path
expected_target: Expected target path
Returns:
True if verification successful
"""
try:
# Check if it's a symlink
if not symlink.is_symlink():
return False
# Resolve and compare
resolved = symlink.resolve()
expected = expected_target.resolve()
return resolved == expected
except Exception:
return False
def can_migrate(self, source: Path, destination: Path) -> bool:
"""Check if migration is possible
Args:
source: Source file path
destination: Destination file path
Returns:
True if migration is possible
"""
if not source.exists():
return False
# Check if destination directory is writable
dest_dir = destination.parent
if dest_dir.exists():
return os.access(dest_dir, os.W_OK)
return True
def estimate_time(self, source: Path) -> float:
"""Estimate migration time in seconds
Args:
source: Source file path
Returns:
Estimated time in seconds (symlinks are instant)
"""
return 0.01 # Symlinks are instant
def cleanup(self, source: Path) -> bool:
"""Cleanup source file after successful migration
Note: For symlinks, we don't remove the source as the
symlink points to it.
Args:
source: Source file path
Returns:
True (no cleanup needed for symlinks)
"""
# For symlinks, we don't remove the source
return True
class DedupHardlinkStrategy(HardlinkMigrationStrategy):
"""Hardlink strategy for deduplication
Creates hardlinks for duplicate files to save space.
"""
def __init__(self, logger: Optional[ProgressLogger] = None):
"""Initialize dedup hardlink strategy"""
super().__init__(logger=logger)
def deduplicate(
self,
canonical: Path,
duplicate: Path
) -> bool:
"""Replace duplicate with hardlink to canonical
Args:
canonical: Canonical file path
duplicate: Duplicate file path
Returns:
True if deduplication successful
"""
if not canonical.exists():
if self.logger:
self.logger.error(f"Canonical file does not exist: {canonical}")
return False
if not duplicate.exists():
if self.logger:
self.logger.error(f"Duplicate file does not exist: {duplicate}")
return False
# Check if already hardlinked
if self._verify_hardlink(canonical, duplicate):
return True
# Check if on same filesystem
if not self._same_filesystem(canonical, duplicate):
if self.logger:
self.logger.warning(
f"Cannot hardlink across filesystems: {canonical} -> {duplicate}"
)
return False
# Compute the temporary backup path before the try block so the
# exception handler can always reference it
backup = duplicate.with_suffix(duplicate.suffix + '.bak')
try:
duplicate.rename(backup)
# Create hardlink
os.link(canonical, duplicate)
# Remove backup
backup.unlink()
return True
except Exception as e:
if self.logger:
self.logger.error(f"Deduplication failed: {duplicate}: {e}")
# Restore from backup only if the duplicate was already moved aside
if backup.exists() and not duplicate.exists():
backup.rename(duplicate)
return False
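
A sketch of the deduplication path above; both files must live on the same filesystem, and the paths are illustrative:

```python
# Sketch only: replace a duplicate with a hardlink to its canonical copy.
from pathlib import Path

from app.migration.hardlink import DedupHardlinkStrategy

canonical = Path('/mnt/data/photos/2021/IMG_0001.jpg')
duplicate = Path('/mnt/data/backup-old/IMG_0001.jpg')

if DedupHardlinkStrategy().deduplicate(canonical, duplicate):
    # Both names now reference the same inode, so the content is stored once.
    assert canonical.stat().st_ino == duplicate.stat().st_ino
```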

View File

@@ -0,0 +1,44 @@
from pathlib import Path
from typing import Dict
import re
class CodeParser:
def __init__(self):
self.patterns = {
'python': {'imports': r'^import |^from .+ import', 'class': r'^class \w+', 'function': r'^def \w+'},
'javascript': {'imports': r'^import |^require\(', 'class': r'^class \w+', 'function': r'^function \w+|^const \w+ = '},
'java': {'package': r'^package ', 'imports': r'^import ', 'class': r'^public class \w+'},
'go': {'package': r'^package ', 'imports': r'^import ', 'function': r'^func \w+'}
}
def parse(self, file_path: Path) -> Dict:
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
text = f.read()
language = self._detect_language(file_path, text)
structure = self._extract_structure(text, language)
return {
'text': text,
'language': language,
'line_count': len(text.split('\n')),
'structure': structure,
'quality': 'high'
}
except Exception as e:
return {'error': str(e)}
def _detect_language(self, file_path: Path, text: str) -> str:
lang_map = {'.py': 'python', '.js': 'javascript', '.ts': 'typescript', '.java': 'java', '.go': 'go'}
return lang_map.get(file_path.suffix.lower(), 'unknown')
def _extract_structure(self, text: str, language: str) -> Dict:
patterns = self.patterns.get(language, {})
structure = {'type': 'code', 'language': language}
for key, pattern in patterns.items():
matches = re.findall(pattern, text, re.MULTILINE)
structure[key] = len(matches)
return structure
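
A sketch of calling the parser above; the `app.parsers.code_parser` module path is an assumption, since the file header is not shown for this hunk:

```python
# Sketch only: parse a Python source file and inspect the detected structure.
from pathlib import Path

# Assumed module path for the CodeParser class defined above.
from app.parsers.code_parser import CodeParser

result = CodeParser().parse(Path('app/shared/config.py'))
if 'error' not in result:
    print(result['language'], result['line_count'], result['structure'])
```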

View File

@@ -0,0 +1,42 @@
from pathlib import Path
from typing import Dict
class MediaParser:
def parse_audio(self, file_path: Path) -> Dict:
return {
'text': '[Audio transcription pending]',
'needs_transcription': True,
'transcription_service': 'whisper',
'structure': {'type': 'audio'},
'quality': 'pending'
}
def parse_video(self, file_path: Path) -> Dict:
return {
'text': '[Video transcription pending]',
'needs_transcription': True,
'needs_scene_detection': True,
'transcription_service': 'whisper',
'structure': {'type': 'video'},
'quality': 'pending'
}
def parse_image(self, file_path: Path) -> Dict:
try:
from PIL import Image
with Image.open(file_path) as img:
width, height = img.size
mode = img.mode
return {
'text': '[Image caption/OCR pending]',
'needs_ocr': True,
'needs_caption': True,
'dimensions': f'{width}x{height}',
'mode': mode,
'structure': {'type': 'image', 'width': width, 'height': height},
'quality': 'pending'
}
except Exception as e:
return {'error': str(e)}

31
app/parsers/pdf_parser.py Normal file
View File

@@ -0,0 +1,31 @@
from pathlib import Path
from typing import Dict, List
class PDFParser:
def parse(self, file_path: Path) -> Dict:
try:
import PyPDF2
pages = []
with open(file_path, 'rb') as f:
pdf = PyPDF2.PdfReader(f)
page_count = len(pdf.pages)
for i, page in enumerate(pdf.pages[:50]):
text = page.extract_text()
pages.append({'page': i + 1, 'text': text, 'char_count': len(text)})
full_text = '\n\n'.join([p['text'] for p in pages])
has_text_layer = sum(p['char_count'] for p in pages) > 100
return {
'text': full_text,
'page_count': page_count,
'pages_extracted': len(pages),
'has_text_layer': has_text_layer,
'needs_ocr': not has_text_layer,
'structure': {'type': 'document', 'pages': pages[:5]},
'quality': 'high' if has_text_layer else 'needs_ocr'
}
except Exception as e:
return {'error': str(e), 'needs_ocr': True}
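
A sketch of the PDF path above; PyPDF2 must be installed, and the file path is illustrative:

```python
# Sketch only: extract text from a PDF and decide whether it needs OCR.
from pathlib import Path

from app.parsers.pdf_parser import PDFParser

result = PDFParser().parse(Path('/mnt/source/manuals/router.pdf'))
if result.get('needs_ocr'):
    print('No usable text layer; queue for OCR')
else:
    print(f"Extracted {result['pages_extracted']} of {result['page_count']} pages")
```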

View File

@@ -0,0 +1,26 @@
from pathlib import Path
from typing import Dict, Optional
import chardet
class TextParser:
def parse(self, file_path: Path) -> Dict:
try:
with open(file_path, 'rb') as f:
raw_data = f.read(1024 * 1024)
encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
text = raw_data.decode(encoding, errors='ignore')
lines = text.split('\n')
return {
'text': text,
'encoding': encoding,
'line_count': len(lines),
'char_count': len(text),
'word_count': len(text.split()),
'structure': {'type': 'plain_text'},
'quality': 'high' if encoding == 'utf-8' else 'medium'
}
except Exception as e:
return {'error': str(e)}

51
app/setup.py Normal file
View File

@@ -0,0 +1,51 @@
#!/usr/bin/env python3
"""Setup script for defrag disk reorganizer"""
from setuptools import setup, find_packages
from pathlib import Path
# Read requirements
requirements_path = Path(__file__).parent / 'requirements.txt'
with open(requirements_path) as f:
requirements = [
line.strip()
for line in f
if line.strip() and not line.startswith('#')
]
# Read long description from README
readme_path = Path(__file__).parent / 'README.md'
long_description = ""
if readme_path.exists():
with open(readme_path) as f:
long_description = f.read()
setup(
name='defrag',
version='1.0.0',
description='Intelligent disk reorganization system for 20TB+ data with deduplication and classification',
long_description=long_description,
long_description_content_type='text/markdown',
author='Project Defrag',
author_email='defrag@example.com',
url='https://github.com/yourusername/defrag',
packages=find_packages(),
install_requires=requirements,
python_requires='>=3.9',
entry_points={
'console_scripts': [
'defrag=main:main',
],
},
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: System Administrators',
'Topic :: System :: Filesystems',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
'Programming Language :: Python :: 3.12',
],
keywords='disk management storage deduplication classification migration',
)

50
app/shared/__init__.py Normal file
View File

@@ -0,0 +1,50 @@
"""Shared package exports"""
from .models import (
FileRecord,
OperationRecord,
DiskInfo,
MigrationPlan,
ProcessingStats
)
from .config import (
Config,
DatabaseConfig,
ProcessingConfig,
LoggingConfig,
load_config
)
from .logger import (
ProgressLogger,
create_logger,
format_size,
format_rate,
format_time
)
from ._protocols import IDatabase, ILogger
__all__ = [
# Models
'FileRecord',
'OperationRecord',
'DiskInfo',
'MigrationPlan',
'ProcessingStats',
# Config
'Config',
'DatabaseConfig',
'ProcessingConfig',
'LoggingConfig',
'load_config',
# Logger
'ProgressLogger',
'create_logger',
'format_size',
'format_rate',
'format_time',
# Protocols
'IDatabase',
'ILogger',
]

67
app/shared/_protocols.py Normal file
View File

@@ -0,0 +1,67 @@
"""Protocol definitions for the shared package"""
from __future__ import annotations
from typing import Protocol, Any
from pathlib import Path
from dataclasses import dataclass
from datetime import datetime
@dataclass
class FileRecord:
"""Core file record with all metadata"""
path: Path
size: int
modified_time: float
created_time: float
disk_label: str
checksum: str | None = None
status: str = 'indexed' # indexed, planned, moved, verified
category: str | None = None
duplicate_of: str | None = None
@dataclass
class OperationRecord:
"""Record of a migration operation"""
source_path: Path
target_path: Path
operation_type: str # move, copy, hardlink, symlink
status: str = 'pending' # pending, in_progress, completed, failed
error: str | None = None
executed_at: datetime | None = None
verified: bool = False
class IDatabase(Protocol):
"""Protocol for database operations"""
def store_file(self, file_record: FileRecord) -> None:
"""Store a file record"""
...
def get_files_by_disk(self, disk: str) -> list[FileRecord]:
"""Get all files on a specific disk"""
...
def store_operation(self, operation: OperationRecord) -> None:
"""Store an operation record"""
...
def get_pending_operations(self) -> list[OperationRecord]:
"""Get all pending operations"""
...
class ILogger(Protocol):
"""Protocol for logging operations"""
def info(self, message: str) -> None:
...
def warning(self, message: str) -> None:
...
def error(self, message: str) -> None:
...
def debug(self, message: str) -> None:
...

110
app/shared/config.py Normal file
View File

@@ -0,0 +1,110 @@
"""Configuration management for disk reorganizer"""
import json
from pathlib import Path
from dataclasses import dataclass, asdict
from typing import Optional
@dataclass
class DatabaseConfig:
"""Database connection configuration"""
host: str = '192.168.1.159'
port: int = 5432
database: str = 'disk_reorganizer_db'
user: str = 'disk_reorg_user'
password: str = 'heel-goed-wachtwoord'
def to_dict(self) -> dict:
"""Convert to dictionary"""
return asdict(self)
@dataclass
class ProcessingConfig:
"""Processing behavior configuration"""
batch_size: int = 1000
commit_interval: int = 100
parallel_workers: int = 4
chunk_size: int = 8192
hash_algorithm: str = 'sha256'
verify_operations: bool = True
preserve_timestamps: bool = True
def to_dict(self) -> dict:
"""Convert to dictionary"""
return asdict(self)
@dataclass
class LoggingConfig:
"""Logging configuration"""
level: str = 'INFO'
log_file: str = 'disk_reorganizer.log'
console_output: bool = True
file_output: bool = True
def to_dict(self) -> dict:
"""Convert to dictionary"""
return asdict(self)
@dataclass
class Config:
"""Main configuration container"""
database: Optional[DatabaseConfig] = None
processing: Optional[ProcessingConfig] = None
logging: Optional[LoggingConfig] = None
def __post_init__(self):
"""Initialize nested configs with defaults if not provided"""
if self.database is None:
self.database = DatabaseConfig()
if self.processing is None:
self.processing = ProcessingConfig()
if self.logging is None:
self.logging = LoggingConfig()
@classmethod
def from_file(cls, config_path: Path) -> 'Config':
"""Load configuration from JSON file"""
if not config_path.exists():
return cls()
with open(config_path, 'r') as f:
data = json.load(f)
return cls(
database=DatabaseConfig(**data.get('database', {})),
processing=ProcessingConfig(**data.get('processing', {})),
logging=LoggingConfig(**data.get('logging', {}))
)
def to_file(self, config_path: Path) -> None:
"""Save configuration to JSON file"""
data = {
'database': self.database.to_dict(),
'processing': self.processing.to_dict(),
'logging': self.logging.to_dict()
}
with open(config_path, 'w') as f:
json.dump(data, f, indent=2)
def to_dict(self) -> dict:
"""Convert to dictionary"""
return {
'database': self.database.to_dict(),
'processing': self.processing.to_dict(),
'logging': self.logging.to_dict()
}
def load_config(config_path: Optional[Path] = None) -> Config:
"""Load configuration from file or return default"""
if config_path is None:
config_path = Path('config.json')
if config_path.exists():
return Config.from_file(config_path)
return Config()
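
A sketch of the round trip above: write a `config.json` once, then load it on later runs:

```python
# Sketch only: persist overrides to config.json and read them back with load_config().
from pathlib import Path

from app.shared.config import Config, DatabaseConfig, load_config

cfg = Config(database=DatabaseConfig(host='localhost', password='change-me'))
cfg.to_file(Path('config.json'))        # nested defaults are filled in by __post_init__

cfg = load_config(Path('config.json'))  # falls back to defaults if the file is missing
print(cfg.database.host, cfg.processing.batch_size)
```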

217
app/shared/logger.py Normal file
View File

@@ -0,0 +1,217 @@
"""Dynamic progress logger with formatting utilities"""
import sys
import logging
from typing import Optional
from datetime import datetime
from pathlib import Path
def format_size(bytes_size: int) -> str:
"""Format bytes to human-readable size string
Args:
bytes_size: Size in bytes
Returns:
Human-readable size string (e.g., "1.5 GB", "234.5 MB")
"""
for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']:
if bytes_size < 1024.0:
return f"{bytes_size:.1f} {unit}"
bytes_size /= 1024.0
return f"{bytes_size:.1f} EB"
def format_rate(bytes_per_second: float) -> str:
"""Format transfer rate to human-readable string
Args:
bytes_per_second: Transfer rate in bytes per second
Returns:
Human-readable rate string (e.g., "125.3 MB/s")
"""
return f"{format_size(int(bytes_per_second))}/s"
def format_time(seconds: float) -> str:
"""Format seconds to human-readable time string
Args:
seconds: Time in seconds
Returns:
Human-readable time string (e.g., "2h 34m 12s", "45m 23s", "12s")
"""
if seconds < 60:
return f"{int(seconds)}s"
elif seconds < 3600:
minutes = int(seconds // 60)
secs = int(seconds % 60)
return f"{minutes}m {secs}s"
else:
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = int(seconds % 60)
return f"{hours}h {minutes}m {secs}s"
class ProgressLogger:
"""Dynamic progress logger with real-time statistics"""
def __init__(
self,
name: str = "defrag",
level: int = logging.INFO,
log_file: Optional[Path] = None,
console_output: bool = True
):
"""Initialize progress logger
Args:
name: Logger name
level: Logging level
log_file: Optional log file path
console_output: Whether to output to console
"""
self.logger = logging.getLogger(name)
self.logger.setLevel(level)
self.logger.handlers.clear()
# Create formatter
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
# Add console handler
if console_output:
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(level)
console_handler.setFormatter(formatter)
self.logger.addHandler(console_handler)
# Add file handler
if log_file:
log_file.parent.mkdir(parents=True, exist_ok=True)
file_handler = logging.FileHandler(log_file)
file_handler.setLevel(level)
file_handler.setFormatter(formatter)
self.logger.addHandler(file_handler)
self._last_progress_line = ""
def info(self, message: str) -> None:
"""Log info message"""
self.logger.info(message)
def warning(self, message: str) -> None:
"""Log warning message"""
self.logger.warning(message)
def error(self, message: str) -> None:
"""Log error message"""
self.logger.error(message)
def debug(self, message: str) -> None:
"""Log debug message"""
self.logger.debug(message)
def critical(self, message: str) -> None:
"""Log critical message"""
self.logger.critical(message)
def progress(
self,
current: int,
total: int,
prefix: str = "",
suffix: str = "",
bytes_processed: Optional[int] = None,
elapsed_seconds: Optional[float] = None
) -> None:
"""Log progress with dynamic statistics
Args:
current: Current progress count
total: Total count
prefix: Prefix message
suffix: Suffix message
bytes_processed: Optional bytes processed for rate calculation
elapsed_seconds: Optional elapsed time for rate calculation
"""
if total == 0:
percent = 0.0
else:
percent = (current / total) * 100
progress_msg = f"{prefix} [{current}/{total}] {percent:.1f}%"
if bytes_processed is not None and elapsed_seconds is not None and elapsed_seconds > 0:
rate = bytes_processed / elapsed_seconds
progress_msg += f" | {format_size(bytes_processed)} @ {format_rate(rate)}"
# Estimate time remaining
if current > 0:
estimated_total_seconds = (elapsed_seconds / current) * total
remaining_seconds = estimated_total_seconds - elapsed_seconds
progress_msg += f" | ETA: {format_time(remaining_seconds)}"
if suffix:
progress_msg += f" | {suffix}"
self.info(progress_msg)
def section(self, title: str) -> None:
"""Log section header
Args:
title: Section title
"""
separator = "=" * 60
self.info(separator)
self.info(f" {title}")
self.info(separator)
def subsection(self, title: str) -> None:
"""Log subsection header
Args:
title: Subsection title
"""
self.info(f"\n--- {title} ---")
def create_logger(
name: str = "defrag",
level: str = "INFO",
log_file: Optional[Path] = None,
console_output: bool = True
) -> ProgressLogger:
"""Create and configure a progress logger
Args:
name: Logger name
level: Logging level as string
log_file: Optional log file path
console_output: Whether to output to console
Returns:
Configured ProgressLogger instance
"""
level_map = {
'DEBUG': logging.DEBUG,
'INFO': logging.INFO,
'WARNING': logging.WARNING,
'ERROR': logging.ERROR,
'CRITICAL': logging.CRITICAL
}
log_level = level_map.get(level.upper(), logging.INFO)
return ProgressLogger(
name=name,
level=log_level,
log_file=log_file,
console_output=console_output
)
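
A sketch of the progress helper above; the loop and byte counts are synthetic:

```python
# Sketch only: periodic progress lines with throughput and ETA.
import time
from pathlib import Path

from app.shared.logger import create_logger

logger = create_logger(level='INFO', log_file=Path('logs/demo.log'))
logger.section('Demo run')

start = time.time()
total, processed_bytes = 1000, 0
for done in range(1, total + 1):
    processed_bytes += 4096          # pretend each file is 4 KiB
    if done % 250 == 0:
        logger.progress(done, total, prefix='Files',
                        bytes_processed=processed_bytes,
                        elapsed_seconds=time.time() - start)
```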

127
app/shared/models.py Normal file
View File

@@ -0,0 +1,127 @@
"""Data models for the disk reorganizer"""
from dataclasses import dataclass, field
from pathlib import Path
from datetime import datetime
from typing import Optional
@dataclass
class FileRecord:
"""Core file record with all metadata"""
path: Path
size: int
modified_time: float
created_time: float
disk_label: str
checksum: Optional[str] = None
status: str = 'indexed' # indexed, planned, moved, verified
category: Optional[str] = None
duplicate_of: Optional[str] = None
def to_dict(self) -> dict:
"""Convert to dictionary for serialization"""
return {
'path': str(self.path),
'size': self.size,
'modified_time': self.modified_time,
'created_time': self.created_time,
'disk_label': self.disk_label,
'checksum': self.checksum,
'status': self.status,
'category': self.category,
'duplicate_of': self.duplicate_of
}
@dataclass
class OperationRecord:
"""Record of a migration operation"""
source_path: Path
target_path: Path
operation_type: str # move, copy, hardlink, symlink
size: int = 0
status: str = 'pending' # pending, in_progress, completed, failed
error: Optional[str] = None
executed_at: Optional[datetime] = None
verified: bool = False
def to_dict(self) -> dict:
"""Convert to dictionary for serialization"""
return {
'source_path': str(self.source_path),
'target_path': str(self.target_path),
'operation_type': self.operation_type,
'size': self.size,
'status': self.status,
'error': self.error,
'executed_at': self.executed_at.isoformat() if self.executed_at else None,
'verified': self.verified
}
@dataclass
class DiskInfo:
"""Information about a disk/volume"""
name: str
device: str
mount_point: Path
total_size: int
used_size: int
free_size: int
fs_type: str
@property
def usage_percent(self) -> float:
"""Calculate usage percentage"""
if self.total_size == 0:
return 0.0
return (self.used_size / self.total_size) * 100
@dataclass
class MigrationPlan:
"""Complete migration plan"""
target_disk: str
destination_disks: list[str]
operations: list[OperationRecord]
total_size: int
file_count: int
created_at: datetime = field(default_factory=datetime.now)
def to_dict(self) -> dict:
"""Convert to dictionary for serialization"""
return {
'target_disk': self.target_disk,
'destination_disks': self.destination_disks,
'operations': [op.to_dict() for op in self.operations],
'total_size': self.total_size,
'file_count': self.file_count,
'created_at': self.created_at.isoformat()
}
@dataclass
class ProcessingStats:
"""Statistics for processing operations"""
files_processed: int = 0
bytes_processed: int = 0
files_succeeded: int = 0
files_failed: int = 0
start_time: datetime = field(default_factory=datetime.now)
@property
def elapsed_seconds(self) -> float:
"""Calculate elapsed time in seconds"""
return (datetime.now() - self.start_time).total_seconds()
@property
def files_per_second(self) -> float:
"""Calculate processing rate"""
elapsed = self.elapsed_seconds
return self.files_processed / elapsed if elapsed > 0 else 0.0
@property
def bytes_per_second(self) -> float:
"""Calculate throughput"""
elapsed = self.elapsed_seconds
return self.bytes_processed / elapsed if elapsed > 0 else 0.0
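
A sketch tying the models above together; the paths and sizes are illustrative:

```python
# Sketch only: build a one-operation plan and track execution statistics.
from pathlib import Path

from app.shared.models import MigrationPlan, OperationRecord, ProcessingStats

ops = [OperationRecord(source_path=Path('/mnt/d/a.bin'),
                       target_path=Path('/mnt/e/a.bin'),
                       operation_type='copy',
                       size=1_048_576)]
plan = MigrationPlan(target_disk='D', destination_disks=['E'], operations=ops,
                     total_size=sum(o.size for o in ops), file_count=len(ops))

stats = ProcessingStats()
stats.files_processed += 1
stats.bytes_processed += ops[0].size
print(plan.to_dict()['file_count'], f'{stats.bytes_per_second:.0f} B/s')
```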

0
app/tests/__init__.py Normal file
View File

9
defrag.iml Normal file
View File

@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@@ -0,0 +1,20 @@
services:
app:
environment:
- LOG_LEVEL=DEBUG
- PYTHONPATH=/app
volumes:
- .:/app
- /var/run/docker.sock:/var/run/docker.sock
ports:
- "8000:8000"
command: uvicorn main:app --host 0.0.0.0 --port 8000 --reload
postgres:
environment:
- POSTGRES_LOG_STATEMENT=all
ports:
- "5433:5432" # Different port to avoid conflict with host PostgreSQL
redis:
command: redis-server --appendonly yes --loglevel verbose

276
docker-compose.yml Normal file
View File

@@ -0,0 +1,276 @@
services:
# PostgreSQL Database
postgres:
image: postgres:15-alpine
container_name: project_defrag_db
environment:
POSTGRES_USER: disk_reorg_user
POSTGRES_PASSWORD: heel-goed-wachtwoord
POSTGRES_DB: disk_reorganizer_db
POSTGRES_INITDB_ARGS: "--encoding=UTF8 --locale=C"
volumes:
- postgres_data:/var/lib/postgresql/data
- ./sql/init.sql:/docker-entrypoint-initdb.d/init.sql
- ./sql/migrations:/docker-entrypoint-initdb.d/migrations
ports:
- "5432:5432"
healthcheck:
test: ["CMD-SHELL", "pg_isready -U disk_reorg_user -d disk_reorganizer_db"]
interval: 10s
timeout: 5s
retries: 5
networks:
- defrag-network
# Redis for deduplication hash store (optional)
redis:
image: redis:7-alpine
container_name: project_defrag_redis
command: redis-server --appendonly yes
volumes:
- redis_data:/data
ports:
- "6379:6379"
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 10s
timeout: 5s
retries: 5
networks:
- defrag-network
# Application Service
app:
build: .
container_name: project_defrag_app
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy
environment:
# Database Configuration
DB_HOST: postgres
DB_PORT: 5432
DB_NAME: disk_reorganizer_db
DB_USER: disk_reorg_user
DB_PASSWORD: heel-goed-wachtwoord
# Redis Configuration
REDIS_HOST: redis
REDIS_PORT: 6379
# Application Configuration
LOG_LEVEL: INFO
MAX_WORKERS: 4
CHUNK_SIZE_KB: 64
# Mount points (set these when running specific commands)
SOURCE_MOUNT: /mnt/source
TARGET_MOUNT: /mnt/target
volumes:
# Mount host directories for file operations
- ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source:ro
- ${HOST_TARGET_PATH:-/mnt/target}:/mnt/target
# Mount for configuration and plans
- ./config:/app/config
- ./plans:/app/plans
- ./logs:/app/logs
# Bind mount for development (optional)
- .:/app
networks:
- defrag-network
profiles:
- full-cycle
- development
# Uncomment for development with hot reload
# command: watchmedo auto-restart --pattern="*.py" --recursive -- python app/main.py
# Single command services for specific operations
index:
build: .
container_name: defrag_index
depends_on:
postgres:
condition: service_healthy
environment:
DB_HOST: postgres
DB_PORT: 5432
DB_NAME: disk_reorganizer_db
DB_USER: disk_reorg_user
DB_PASSWORD: heel-goed-wachtwoord
volumes:
- ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source:ro
- ./config:/app/config
- ./logs:/app/logs
command: ["python", "app/main.py", "index", "/media/mike/SMT", "SMT"]
profiles:
- index-only
networks:
- defrag-network
plan:
build: .
container_name: defrag_plan
depends_on:
postgres:
condition: service_healthy
environment:
DB_HOST: postgres
DB_PORT: 5432
DB_NAME: disk_reorganizer_db
DB_USER: disk_reorg_user
DB_PASSWORD: heel-goed-wachtwoord
volumes:
- ./config:/app/config
- ./plans:/app/plans
- ./logs:/app/logs
command: ["python", "app/main.py", "plan", "/media/mike/SMT", "SMT"]
profiles:
- plan-only
networks:
- defrag-network
execute:
build: .
container_name: defrag_execute
depends_on:
postgres:
condition: service_healthy
environment:
DB_HOST: postgres
DB_PORT: 5432
DB_NAME: disk_reorganizer_db
DB_USER: disk_reorg_user
DB_PASSWORD: heel-goed-wachtwoord
volumes:
- ${HOST_SOURCE_PATH:-/mnt/source}:/mnt/source
- ${HOST_TARGET_PATH:-/mnt/target}:/mnt/target
- ./plans:/app/plans
- ./config:/app/config
- ./logs:/app/logs
command: ["python", "app/main.py", "execute", "/app/plans/plan.json"]
profiles:
- execute-only
networks:
- defrag-network
dry-run:
build: .
container_name: defrag_dry_run
depends_on:
postgres:
condition: service_healthy
environment:
DB_HOST: postgres
DB_PORT: 5432
DB_NAME: disk_reorganizer_db
DB_USER: disk_reorg_user
DB_PASSWORD: heel-goed-wachtwoord
volumes:
- ./plans:/app/plans
- ./config:/app/config
- ./logs:/app/logs
command: ["python", "app/main.py", "execute", "/app/plans/plan.json", "--dry-run"]
profiles:
- dry-run-only
networks:
- defrag-network
report:
build: .
container_name: defrag_report
depends_on:
postgres:
condition: service_healthy
environment:
DB_HOST: postgres
DB_PORT: 5432
DB_NAME: disk_reorganizer_db
DB_USER: disk_reorg_user
DB_PASSWORD: heel-goed-wachtwoord
volumes:
- ./reports:/app/reports
- ./logs:/app/logs
command: ["python", "app/main.py", "report"]
profiles:
- report-only
networks:
- defrag-network
# Monitoring and Admin Services
pgadmin:
image: dpage/pgadmin4:latest
container_name: defrag_pgadmin
environment:
PGADMIN_DEFAULT_EMAIL: admin@defrag.local
PGADMIN_DEFAULT_PASSWORD: admin123
volumes:
- pgadmin_data:/var/lib/pgadmin
ports:
- "5050:80"
depends_on:
- postgres
profiles:
- monitoring
networks:
- defrag-network
redis-commander:
image: rediscommander/redis-commander:latest
container_name: defrag_redis_commander
environment:
REDIS_HOSTS: local:redis:6379
ports:
- "8081:8081"
depends_on:
- redis
profiles:
- monitoring
networks:
- defrag-network
flyway:
image: flyway/flyway:latest
container_name: flyway
volumes:
- ./sql/migration:/flyway/sql:ro
environment:
FLYWAY_URL: jdbc:postgresql://192.168.1.159:5432/disk_reorganizer_db
FLYWAY_USER: disk_reorg_user
FLYWAY_PASSWORD: heel-goed-wachtwoord
FLYWAY_SCHEMAS: public
FLYWAY_LOCATIONS: filesystem:./sql
FLYWAY_CONNECT_RETRIES: "60"
command: migrate
restart: "no"
pg_backup:
image: postgres:16
container_name: pg_backup
environment:
PGPASSWORD: heel-goed-wachtwoord
volumes:
- ./:/backup
command:
- bash
- -lc
- >
pg_dump -h 192.168.1.159 -p 5432 -U disk_reorg_user -d disk_reorganizer_db
--format=custom --no-owner --no-privileges
-f /backup/backup_$(date +%F_%H%M)_disk_reorganizer_db.dump
restart: "no"
networks:
defrag-network:
driver: bridge
volumes:
postgres_data:
driver: local
redis_data:
driver: local
pgadmin_data:
driver: local

7
flyway.conf Normal file
View File

@@ -0,0 +1,7 @@
flyway.url=jdbc:postgresql://192.168.1.159:5432/disk_reorganizer_db
flyway.user=disk_reorg_user
flyway.password=heel-goed-wachtwoord
flyway.locations=filesystem:sql/migration
flyway.schemas=public

74
pyproject.toml Normal file
View File

@@ -0,0 +1,74 @@
[build-system]
requires = ["setuptools>=65.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "defrag"
version = "1.0.0"
description = "Intelligent disk reorganization system for 20TB+ data"
readme = "README.md"
requires-python = ">=3.9"
license = {text = "MIT"}
authors = [
{name = "Project Defrag"}
]
keywords = ["disk", "storage", "deduplication", "classification", "migration"]
classifiers = [
"Development Status :: 4 - Beta",
"Intended Audience :: System Administrators",
"Topic :: System :: Filesystems",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
]
dependencies = [
"psycopg2-binary>=2.9.0",
"psutil>=5.9.0",
"pandas>=1.5.0",
"pyarrow>=10.0.0",
"python-magic>=0.4.27",
]
[project.optional-dependencies]
redis = ["redis>=4.5.0"]
ml = ["scikit-learn>=1.2.0", "numpy>=1.24.0"]
dev = [
"pytest>=7.2.0",
"pytest-cov>=4.0.0",
"black>=23.0.0",
"mypy>=1.0.0",
"flake8>=6.0.0",
]
all = [
"redis>=4.5.0",
"scikit-learn>=1.2.0",
"numpy>=1.24.0",
]
[project.scripts]
defrag = "main:main"
[tool.black]
line-length = 100
target-version = ['py39', 'py310', 'py311', 'py312']
include = '\.pyi?$'
[tool.mypy]
python_version = "3.9"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = false
disallow_incomplete_defs = false
check_untyped_defs = true
no_implicit_optional = true
[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
addopts = "-v --cov=. --cov-report=html --cov-report=term"
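
The optional-dependencies table above maps directly to pip extras; a minimal sketch of installing them in editable mode, run from the repository root:

```bash
# Dev tooling plus the optional Redis and ML features:
pip install -e ".[dev,redis,ml]"

# Or the bundled 'all' extra:
pip install -e ".[all]"
```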

41
requirements.txt Normal file
View File

@@ -0,0 +1,41 @@
# Core dependencies
psycopg2-binary>=2.9.9    # PostgreSQL database adapter for Python
# Alternative: psycopg2>=2.9.9 (requires PostgreSQL development libraries);
# use psycopg2-binary for easier installation without compilation
psutil>=5.9.0
# Data processing
pandas>=1.5.0
pyarrow>=10.0.0
# File type detection
python-magic>=0.4.27
chardet
# Optional/feature dependencies
redis>=4.5.0              # For RedisHashStore (optional)
scikit-learn>=1.2.0       # For MLClassifier (optional)
numpy>=1.24.0             # For MLClassifier (optional)
# Development dependencies
pytest>=7.2.0
pytest-cov>=4.0.0
black>=23.0.0
mypy>=1.0.0
flake8>=6.0.0
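
Alternatively, a plain install from the pinned list, sketched here with a throwaway virtual environment:

```bash
python -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
```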

51
setup.sh Normal file
View File

@@ -0,0 +1,51 @@
#!/bin/bash
# setup.sh - Complete Docker setup for Project Defrag
set -e
echo "🚀 Setting up Project Defrag with Docker..."
# 1. Create necessary directories
echo "📁 Creating directories..."
mkdir -p {config,plans,logs,reports,sql/migration}
# 2. Copy environment file
if [ ! -f .env ]; then
echo "⚙️ Creating .env file from template..."
cp .env.example .env
echo "⚠️ Please edit .env file with your configuration!"
fi
# 3. Build the Docker image
echo "🐳 Building Docker image..."
docker compose build app
# 4. Start the database
#echo "🗄️ Starting PostgreSQL database..."
#docker-compose up -d postgres
# 5. Wait for database to be ready
#echo "⏳ Waiting for database to be ready..."
#sleep 10
# 6. Run database initialization
#echo "📊 Initializing database..."
#docker-compose exec -T postgres psql -U disk_reorg_user -d disk_reorganizer_db -f /docker-entrypoint-initdb.d/init.sql
# 7. Start optional services
echo "🔧 Starting monitoring services..."
docker compose --profile monitoring up -d
echo "✅ Setup complete!"
echo ""
echo "📋 Available commands:"
echo " docker compose up -d # Start all services"
echo " docker compose --profile index-only up index # Run index only"
echo " docker compose --profile plan-only up plan # Generate plan"
echo " docker compose --profile dry-run-only up dry-run # Dry run"
echo " docker compose --profile execute-only up execute # Execute migration"
echo " docker compose --profile report-only up report # Generate report"
echo ""
echo "🌐 Access monitoring:"
echo " - PostgreSQL Admin: http://localhost:5050"
echo " - Redis Commander: http://localhost:8081"

61
sql/legacy_setup.sql Normal file
View File

@@ -0,0 +1,61 @@
-- PostgreSQL Database Setup Script for Disk Reorganizer
-- Database: disk_reorganizer_db
-- User: disk_reorg_user
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS "pgcrypto";
-- Create the database (run as superuser: auction)
CREATE DATABASE disk_reorganizer_db
WITH
ENCODING = 'UTF8'
LC_COLLATE = 'en_US.UTF-8'
LC_CTYPE = 'en_US.UTF-8'
TEMPLATE = template0;
-- Connect to the new database
\c disk_reorganizer_db
-- Create the user
CREATE USER disk_reorg_user WITH PASSWORD 'heel-goed-wachtwoord';
-- Create files table
-- Create index on disk column for faster queries
-- Grant privileges to disk_reorg_user
GRANT CONNECT ON DATABASE disk_reorganizer_db TO disk_reorg_user;
GRANT USAGE ON SCHEMA public TO disk_reorg_user;
GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO disk_reorg_user;
GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO disk_reorg_user;
-- future tables/sequences created by your owner role (pick the role that creates them)
ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public
GRANT ALL PRIVILEGES ON TABLES TO disk_reorg_user;
ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public
GRANT ALL PRIVILEGES ON SEQUENCES TO disk_reorg_user;
-- Create function to update updated_at timestamp
CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER AS
$$
BEGIN
NEW.updated_at = CURRENT_TIMESTAMP;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Create trigger for files table
CREATE TRIGGER update_files_updated_at
BEFORE UPDATE
ON files
FOR EACH ROW
EXECUTE FUNCTION update_updated_at_column();
-- Display success message
\echo 'Database setup completed successfully!'
\echo 'Database: disk_reorganizer_db'
\echo 'User: disk_reorg_user'
\echo 'Tables created: files, operations'
\echo 'Indexes and triggers created'
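
The script's own comments say it must be run by the existing superuser role (`auction`) against the default database, since it creates `disk_reorganizer_db` and then reconnects to it; a minimal invocation sketch (host and port are assumptions):

```bash
psql -h localhost -p 5432 -U auction -d postgres -f sql/legacy_setup.sql
```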

188
sql/init.sql Normal file
View File

@@ -0,0 +1,188 @@
-- sql/init.sql
-- Initialize PostgreSQL database for Project Defrag
-- Enable useful extensions
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS "pgcrypto";
-- future tables/sequences created by your owner role (pick the role that creates them)
ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public
GRANT ALL PRIVILEGES ON TABLES TO disk_reorg_user;
ALTER DEFAULT PRIVILEGES FOR ROLE auction IN SCHEMA public
GRANT ALL PRIVILEGES ON SEQUENCES TO disk_reorg_user;
ALTER DATABASE disk_reorganizer_db OWNER TO disk_reorg_user;
-- Files table
CREATE TABLE IF NOT EXISTS files
(
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
path TEXT NOT NULL,
size BIGINT NOT NULL,
modified_time TIMESTAMP WITH TIME ZONE,
created_time TIMESTAMP WITH TIME ZONE,
file_hash VARCHAR(64), -- SHA-256 hash
checksum VARCHAR(64), -- Alias for file_hash (legacy compatibility)
category VARCHAR(50),
disk_label VARCHAR(50),
last_verified TIMESTAMP WITH TIME ZONE,
status VARCHAR(20) DEFAULT 'indexed',
duplicate_of TEXT, -- Path to canonical file if this is a duplicate
-- Metadata
metadata JSONB DEFAULT '{}',
-- Audit fields
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
-- Constraints
CONSTRAINT unique_file_path UNIQUE (path)
);
-- Operations table (audit log)
CREATE TABLE IF NOT EXISTS operations
(
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
operation_type VARCHAR(50) NOT NULL,
source_path TEXT,
target_path TEXT,
status VARCHAR(20) NOT NULL,
-- Legacy compatibility fields
executed INTEGER DEFAULT 0,
verified INTEGER DEFAULT 0,
error TEXT,
-- File reference
file_id UUID REFERENCES files (id) ON DELETE SET NULL,
-- Performance metrics
duration_ms INTEGER,
bytes_processed BIGINT,
-- Error information
error_message TEXT,
error_details JSONB,
-- Context
session_id VARCHAR(100),
user_agent TEXT,
-- Audit fields
started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
completed_at TIMESTAMP WITH TIME ZONE,
executed_at TIMESTAMP WITH TIME ZONE,
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);
-- Deduplication hash store
CREATE TABLE IF NOT EXISTS deduplication_store
(
hash VARCHAR(64) PRIMARY KEY,
canonical_path TEXT NOT NULL,
reference_count INTEGER DEFAULT 1,
first_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
last_seen TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);
-- Migration plan table
CREATE TABLE IF NOT EXISTS migration_plans
(
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
name VARCHAR(100) NOT NULL,
source_disk VARCHAR(50) NOT NULL,
target_disk VARCHAR(50) NOT NULL,
plan_json JSONB NOT NULL,
-- Statistics
total_files INTEGER DEFAULT 0,
total_size BIGINT DEFAULT 0,
estimated_duration INTEGER, -- in seconds
-- Status
status VARCHAR(20) DEFAULT 'draft',
-- Audit
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
executed_at TIMESTAMP WITH TIME ZONE,
completed_at TIMESTAMP WITH TIME ZONE
);
-- Indexes for performance
CREATE INDEX IF NOT EXISTS idx_files_path ON files (path);
CREATE INDEX IF NOT EXISTS idx_files_hash ON files (file_hash);
CREATE INDEX IF NOT EXISTS idx_files_disk ON files (disk_label);
CREATE INDEX IF NOT EXISTS idx_files_category ON files (category);
CREATE INDEX IF NOT EXISTS idx_files_status ON files (status);
CREATE INDEX IF NOT EXISTS idx_files_checksum ON files (checksum);
CREATE INDEX IF NOT EXISTS idx_files_checksum_path ON files (checksum, path);
CREATE INDEX IF NOT EXISTS idx_operations_status ON operations (status);
CREATE INDEX IF NOT EXISTS idx_operations_created ON operations (created_at);
CREATE INDEX IF NOT EXISTS idx_operations_file_id ON operations (file_id);
CREATE INDEX IF NOT EXISTS idx_dedup_canonical ON deduplication_store (canonical_path);
-- Functions for updating timestamps
CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER AS
$$
BEGIN
NEW.updated_at = CURRENT_TIMESTAMP;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Triggers for automatic updated_at
CREATE TRIGGER update_files_updated_at
BEFORE UPDATE
ON files
FOR EACH ROW
EXECUTE FUNCTION update_updated_at_column();
-- View for operational dashboard
CREATE OR REPLACE VIEW operational_dashboard AS
SELECT o.status,
COUNT(*) as operation_count,
SUM(o.bytes_processed) as total_bytes,
AVG(o.duration_ms) as avg_duration_ms,
MIN(o.started_at) as earliest_operation,
MAX(o.completed_at) as latest_operation
FROM operations o
WHERE o.started_at > CURRENT_TIMESTAMP - INTERVAL '24 hours'
GROUP BY o.status;
-- View for disk usage statistics
CREATE OR REPLACE VIEW disk_usage_stats AS
SELECT disk_label,
COUNT(*) as file_count,
SUM(size) as total_size,
AVG(size) as avg_file_size,
MIN(created_time) as oldest_file,
MAX(modified_time) as newest_file
FROM files
GROUP BY disk_label;
-- Insert default configuration
INSERT INTO migration_plans (name, source_disk, target_disk, plan_json, status)
VALUES ('Default Migration Plan',
'disk_d',
'disk_e',
'{"strategy": "hardlink", "verify_copies": true, "preserve_timestamps": true}'::jsonb,
'draft')
ON CONFLICT DO NOTHING;
-- Create read-only user for monitoring
DO
$$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'monitor_user') THEN
CREATE USER monitor_user WITH PASSWORD 'monitor_password';
END IF;
END
$$;
GRANT CONNECT ON DATABASE disk_reorganizer_db TO monitor_user;
GRANT USAGE ON SCHEMA public TO monitor_user;
GRANT SELECT ON ALL TABLES IN SCHEMA public TO monitor_user;
GRANT SELECT ON operational_dashboard TO monitor_user;
GRANT SELECT ON disk_usage_stats TO monitor_user;
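
The read-only `monitor_user` created above is enough to exercise the two dashboard views; a minimal sketch, assuming the compose `postgres` service is published on `localhost:5432`:

```bash
psql "host=localhost port=5432 dbname=disk_reorganizer_db user=monitor_user password=monitor_password" \
  -c "SELECT * FROM disk_usage_stats ORDER BY total_size DESC;" \
  -c "SELECT * FROM operational_dashboard;"
```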

View File

@@ -0,0 +1,11 @@
-- Add extracted text and enrichment columns
ALTER TABLE files ADD COLUMN IF NOT EXISTS extracted_text TEXT;
ALTER TABLE files ADD COLUMN IF NOT EXISTS text_quality VARCHAR(20);
ALTER TABLE files ADD COLUMN IF NOT EXISTS enrichment JSONB;
-- Add indexes for text search
CREATE INDEX IF NOT EXISTS idx_files_extracted_text ON files USING gin(to_tsvector('english', extracted_text));
CREATE INDEX IF NOT EXISTS idx_files_enrichment ON files USING gin(enrichment);
-- Add full text search capability
CREATE INDEX IF NOT EXISTS idx_files_fts ON files USING gin(to_tsvector('english', COALESCE(extracted_text, '')));
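
Queries only hit the `idx_files_fts` GIN index when they repeat the indexed expression verbatim; a sketch of such a search (the term `invoice` and the connection details are illustrative):

```bash
psql -h localhost -U disk_reorg_user -d disk_reorganizer_db -c "
  SELECT path, text_quality
  FROM files
  WHERE to_tsvector('english', COALESCE(extracted_text, '')) @@ plainto_tsquery('english', 'invoice')
  LIMIT 20;"
```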

View File

@@ -0,0 +1,41 @@
CREATE TABLE IF NOT EXISTS folders
(
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
path TEXT NOT NULL UNIQUE,
parent_path TEXT,
disk_label VARCHAR(50),
file_count INT DEFAULT 0,
total_size BIGINT DEFAULT 0,
project_type VARCHAR(50),
intent TEXT,
summary TEXT,
has_readme BOOLEAN DEFAULT FALSE,
has_git BOOLEAN DEFAULT FALSE,
has_manifest BOOLEAN DEFAULT FALSE,
manifest_types TEXT[],
dominant_file_types JSONB,
structure JSONB,
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX IF NOT EXISTS idx_folders_path ON folders (path);
CREATE INDEX IF NOT EXISTS idx_folders_parent ON folders (parent_path);
CREATE INDEX IF NOT EXISTS idx_folders_disk ON folders (disk_label);
CREATE INDEX IF NOT EXISTS idx_folders_project_type ON folders (project_type);
CREATE TABLE IF NOT EXISTS processing_checkpoints
(
task_name VARCHAR(100) PRIMARY KEY,
last_processed_id TEXT,
last_processed_path TEXT,
processed_count INT DEFAULT 0,
total_count INT,
started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);
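
`processing_checkpoints` keys on `task_name`, so long-running tasks can record progress with an upsert and resume from `last_processed_path` after a restart; a minimal sketch (the task name and path are illustrative, not defined by the project):

```bash
psql -h localhost -U disk_reorg_user -d disk_reorganizer_db -c "
  INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, total_count)
  VALUES ('enrich_text', '/mnt/source/docs/example.pdf', 1500, 420000)
  ON CONFLICT (task_name) DO UPDATE
    SET last_processed_path = EXCLUDED.last_processed_path,
        processed_count     = EXCLUDED.processed_count,
        updated_at          = CURRENT_TIMESTAMP;"
```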