commit 2b2c575385
Author: mike
Date:   2025-12-13 11:56:06 +01:00

57 changed files with 6505 additions and 0 deletions

@@ -0,0 +1,2 @@
from .classifier import FileClassifier
__all__ = ['FileClassifier']

@@ -0,0 +1,30 @@
from typing import Protocol, Optional
from pathlib import Path
from dataclasses import dataclass


@dataclass
class ClassificationRule:
    name: str
    category: str
    patterns: list[str]
    priority: int = 0
    description: str = ''


class IClassifier(Protocol):
    def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]:
        ...

    def get_category_rules(self, category: str) -> list[ClassificationRule]:
        ...


class IRuleEngine(Protocol):
    def add_rule(self, rule: ClassificationRule) -> None:
        ...

    def remove_rule(self, rule_name: str) -> None:
        ...

    def match_path(self, path: Path) -> Optional[str]:
        ...

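Both interfaces above are typing.Protocol classes, so conformance is structural: any object with matching method signatures satisfies them, no inheritance needed. A minimal sketch (ConstantClassifier is hypothetical, and the import path assumes this file is app/classification/_protocols.py, which the `from ._protocols import ClassificationRule` import in rules.py below suggests):

from pathlib import Path
from typing import Optional

from app.classification._protocols import ClassificationRule, IClassifier

class ConstantClassifier:
    # No IClassifier base class: matching signatures are enough for a type checker.
    def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]:
        return 'misc'

    def get_category_rules(self, category: str) -> list[ClassificationRule]:
        return []

clf: IClassifier = ConstantClassifier()  # accepted via structural typing
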
@@ -0,0 +1,74 @@
from pathlib import Path
from typing import Set, Tuple


class FileClassifier:
    def __init__(self):
        # Directory names that indicate generated build output.
        self.build_patterns = {
            'node_modules', '__pycache__', '.pytest_cache', 'target', 'build',
            'dist', '.gradle', 'bin', 'obj', '.next', '.nuxt', 'vendor',
            '.venv', 'venv', 'site-packages', 'bower_components', 'jspm_packages',
        }
        # Per-ecosystem artifact markers: directory names or file extensions.
        self.artifact_patterns = {
            'java': {'.jar', '.war', '.ear', '.class'},
            'python': {'.pyc', '.pyo', '.whl', '.egg'},
            'node': {'node_modules'},
            'go': {'vendor', 'pkg'},
            'rust': {'target'},
            'docker': {'.dockerignore', 'Dockerfile'},
        }
        # Keywords that map a path component to a top-level category.
        self.category_keywords = {
            'apps': {'app', 'application', 'service', 'api', 'server', 'client'},
            'infra': {'infrastructure', 'devops', 'docker', 'kubernetes', 'terraform', 'ansible', 'gitea', 'jenkins'},
            'dev': {'project', 'workspace', 'repo', 'src', 'code', 'dev'},
            'cache': {'cache', 'temp', 'tmp', '.cache'},
            'databases': {'postgres', 'mysql', 'redis', 'mongo', 'db', 'database'},
            'backups': {'backup', 'bak', 'snapshot', 'archive'},
            'user': {'documents', 'pictures', 'videos', 'downloads', 'desktop', 'music'},
            'artifacts': {'build', 'dist', 'release', 'output'},
            'temp': {'tmp', 'temp', 'staging', 'processing'},
        }
        self.media_extensions = {
            'video': {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv', '.webm'},
            'audio': {'.mp3', '.flac', '.wav', '.ogg', '.m4a', '.aac'},
            'image': {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'},
            'document': {'.pdf', '.doc', '.docx', '.txt', '.md', '.odt'},
            'spreadsheet': {'.xls', '.xlsx', '.csv', '.ods'},
            'presentation': {'.ppt', '.pptx', '.odp'},
        }
        self.code_extensions = {
            '.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h',
            '.cs', '.rb', '.php', '.swift', '.kt', '.scala', '.clj', '.r',
        }

    def classify_path(self, path: str, size: int = 0) -> Tuple[Set[str], str, bool]:
        p = Path(path)
        labels = set()
        primary_category = 'misc'
        is_build_artifact = False
        parts = p.parts
        name_lower = p.name.lower()

        # Any build directory anywhere in the path marks the file as an artifact.
        for part in parts:
            if part.lower() in self.build_patterns:
                is_build_artifact = True
                labels.add('build-artifact')
                break

        if is_build_artifact:
            for artifact_type, patterns in self.artifact_patterns.items():
                if any(part.lower() in patterns for part in parts) or p.suffix.lower() in patterns:
                    primary_category = f'artifacts/{artifact_type}'
                    labels.add('artifact')
                    return (labels, primary_category, is_build_artifact)

        if '.git' in parts:
            labels.add('vcs')
            primary_category = 'infra/git-infrastructure'
            return (labels, primary_category, False)

        # First keyword hit on any path component picks the primary category.
        for category, keywords in self.category_keywords.items():
            if any(kw in name_lower or any(kw in part.lower() for part in parts) for kw in keywords):
                labels.add(category)
                primary_category = category
                break

        # Media extensions override the keyword category with user/<media_type>.
        for media_type, extensions in self.media_extensions.items():
            if p.suffix.lower() in extensions:
                labels.add(media_type)
                labels.add('media')
                primary_category = f'user/{media_type}'
                break

        if p.suffix.lower() in self.code_extensions:
            labels.add('code')
            if primary_category == 'misc':
                primary_category = 'dev'

        if size > 100 * 1024 * 1024:  # > 100 MiB
            labels.add('large-file')
        if any(kw in name_lower for kw in ['test', 'spec', 'mock']):
            labels.add('test')
        if any(kw in name_lower for kw in ['config', 'settings', 'env']):
            labels.add('config')

        return (labels, primary_category, is_build_artifact)

    def suggest_target_path(self, source_path: str, category: str, labels: Set[str]) -> str:
        p = Path(source_path)
        if 'build-artifact' in labels:
            return f'trash/build-artifacts/{source_path}'
        if category.startswith('artifacts/'):
            artifact_type = category.split('/')[-1]
            return f'artifacts/{artifact_type}/{p.name}'
        if category.startswith('user/'):
            media_type = category.split('/')[-1]
            return f'user/{media_type}/{p.name}'
        # Drop build directories, then keep the project-relative layout.
        parts = [part for part in p.parts if part not in self.build_patterns]
        if len(parts) > 3:
            project_name = parts[0]
            return f"{category}/{project_name}/{'/'.join(parts[1:])}"
        return f'{category}/{source_path}'

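A quick smoke test of the heuristics above, with invented paths; useful when reviewing the precedence (build dirs, then .git, then keywords, then media extensions):

clf = FileClassifier()

labels, category, is_artifact = clf.classify_path('projects/myapp/node_modules/left-pad/index.js')
# node_modules marks it as a build artifact and matches the 'node' patterns:
# ({'build-artifact', 'artifact'}, 'artifacts/node', True)

labels, category, is_artifact = clf.classify_path('photos/2024/beach.jpg')
# no keyword hit, but '.jpg' is an image extension:
# ({'image', 'media'}, 'user/image', False)

clf.suggest_target_path('photos/2024/beach.jpg', category, labels)
# -> 'user/image/beach.jpg'
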
@@ -0,0 +1,148 @@
from pathlib import Path
from typing import Optional, Callable

import psycopg2
from psycopg2.extras import execute_batch

from .rules import RuleBasedClassifier
from .ml import create_ml_classifier, DummyMLClassifier
from ..shared.models import ProcessingStats
from ..shared.config import DatabaseConfig
from ..shared.logger import ProgressLogger


class ClassificationEngine:
    def __init__(self, db_config: DatabaseConfig, logger: ProgressLogger, use_ml: bool = False):
        self.db_config = db_config
        self.logger = logger
        self.rule_classifier = RuleBasedClassifier()
        self.ml_classifier = create_ml_classifier() if use_ml else None
        # ML is only usable if scikit-learn imported; otherwise we got the dummy.
        self.use_ml = use_ml and not isinstance(self.ml_classifier, DummyMLClassifier)
        self._connection = None

    def _get_connection(self):
        # Lazily open a single shared connection, reopening if it was closed.
        if self._connection is None or self._connection.closed:
            self._connection = psycopg2.connect(
                host=self.db_config.host, port=self.db_config.port,
                database=self.db_config.database, user=self.db_config.user,
                password=self.db_config.password)
        return self._connection

    def classify_all(self, disk: Optional[str] = None, batch_size: int = 1000,
                     progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None) -> ProcessingStats:
        self.logger.section('Starting Classification')
        conn = self._get_connection()
        cursor = conn.cursor()
        if disk:
            cursor.execute(
                'SELECT path, checksum FROM files'
                ' WHERE disk_label = %s AND category IS NULL', (disk,))
        else:
            cursor.execute('SELECT path, checksum FROM files WHERE category IS NULL')
        files_to_classify = cursor.fetchall()
        total_files = len(files_to_classify)
        self.logger.info(f'Found {total_files} files to classify')
        stats = ProcessingStats()
        batch = []
        for path_str, _checksum in files_to_classify:
            path = Path(path_str)
            # Rules first, then the ML fallback, then a holding category.
            category = self.rule_classifier.classify(path)
            if category is None and self.use_ml and self.ml_classifier:
                category = self.ml_classifier.classify(path)
            if category is None:
                category = 'temp/processing'
            batch.append((category, str(path)))
            stats.files_processed += 1
            if len(batch) >= batch_size:
                self._update_categories(cursor, batch)
                conn.commit()
                batch.clear()
                if progress_callback:
                    progress_callback(stats.files_processed, total_files, stats)
                if stats.files_processed % (batch_size * 10) == 0:
                    self.logger.progress(stats.files_processed, total_files,
                                         prefix='Files classified',
                                         elapsed_seconds=stats.elapsed_seconds)
        if batch:
            self._update_categories(cursor, batch)
            conn.commit()
        stats.files_succeeded = stats.files_processed
        cursor.close()
        self.logger.info(f'Classification complete: {stats.files_processed} files in {stats.elapsed_seconds:.1f}s')
        return stats

    def _update_categories(self, cursor, batch: list[tuple[str, str]]):
        query = 'UPDATE files SET category = %s WHERE path = %s'
        execute_batch(cursor, query, batch)

    def classify_path(self, path: Path) -> Optional[str]:
        category = self.rule_classifier.classify(path)
        if category is None and self.use_ml and self.ml_classifier:
            category = self.ml_classifier.classify(path)
        return category

    def get_category_stats(self) -> dict[str, dict]:
        conn = self._get_connection()
        cursor = conn.cursor()
        cursor.execute(
            'SELECT category, COUNT(*) AS file_count, SUM(size) AS total_size'
            ' FROM files WHERE category IS NOT NULL'
            ' GROUP BY category ORDER BY total_size DESC')
        stats = {}
        for category, file_count, total_size in cursor.fetchall():
            stats[category] = {'file_count': file_count, 'total_size': total_size}
        cursor.close()
        return stats

    def get_uncategorized_count(self) -> int:
        conn = self._get_connection()
        cursor = conn.cursor()
        cursor.execute('SELECT COUNT(*) FROM files WHERE category IS NULL')
        count = cursor.fetchone()[0]
        cursor.close()
        return count

    def reclassify_category(self, old_category: str, new_category: str) -> int:
        self.logger.info(f'Reclassifying {old_category} -> {new_category}')
        conn = self._get_connection()
        cursor = conn.cursor()
        cursor.execute('UPDATE files SET category = %s WHERE category = %s',
                       (new_category, old_category))
        count = cursor.rowcount
        conn.commit()
        cursor.close()
        self.logger.info(f'Reclassified {count} files')
        return count

    def train_ml_classifier(self, min_samples: int = 10) -> bool:
        if not self.use_ml or self.ml_classifier is None:
            self.logger.warning('ML classifier not available')
            return False
        self.logger.subsection('Training ML Classifier')
        conn = self._get_connection()
        cursor = conn.cursor()
        cursor.execute('SELECT path, category FROM files WHERE category IS NOT NULL')
        training_data = [(Path(path), category) for path, category in cursor.fetchall()]
        cursor.close()
        if not training_data:
            self.logger.warning('No training data available')
            return False
        # Skip categories with too few labelled examples to learn from.
        category_counts = {}
        for _, category in training_data:
            category_counts[category] = category_counts.get(category, 0) + 1
        filtered_data = [(path, category) for path, category in training_data
                         if category_counts[category] >= min_samples]
        if not filtered_data:
            self.logger.warning(f'No categories with >= {min_samples} samples')
            return False
        self.logger.info(f'Training with {len(filtered_data)} samples')
        try:
            self.ml_classifier.train(filtered_data)
            self.logger.info('ML classifier trained successfully')
            return True
        except Exception as e:
            self.logger.error(f'Failed to train ML classifier: {e}')
            return False

    def get_all_categories(self) -> list[str]:
        conn = self._get_connection()
        cursor = conn.cursor()
        cursor.execute('SELECT DISTINCT category FROM files'
                       ' WHERE category IS NOT NULL ORDER BY category')
        categories = [row[0] for row in cursor.fetchall()]
        cursor.close()
        return categories

    def close(self):
        if self._connection and not self._connection.closed:
            self._connection.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

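For reviewers, a sketch of driving the engine end to end. DatabaseConfig and ProgressLogger live in the shared package that is not part of this diff, so the constructor arguments below are assumptions (the field names mirror the psycopg2.connect call in _get_connection), the engine's module path is assumed to be app/classification/engine.py, and the disk label is invented:

from app.classification.engine import ClassificationEngine
from app.shared.config import DatabaseConfig
from app.shared.logger import ProgressLogger

# Hypothetical constructor arguments -- the shared modules are not in this commit.
config = DatabaseConfig(host='localhost', port=5432, database='files',
                        user='mike', password='...')

with ClassificationEngine(config, ProgressLogger(), use_ml=True) as engine:
    stats = engine.classify_all(disk='disk-01', batch_size=500)
    print(engine.get_category_stats())
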
app/classification/ml.py (new file, 127 lines)

@@ -0,0 +1,127 @@
from pathlib import Path
from typing import Optional, List, Tuple
import pickle

try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import Pipeline
    SKLEARN_AVAILABLE = True
except ImportError:
    SKLEARN_AVAILABLE = False


class MLClassifier:
    def __init__(self):
        if not SKLEARN_AVAILABLE:
            raise ImportError('scikit-learn is required for ML classification. '
                              'Install with: pip install scikit-learn')
        self.model: Optional[Pipeline] = None
        self.categories: List[str] = []
        self._is_trained = False

    def _extract_features(self, path: Path) -> str:
        # Turn a path into a bag-of-words document: path components,
        # the extension, and the tokenized filename.
        parts = path.parts
        extension = path.suffix
        filename = path.name
        features = []
        features.extend(parts)
        if extension:
            features.append(f'ext:{extension}')
        name_parts = filename.replace('-', ' ').replace('_', ' ').replace('.', ' ').split()
        features.extend([f'name:{part}' for part in name_parts])
        return ' '.join(features)

    def train(self, training_data: List[Tuple[Path, str]]) -> None:
        if not training_data:
            raise ValueError('Training data cannot be empty')
        X = [self._extract_features(path) for path, _ in training_data]
        y = [category for _, category in training_data]
        self.categories = sorted(set(y))
        self.model = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=1000, ngram_range=(1, 2), min_df=1)),
            ('classifier', MultinomialNB()),
        ])
        self.model.fit(X, y)
        self._is_trained = True

    def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]:
        if not self._is_trained or self.model is None:
            return None
        features = self._extract_features(path)
        try:
            return self.model.predict([features])[0]
        except Exception:
            return None

    def predict_proba(self, path: Path) -> dict[str, float]:
        if not self._is_trained or self.model is None:
            return {}
        features = self._extract_features(path)
        try:
            probabilities = self.model.predict_proba([features])[0]
            return {category: float(prob) for category, prob in zip(self.categories, probabilities)}
        except Exception:
            return {}

    def save_model(self, model_path: Path) -> None:
        if not self._is_trained:
            raise ValueError('Cannot save untrained model')
        model_data = {'model': self.model, 'categories': self.categories, 'is_trained': self._is_trained}
        with open(model_path, 'wb') as f:
            pickle.dump(model_data, f)

    def load_model(self, model_path: Path) -> None:
        # Note: pickle.load executes arbitrary code; only load trusted model files.
        with open(model_path, 'rb') as f:
            model_data = pickle.load(f)
        self.model = model_data['model']
        self.categories = model_data['categories']
        self._is_trained = model_data['is_trained']

    @property
    def is_trained(self) -> bool:
        return self._is_trained


class DummyMLClassifier:
    # Drop-in stand-in used when scikit-learn is not installed.
    def train(self, training_data: List[Tuple[Path, str]]) -> None:
        raise NotImplementedError('ML classification requires scikit-learn. '
                                  'Install with: pip install scikit-learn')

    def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]:
        return None

    def predict_proba(self, path: Path) -> dict[str, float]:
        return {}

    def save_model(self, model_path: Path) -> None:
        raise NotImplementedError('ML classification not available')

    def load_model(self, model_path: Path) -> None:
        raise NotImplementedError('ML classification not available')

    @property
    def is_trained(self) -> bool:
        return False


def create_ml_classifier() -> MLClassifier | DummyMLClassifier:
    return MLClassifier() if SKLEARN_AVAILABLE else DummyMLClassifier()


def train_from_database(db_connection, min_samples_per_category: int = 10) -> MLClassifier | DummyMLClassifier:
    classifier = create_ml_classifier()
    if isinstance(classifier, DummyMLClassifier):
        return classifier
    cursor = db_connection.cursor()
    cursor.execute('SELECT path, category FROM files WHERE category IS NOT NULL')
    training_data = [(Path(path), category) for path, category in cursor.fetchall()]
    cursor.close()
    if not training_data:
        return classifier
    # Keep only categories with enough labelled examples.
    category_counts = {}
    for _, category in training_data:
        category_counts[category] = category_counts.get(category, 0) + 1
    filtered_data = [(path, category) for path, category in training_data
                     if category_counts[category] >= min_samples_per_category]
    if filtered_data:
        classifier.train(filtered_data)
    return classifier

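A minimal sketch of the ML path in isolation, assuming scikit-learn is installed; the training set is invented and far too small to be useful, it just shows the call sequence:

from pathlib import Path

clf = create_ml_classifier()  # MLClassifier here, DummyMLClassifier without sklearn
if isinstance(clf, MLClassifier):
    clf.train([
        (Path('projects/api/server.py'), 'dev'),
        (Path('projects/web/client.ts'), 'dev'),
        (Path('media/movies/film.mkv'), 'user/video'),
        (Path('backups/db-2024.sql.gz'), 'backups/database'),
    ])
    print(clf.classify(Path('projects/cli/main.py')))        # likely 'dev' on this toy set
    print(clf.predict_proba(Path('media/shows/pilot.mkv')))  # {category: probability}

One caveat worth flagging in review: load_model unpickles whatever it is handed, so saved model files must be treated as trusted input.
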
@@ -0,0 +1,60 @@
from pathlib import Path
from typing import Optional
import fnmatch

from ._protocols import ClassificationRule


class RuleBasedClassifier:
    def __init__(self):
        self.rules: list[ClassificationRule] = []
        self._load_default_rules()

    def _load_default_rules(self):
        # Package-manager and build-tool caches
        self.add_rule(ClassificationRule(
            name='maven_cache', category='artifacts/java/maven', priority=10,
            patterns=['**/.m2/**', '**/.maven/**', '**/maven-central-cache/**'],
            description='Maven repository and cache'))
        self.add_rule(ClassificationRule(
            name='gradle_cache', category='artifacts/java/gradle', priority=10,
            patterns=['**/.gradle/**', '**/gradle-cache/**', '**/gradle-build-cache/**'],
            description='Gradle cache and artifacts'))
        self.add_rule(ClassificationRule(
            name='python_cache', category='cache/pycache', priority=10,
            patterns=['**/__pycache__/**', '**/*.pyc', '**/*.pyo'],
            description='Python cache files'))
        self.add_rule(ClassificationRule(
            name='python_artifacts', category='artifacts/python', priority=10,
            patterns=['**/pip-cache/**', '**/pypi-cache/**', '**/wheelhouse/**'],
            description='Python package artifacts'))
        self.add_rule(ClassificationRule(
            name='node_modules', category='cache/node_modules-archive', priority=10,
            patterns=['**/node_modules/**'],
            description='Node.js modules'))
        self.add_rule(ClassificationRule(
            name='node_cache', category='artifacts/node', priority=10,
            patterns=['**/.npm/**', '**/npm-registry/**', '**/yarn-cache/**', '**/pnpm-store/**'],
            description='Node.js package manager caches'))
        self.add_rule(ClassificationRule(
            name='go_cache', category='artifacts/go', priority=10,
            patterns=['**/goproxy-cache/**', '**/go/pkg/mod/**', '**/go-module-cache/**'],
            description='Go module cache'))
        # Version control and forge data (highest priority)
        self.add_rule(ClassificationRule(
            name='git_repos', category='development/git-infrastructure', priority=15,
            patterns=['**/.git/**', '**/gitea/repositories/**'],
            description='Git repositories and infrastructure'))
        self.add_rule(ClassificationRule(
            name='gitea', category='development/gitea', priority=12,
            patterns=['**/gitea/**'],
            description='Gitea server data'))
        # Databases
        self.add_rule(ClassificationRule(
            name='postgresql', category='databases/postgresql', priority=10,
            patterns=['**/postgresql/**', '**/postgres/**', '**/*.sql'],
            description='PostgreSQL databases'))
        self.add_rule(ClassificationRule(
            name='mysql', category='databases/mysql', priority=10,
            patterns=['**/mysql/**', '**/mariadb/**'],
            description='MySQL/MariaDB databases'))
        self.add_rule(ClassificationRule(
            name='mongodb', category='databases/mongodb', priority=10,
            patterns=['**/mongodb/**', '**/mongo/**'],
            description='MongoDB databases'))
        self.add_rule(ClassificationRule(
            name='redis', category='databases/redis', priority=10,
            patterns=['**/redis/**', '**/*.rdb'],
            description='Redis databases'))
        self.add_rule(ClassificationRule(
            name='sqlite', category='databases/sqlite', priority=8,
            patterns=['**/*.db', '**/*.sqlite', '**/*.sqlite3'],
            description='SQLite databases'))
        # AI/LLM model files
        self.add_rule(ClassificationRule(
            name='llm_models', category='cache/llm-models', priority=12,
            patterns=['**/hugging-face/**', '**/huggingface/**', '**/.cache/huggingface/**',
                      '**/models/**/*.bin', '**/models/**/*.onnx', '**/models/**/*.safetensors',
                      '**/llm*/**', '**/openai-cache/**'],
            description='LLM and AI model files'))
        # Application data and volumes
        self.add_rule(ClassificationRule(
            name='docker_volumes', category='apps/volumes/docker-volumes', priority=10,
            patterns=['**/docker/volumes/**', '**/var/lib/docker/volumes/**'],
            description='Docker volumes'))
        self.add_rule(ClassificationRule(
            name='app_data', category='apps/volumes/app-data', priority=8,
            patterns=['**/app-data/**', '**/application-data/**'],
            description='Application data'))
        # Build output (low priority: only if nothing more specific matched)
        self.add_rule(ClassificationRule(
            name='build_output', category='development/build-tools', priority=5,
            patterns=['**/target/**', '**/build/**', '**/dist/**', '**/out/**'],
            description='Build output directories'))
        # Backups and archives
        self.add_rule(ClassificationRule(
            name='system_backups', category='backups/system', priority=10,
            patterns=['**/backup/**', '**/backups/**', '**/*.bak', '**/*.backup'],
            description='System backups'))
        self.add_rule(ClassificationRule(
            name='database_backups', category='backups/database', priority=11,
            patterns=['**/*.sql.gz', '**/*.dump', '**/db-backup/**'],
            description='Database backups'))
        self.add_rule(ClassificationRule(
            name='archives', category='backups/archive', priority=5,
            patterns=['**/*.tar', '**/*.tar.gz', '**/*.tgz', '**/*.zip', '**/*.7z'],
            description='Archive files'))

    def add_rule(self, rule: ClassificationRule) -> None:
        # Keep the list sorted so match_path can return on the first hit.
        self.rules.append(rule)
        self.rules.sort(key=lambda r: r.priority, reverse=True)

    def remove_rule(self, rule_name: str) -> None:
        self.rules = [r for r in self.rules if r.name != rule_name]

    def match_path(self, path: Path) -> Optional[str]:
        # Highest-priority rule wins; ties keep insertion order (list.sort is
        # stable). Note fnmatch gives '**' no special meaning -- a plain '*'
        # already matches across '/' -- so these patterns work, if loosely.
        path_str = str(path)
        for rule in self.rules:
            for pattern in rule.patterns:
                if fnmatch.fnmatch(path_str, pattern):
                    return rule.category
        return None

    def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]:
        return self.match_path(path)

    def get_category_rules(self, category: str) -> list[ClassificationRule]:
        return [r for r in self.rules if r.category == category]

    def get_all_categories(self) -> set[str]:
        return {r.category for r in self.rules}

    def get_rules_by_priority(self, min_priority: int = 0) -> list[ClassificationRule]:
        return [r for r in self.rules if r.priority >= min_priority]
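
Finally, a sketch of exercising the rule engine directly (run in this module's context, or import RuleBasedClassifier and ClassificationRule from this package); the minio_data rule and both paths are invented for illustration:

from pathlib import Path

engine = RuleBasedClassifier()
print(engine.match_path(Path('/srv/ci/.m2/repository/junit/junit.jar')))
# -> 'artifacts/java/maven' (maven_cache, priority 10)

engine.add_rule(ClassificationRule(
    name='minio_data', category='apps/volumes/minio', priority=9,
    patterns=['**/minio/**'], description='MinIO object storage volumes'))
print(engine.match_path(Path('/data/minio/bucket1/object.bin')))
# -> 'apps/volumes/minio': no rule of priority >= 10 matches, so the new rule wins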