base

2025-12-12 19:25:16 +01:00
parent 5e0db89d45
commit 56b2db82fc
34 changed files with 117 additions and 6556 deletions
--- a/app/classification/__init__.py
+++ b/app/classification/__init__.py
@@ -0,0 +1,17 @@
+"""Classification package exports"""
+from .rules import RuleBasedClassifier
+from .ml import create_ml_classifier, train_from_database, MLClassifier, DummyMLClassifier
+from .engine import ClassificationEngine
+from ._protocols import ClassificationRule, IClassifier, IRuleEngine
+
+# Names re-exported by the package, grouped by the module they come from.
+__all__ = [
+    # .rules
+    "RuleBasedClassifier",
+    # .ml
+    "MLClassifier",
+    "DummyMLClassifier",
+    "create_ml_classifier",
+    "train_from_database",
+    # .engine
+    "ClassificationEngine",
+    # ._protocols
+    "ClassificationRule",
+    "IClassifier",
+    "IRuleEngine",
+]
--- a/app/classification/_protocols.py
+++ b/app/classification/_protocols.py
@@ -0,0 +1,72 @@
+"""Protocol definitions for the classification package"""
+from typing import Protocol, Optional
+from pathlib import Path
+from dataclasses import dataclass
+
+
+@dataclass
+class ClassificationRule:
+    """Rule for classifying files
+
+    Maps a set of path patterns to a category name.
+
+    Attributes:
+        name: Identifier of the rule; RuleBasedClassifier.remove_rule()
+            removes rules by this name.
+        category: Category name returned when one of the patterns matches.
+        patterns: Glob-style patterns; RuleBasedClassifier tests them with
+            fnmatch.fnmatch() against the string form of the path.
+        priority: Ordering weight; RuleBasedClassifier keeps its rules
+            sorted so that higher values are tried first. Defaults to 0.
+        description: Free-text description of the rule. Defaults to "".
+    """
+    name: str
+    category: str
+    patterns: list[str]
+    priority: int = 0
+    description: str = ""
+
+
+class IClassifier(Protocol):
+    """Protocol for classification operations
+
+    Structural interface for objects that map a file path to a category
+    name and can list the rules behind a category.
+
+    NOTE(review): in this package RuleBasedClassifier defines both methods;
+    MLClassifier and DummyMLClassifier define classify() but not
+    get_category_rules() -- confirm whether they are meant to satisfy
+    this protocol.
+    """
+
+    def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]:
+        """Classify a file path
+
+        Args:
+            path: Path to classify
+            file_type: Optional file type hint; implementations may ignore it
+
+        Returns:
+            Category name or None if no match
+        """
+        ...
+
+    def get_category_rules(self, category: str) -> list[ClassificationRule]:
+        """Get all rules for a category
+
+        Args:
+            category: Category name
+
+        Returns:
+            List of rules for the category
+        """
+        ...
+
+
+class IRuleEngine(Protocol):
+    """Protocol for rule-based classification
+
+    Structural interface for objects that hold a mutable collection of
+    ClassificationRule entries and match paths against them. In this
+    package RuleBasedClassifier provides all three methods.
+    """
+
+    def add_rule(self, rule: ClassificationRule) -> None:
+        """Add a classification rule
+
+        Args:
+            rule: Rule to add
+        """
+        ...
+
+    def remove_rule(self, rule_name: str) -> None:
+        """Remove a rule by name
+
+        Args:
+            rule_name: Name of rule to remove (compared with the rule's
+                ``name`` attribute)
+        """
+        ...
+
+    def match_path(self, path: Path) -> Optional[str]:
+        """Match path against rules
+
+        Args:
+            path: Path to match
+
+        Returns:
+            Category name or None if no match
+        """
+        ...
--- a/app/classification/engine.py
+++ b/app/classification/engine.py
@@ -0,0 +1,350 @@
+"""Main classification engine"""
+from pathlib import Path
+from typing import Optional, Callable
+import psycopg2
+
+from .rules import RuleBasedClassifier
+from .ml import create_ml_classifier, DummyMLClassifier
+from ..shared.models import ProcessingStats
+from ..shared.config import DatabaseConfig
+from ..shared.logger import ProgressLogger
+
+
+class ClassificationEngine:
+    """Engine for classifying files stored in the ``files_bak`` table
+
+    Rules are consulted first; the optional ML classifier is only asked when
+    no rule matches. A single psycopg2 connection is opened lazily and reused
+    until close() is called. The engine can be used as a context manager,
+    which closes the connection on exit.
+    """
+
+    def __init__(
+        self,
+        db_config: DatabaseConfig,
+        logger: ProgressLogger,
+        use_ml: bool = False
+    ):
+        """Initialize classification engine
+
+        Args:
+            db_config: Database configuration
+            logger: Progress logger
+            use_ml: Whether to use ML classification in addition to rules
+        """
+        self.db_config = db_config
+        self.logger = logger
+        self.rule_classifier = RuleBasedClassifier()
+        self.ml_classifier = create_ml_classifier() if use_ml else None
+        # create_ml_classifier() hands back a DummyMLClassifier when
+        # scikit-learn is missing; in that case ML counts as switched off.
+        self.use_ml = use_ml and not isinstance(self.ml_classifier, DummyMLClassifier)
+        self._connection = None
+
+    def _get_connection(self):
+        """Get or create database connection"""
+        if self._connection is None or self._connection.closed:
+            self._connection = psycopg2.connect(
+                host=self.db_config.host,
+                port=self.db_config.port,
+                database=self.db_config.database,
+                user=self.db_config.user,
+                password=self.db_config.password
+            )
+        return self._connection
+
+    @staticmethod
+    def _rollback(conn) -> None:
+        """Roll back the open transaction unless the connection is gone
+
+        Args:
+            conn: Database connection
+        """
+        # rollback() on a closed connection raises and would hide the
+        # error that triggered the rollback in the first place.
+        if not conn.closed:
+            conn.rollback()
+
+    def _fetch_all(self, query: str, params: Optional[tuple] = None) -> list:
+        """Run a query and return all of its rows
+
+        The cursor is always closed. If the query fails, the transaction is
+        rolled back so the shared connection stays usable, and the error is
+        re-raised.
+
+        Args:
+            query: SQL text, with ``%s`` placeholders if params are given
+            params: Optional query parameters
+
+        Returns:
+            List of row tuples
+        """
+        conn = self._get_connection()
+        try:
+            with conn.cursor() as cursor:
+                cursor.execute(query, params)
+                return cursor.fetchall()
+        except Exception:
+            self._rollback(conn)
+            raise
+
+    def classify_all(
+        self,
+        disk: Optional[str] = None,
+        batch_size: int = 1000,
+        progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
+    ) -> ProcessingStats:
+        """Classify all files in database
+
+        Every row of ``files_bak`` without a category gets one. Files that
+        neither the rules nor the ML classifier recognise are assigned
+        "temp/processing". Updates are committed once per batch.
+
+        Args:
+            disk: Optional disk filter
+            batch_size: Number of files to process per batch (must be >= 1)
+            progress_callback: Optional callback for progress updates; called
+                as (files_processed, total_files, stats) after every
+                committed batch, including the final partial one
+
+        Returns:
+            ProcessingStats with classification statistics
+
+        Raises:
+            ValueError: If batch_size is smaller than 1
+        """
+        # A zero batch size would divide by zero in the progress-log check
+        # below; a negative one would flush after every single row.
+        if batch_size < 1:
+            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
+
+        self.logger.section("Starting Classification")
+
+        # Only the path is needed: classification is path-based and the
+        # UPDATE below is keyed on the path.
+        if disk:
+            rows = self._fetch_all("""
+                SELECT path
+                FROM files_bak
+                WHERE disk = %s AND category IS NULL
+            """, (disk,))
+        else:
+            rows = self._fetch_all("""
+                SELECT path
+                FROM files_bak
+                WHERE category IS NULL
+            """)
+
+        total_files = len(rows)
+        self.logger.info(f"Found {total_files} files to classify")
+
+        stats = ProcessingStats()
+        batch = []
+        log_interval = batch_size * 10
+
+        conn = self._get_connection()
+        try:
+            with conn.cursor() as cursor:
+                for (path_str,) in rows:
+                    category = self.classify_path(Path(path_str))
+                    if category is None:
+                        category = "temp/processing"
+
+                    # Key the update on the exact string read from the
+                    # database: str(Path(...)) can normalise it ("./x",
+                    # "a//b", trailing slash) and then match no row.
+                    batch.append((category, path_str))
+                    stats.files_processed += 1
+
+                    if len(batch) < batch_size:
+                        continue
+
+                    self._update_categories(cursor, batch)
+                    conn.commit()
+                    batch.clear()
+
+                    if progress_callback:
+                        progress_callback(stats.files_processed, total_files, stats)
+
+                    if stats.files_processed % log_interval == 0:
+                        self.logger.progress(
+                            stats.files_processed,
+                            total_files,
+                            prefix="Files classified",
+                            elapsed_seconds=stats.elapsed_seconds
+                        )
+
+                # Flush whatever did not fill a whole batch
+                if batch:
+                    self._update_categories(cursor, batch)
+                    conn.commit()
+        except Exception:
+            self._rollback(conn)
+            raise
+
+        stats.files_succeeded = stats.files_processed
+
+        # Report the final partial batch too, so callers see completion
+        if batch and progress_callback:
+            progress_callback(stats.files_processed, total_files, stats)
+
+        self.logger.info(
+            f"Classification complete: {stats.files_processed} files in {stats.elapsed_seconds:.1f}s"
+        )
+
+        return stats
+
+    def _update_categories(self, cursor, batch: list[tuple[str, str]]):
+        """Update categories in batch
+
+        Does not commit; the caller owns the transaction.
+
+        Args:
+            cursor: Database cursor
+            batch: List of (category, path) tuples
+        """
+        from psycopg2.extras import execute_batch
+
+        query = """
+            UPDATE files_bak
+            SET category = %s
+            WHERE path = %s
+        """
+
+        execute_batch(cursor, query, batch)
+
+    def classify_path(self, path: Path) -> Optional[str]:
+        """Classify a single path
+
+        Args:
+            path: Path to classify
+
+        Returns:
+            Category name or None
+        """
+        # Try rules first
+        category = self.rule_classifier.classify(path)
+
+        # Try ML if available
+        if category is None and self.use_ml and self.ml_classifier:
+            category = self.ml_classifier.classify(path)
+
+        return category
+
+    def get_category_stats(self) -> dict[str, dict]:
+        """Get statistics by category
+
+        Returns:
+            Dictionary mapping category to a dict with the keys
+            'file_count' and 'total_size', in the order returned by the
+            query (largest total size first)
+        """
+        rows = self._fetch_all("""
+            SELECT
+                category,
+                COUNT(*) as file_count,
+                SUM(size) as total_size
+            FROM files_bak
+            WHERE category IS NOT NULL
+            GROUP BY category
+            ORDER BY total_size DESC
+        """)
+
+        return {
+            category: {'file_count': file_count, 'total_size': total_size}
+            for category, file_count, total_size in rows
+        }
+
+    def get_uncategorized_count(self) -> int:
+        """Get count of uncategorized files
+
+        Returns:
+            Number of files without category
+        """
+        rows = self._fetch_all(
+            "SELECT COUNT(*) FROM files_bak WHERE category IS NULL"
+        )
+        return rows[0][0]
+
+    def reclassify_category(
+        self,
+        old_category: str,
+        new_category: str
+    ) -> int:
+        """Reclassify all files in a category
+
+        The update is committed on success and rolled back on failure.
+
+        Args:
+            old_category: Current category
+            new_category: New category
+
+        Returns:
+            Number of files reclassified
+        """
+        self.logger.info(f"Reclassifying {old_category} -> {new_category}")
+
+        conn = self._get_connection()
+        try:
+            with conn.cursor() as cursor:
+                cursor.execute("""
+                    UPDATE files_bak
+                    SET category = %s
+                    WHERE category = %s
+                """, (new_category, old_category))
+                count = cursor.rowcount
+            conn.commit()
+        except Exception:
+            self._rollback(conn)
+            raise
+
+        self.logger.info(f"Reclassified {count} files")
+
+        return count
+
+    def train_ml_classifier(
+        self,
+        min_samples: int = 10
+    ) -> bool:
+        """Train ML classifier from existing categorized data
+
+        Categories with fewer than min_samples rows are left out of the
+        training set.
+
+        Args:
+            min_samples: Minimum samples per category
+
+        Returns:
+            True if training successful
+        """
+        from collections import Counter
+
+        if not self.use_ml or self.ml_classifier is None:
+            self.logger.warning("ML classifier not available")
+            return False
+
+        self.logger.subsection("Training ML Classifier")
+
+        rows = self._fetch_all("""
+            SELECT path, category
+            FROM files_bak
+            WHERE category IS NOT NULL
+        """)
+
+        if not rows:
+            self.logger.warning("No training data available")
+            return False
+
+        # Keep only categories with enough samples
+        category_counts = Counter(category for _, category in rows)
+        filtered_data = [
+            (Path(path), category)
+            for path, category in rows
+            if category_counts[category] >= min_samples
+        ]
+
+        if not filtered_data:
+            self.logger.warning(f"No categories with >= {min_samples} samples")
+            return False
+
+        self.logger.info(f"Training with {len(filtered_data)} samples")
+
+        # Training is best-effort: a failure is logged and reported through
+        # the return value instead of propagating.
+        try:
+            self.ml_classifier.train(filtered_data)
+            self.logger.info("ML classifier trained successfully")
+            return True
+        except Exception as e:
+            self.logger.error(f"Failed to train ML classifier: {e}")
+            return False
+
+    def get_all_categories(self) -> list[str]:
+        """Get all categories from database
+
+        Returns:
+            List of category names, sorted by the database
+        """
+        rows = self._fetch_all("""
+            SELECT DISTINCT category
+            FROM files_bak
+            WHERE category IS NOT NULL
+            ORDER BY category
+        """)
+
+        return [row[0] for row in rows]
+
+    def close(self):
+        """Close database connection"""
+        if self._connection and not self._connection.closed:
+            self._connection.close()
+
+    def __enter__(self):
+        """Context manager entry"""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit"""
+        self.close()
--- a/app/classification/ml.py
+++ b/app/classification/ml.py
@@ -0,0 +1,269 @@
+"""ML-based classification (optional, using sklearn if available)"""
+from pathlib import Path
+from typing import Optional, List, Tuple
+import pickle
+
+try:
+    from sklearn.feature_extraction.text import TfidfVectorizer
+    from sklearn.naive_bayes import MultinomialNB
+    from sklearn.pipeline import Pipeline
+    SKLEARN_AVAILABLE = True
+except ImportError:
+    SKLEARN_AVAILABLE = False
+
+
+class MLClassifier:
+    """Machine learning-based file classifier
+
+    Turns a path into a whitespace-separated token string and feeds it to a
+    TF-IDF + multinomial naive Bayes pipeline.
+    Requires scikit-learn to be installed.
+    """
+
+    # Maps the filename separators '-', '_' and '.' to spaces in one pass.
+    _NAME_SEPARATORS = str.maketrans("-_.", "   ")
+
+    def __init__(self):
+        """Initialize ML classifier
+
+        Raises:
+            ImportError: If scikit-learn is not installed
+        """
+        if not SKLEARN_AVAILABLE:
+            raise ImportError(
+                "scikit-learn is required for ML classification. "
+                "Install with: pip install scikit-learn"
+            )
+
+        self.model: Optional[Pipeline] = None
+        self.categories: List[str] = []
+        self._is_trained = False
+
+    def _extract_features(self, path: Path) -> str:
+        """Extract features from path
+
+        The feature string consists of every path component, an
+        ``ext:<suffix>`` token when the path has a suffix, and one
+        ``name:<part>`` token per piece of the filename after splitting on
+        '-', '_', '.' and whitespace.
+
+        Args:
+            path: Path to extract features from
+
+        Returns:
+            Feature string
+        """
+        tokens = list(path.parts)
+
+        if path.suffix:
+            tokens.append(f"ext:{path.suffix}")
+
+        name_parts = path.name.translate(self._NAME_SEPARATORS).split()
+        tokens.extend(f"name:{part}" for part in name_parts)
+
+        return ' '.join(tokens)
+
+    def train(self, training_data: List[Tuple[Path, str]]) -> None:
+        """Train the classifier
+
+        Replaces any previously trained model.
+
+        Args:
+            training_data: List of (path, category) tuples
+
+        Raises:
+            ValueError: If training_data is empty
+        """
+        if not training_data:
+            raise ValueError("Training data cannot be empty")
+
+        # Extract features and labels
+        X = [self._extract_features(path) for path, _ in training_data]
+        y = [category for _, category in training_data]
+
+        # Store unique categories
+        self.categories = sorted(set(y))
+
+        # Unigrams and bigrams, capped at 1000 features; min_df=1 keeps
+        # tokens that occur in a single path only.
+        self.model = Pipeline([
+            ('tfidf', TfidfVectorizer(
+                max_features=1000,
+                ngram_range=(1, 2),
+                min_df=1
+            )),
+            ('classifier', MultinomialNB())
+        ])
+
+        self.model.fit(X, y)
+        self._is_trained = True
+
+    def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]:
+        """Classify a file path
+
+        Args:
+            path: Path to classify
+            file_type: Optional file type hint (not used in ML classifier)
+
+        Returns:
+            Category name, or None if the model is not trained or the
+            prediction fails
+        """
+        if not self._is_trained or self.model is None:
+            return None
+
+        features = self._extract_features(path)
+
+        # Prediction is best-effort: any failure means "no category".
+        try:
+            prediction = self.model.predict([features])[0]
+        except Exception:
+            return None
+
+        # The model yields a numpy string scalar; hand callers a plain str
+        # as the signature promises.
+        return str(prediction)
+
+    def predict_proba(self, path: Path) -> dict[str, float]:
+        """Get prediction probabilities for all categories
+
+        Args:
+            path: Path to classify
+
+        Returns:
+            Dictionary mapping category to probability; empty if the model
+            is not trained or the prediction fails
+        """
+        if not self._is_trained or self.model is None:
+            return {}
+
+        features = self._extract_features(path)
+
+        try:
+            probabilities = self.model.predict_proba([features])[0]
+            # The probability columns follow the fitted model's class order,
+            # so label them from the model itself; self.categories is only
+            # a fallback for models that do not expose classes_.
+            labels = getattr(self.model, "classes_", self.categories)
+            return {
+                str(label): float(prob)
+                for label, prob in zip(labels, probabilities)
+            }
+        except Exception:
+            return {}
+
+    def save_model(self, model_path: Path) -> None:
+        """Save trained model to disk
+
+        Args:
+            model_path: Path to save model
+
+        Raises:
+            ValueError: If the model has not been trained
+        """
+        if not self._is_trained:
+            raise ValueError("Cannot save untrained model")
+
+        model_data = {
+            'model': self.model,
+            'categories': self.categories,
+            'is_trained': self._is_trained
+        }
+
+        with open(model_path, 'wb') as f:
+            pickle.dump(model_data, f)
+
+    def load_model(self, model_path: Path) -> None:
+        """Load trained model from disk
+
+        Expects a file written by save_model().
+
+        Args:
+            model_path: Path to model file
+        """
+        # SECURITY NOTE(review): pickle.load executes arbitrary code from
+        # the file; only load model files from a trusted location.
+        with open(model_path, 'rb') as f:
+            model_data = pickle.load(f)
+
+        self.model = model_data['model']
+        self.categories = model_data['categories']
+        self._is_trained = model_data['is_trained']
+
+    @property
+    def is_trained(self) -> bool:
+        """Check if model is trained"""
+        return self._is_trained
+
+
+class DummyMLClassifier:
+    """Stand-in classifier used when sklearn is not available
+
+    Query methods report that nothing is known; methods that would need a
+    real model raise NotImplementedError.
+    """
+
+    _UNAVAILABLE = "ML classification not available"
+
+    def __init__(self):
+        """Initialize dummy classifier (there is no state to set up)"""
+
+    def train(self, training_data: List[Tuple[Path, str]]) -> None:
+        """Always raise NotImplementedError: training needs scikit-learn"""
+        message = (
+            "ML classification requires scikit-learn. "
+            "Install with: pip install scikit-learn"
+        )
+        raise NotImplementedError(message)
+
+    def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]:
+        """Return None for every path"""
+        return None
+
+    def predict_proba(self, path: Path) -> dict[str, float]:
+        """Return an empty probability mapping for every path"""
+        return dict()
+
+    def save_model(self, model_path: Path) -> None:
+        """Always raise NotImplementedError: there is no model to save"""
+        raise NotImplementedError(self._UNAVAILABLE)
+
+    def load_model(self, model_path: Path) -> None:
+        """Always raise NotImplementedError: models cannot be loaded"""
+        raise NotImplementedError(self._UNAVAILABLE)
+
+    @property
+    def is_trained(self) -> bool:
+        """Always False: the dummy never holds a trained model"""
+        return False
+
+
+def create_ml_classifier() -> MLClassifier | DummyMLClassifier:
+    """Build the best classifier the environment supports
+
+    Returns:
+        MLClassifier when scikit-learn could be imported, otherwise a
+        DummyMLClassifier
+    """
+    return MLClassifier() if SKLEARN_AVAILABLE else DummyMLClassifier()
+
+
+def train_from_database(
+    db_connection,
+    min_samples_per_category: int = 10
+) -> MLClassifier | DummyMLClassifier:
+    """Train ML classifier from database
+
+    Reads every categorized row of ``files_bak`` and trains on the
+    categories that have at least min_samples_per_category rows.
+
+    Args:
+        db_connection: Database connection
+        min_samples_per_category: Minimum samples required per category
+
+    Returns:
+        The classifier from create_ml_classifier(). It is returned
+        untrained when scikit-learn is unavailable or when no category has
+        enough samples.
+    """
+    from collections import Counter
+
+    classifier = create_ml_classifier()
+
+    if isinstance(classifier, DummyMLClassifier):
+        return classifier
+
+    # Query classified files from database; close the cursor even when the
+    # query fails.
+    cursor = db_connection.cursor()
+    try:
+        cursor.execute("""
+            SELECT path, category
+            FROM files_bak
+            WHERE category IS NOT NULL
+        """)
+        rows = cursor.fetchall()
+    finally:
+        cursor.close()
+
+    # Filter to categories with enough samples
+    category_counts = Counter(category for _, category in rows)
+    filtered_data = [
+        (Path(path), category)
+        for path, category in rows
+        if category_counts[category] >= min_samples_per_category
+    ]
+
+    if filtered_data:
+        classifier.train(filtered_data)
+
+    return classifier
--- a/app/classification/rules.py
+++ b/app/classification/rules.py
@@ -0,0 +1,282 @@
+"""Rule-based classification engine"""
+from pathlib import Path
+from typing import Optional
+import fnmatch
+
+from ._protocols import ClassificationRule
+
+
+class RuleBasedClassifier:
+    """Rule-based file classifier using pattern matching
+
+    Holds a list of ClassificationRule objects sorted by descending
+    priority and returns the category of the first rule with a matching
+    pattern.
+    """
+
+    def __init__(self):
+        """Initialize rule-based classifier with the default rules"""
+        self.rules: list[ClassificationRule] = []
+        self._load_default_rules()
+
+    def _load_default_rules(self):
+        """Load default classification rules based on ARCHITECTURE.md"""
+        # Each entry: (name, category, priority, description, patterns).
+        # The order matters for rules that share a priority, because
+        # add_rule() keeps insertion order among equal priorities.
+        defaults = (
+            # Build artifacts and caches
+            ("maven_cache", "artifacts/java/maven", 10,
+             "Maven repository and cache",
+             ["**/.m2/**", "**/.maven/**", "**/maven-central-cache/**"]),
+            ("gradle_cache", "artifacts/java/gradle", 10,
+             "Gradle cache and artifacts",
+             ["**/.gradle/**", "**/gradle-cache/**", "**/gradle-build-cache/**"]),
+            ("python_cache", "cache/pycache", 10,
+             "Python cache files",
+             ["**/__pycache__/**", "**/*.pyc", "**/*.pyo"]),
+            ("python_artifacts", "artifacts/python", 10,
+             "Python package artifacts",
+             ["**/pip-cache/**", "**/pypi-cache/**", "**/wheelhouse/**"]),
+            ("node_modules", "cache/node_modules-archive", 10,
+             "Node.js modules",
+             ["**/node_modules/**"]),
+            ("node_cache", "artifacts/node", 10,
+             "Node.js package managers cache",
+             ["**/.npm/**", "**/npm-registry/**", "**/yarn-cache/**", "**/pnpm-store/**"]),
+            ("go_cache", "artifacts/go", 10,
+             "Go module cache",
+             ["**/goproxy-cache/**", "**/go/pkg/mod/**", "**/go-module-cache/**"]),
+
+            # Version control
+            ("git_repos", "development/git-infrastructure", 15,
+             "Git repositories and infrastructure",
+             ["**/.git/**", "**/gitea/repositories/**"]),
+            ("gitea", "development/gitea", 12,
+             "Gitea server data",
+             ["**/gitea/**"]),
+
+            # Databases
+            ("postgresql", "databases/postgresql", 10,
+             "PostgreSQL databases",
+             ["**/postgresql/**", "**/postgres/**", "**/*.sql"]),
+            ("mysql", "databases/mysql", 10,
+             "MySQL/MariaDB databases",
+             ["**/mysql/**", "**/mariadb/**"]),
+            ("mongodb", "databases/mongodb", 10,
+             "MongoDB databases",
+             ["**/mongodb/**", "**/mongo/**"]),
+            ("redis", "databases/redis", 10,
+             "Redis databases",
+             ["**/redis/**", "**/*.rdb"]),
+            ("sqlite", "databases/sqlite", 8,
+             "SQLite databases",
+             ["**/*.db", "**/*.sqlite", "**/*.sqlite3"]),
+
+            # LLM and AI models
+            ("llm_models", "cache/llm-models", 12,
+             "LLM and AI model files",
+             [
+                 "**/hugging-face/**",
+                 "**/huggingface/**",
+                 "**/.cache/huggingface/**",
+                 "**/models/**/*.bin",
+                 "**/models/**/*.onnx",
+                 "**/models/**/*.safetensors",
+                 "**/llm*/**",
+                 "**/openai-cache/**",
+             ]),
+
+            # Docker and containers
+            ("docker_volumes", "apps/volumes/docker-volumes", 10,
+             "Docker volumes",
+             ["**/docker/volumes/**", "**/var/lib/docker/volumes/**"]),
+            ("app_data", "apps/volumes/app-data", 8,
+             "Application data",
+             ["**/app-data/**", "**/application-data/**"]),
+
+            # Build outputs
+            ("build_output", "development/build-tools", 5,
+             "Build output directories",
+             ["**/target/**", "**/build/**", "**/dist/**", "**/out/**"]),
+
+            # Backups
+            ("system_backups", "backups/system", 10,
+             "System backups",
+             ["**/backup/**", "**/backups/**", "**/*.bak", "**/*.backup"]),
+            ("database_backups", "backups/database", 11,
+             "Database backups",
+             ["**/*.sql.gz", "**/*.dump", "**/db-backup/**"]),
+
+            # Archives
+            ("archives", "backups/archive", 5,
+             "Archive files",
+             ["**/*.tar", "**/*.tar.gz", "**/*.tgz", "**/*.zip", "**/*.7z"]),
+        )
+
+        for name, category, priority, description, patterns in defaults:
+            self.add_rule(ClassificationRule(
+                name=name,
+                category=category,
+                patterns=patterns,
+                priority=priority,
+                description=description
+            ))
+
+    def add_rule(self, rule: ClassificationRule) -> None:
+        """Add a classification rule
+
+        Args:
+            rule: Rule to add
+        """
+        self.rules.append(rule)
+        # Sort rules by priority (higher priority first). The sort is
+        # stable, so rules with equal priority stay in insertion order.
+        self.rules.sort(key=lambda r: r.priority, reverse=True)
+
+    def remove_rule(self, rule_name: str) -> None:
+        """Remove a rule by name
+
+        Every rule carrying the name is removed; unknown names are ignored.
+
+        Args:
+            rule_name: Name of rule to remove
+        """
+        self.rules = [r for r in self.rules if r.name != rule_name]
+
+    def match_path(self, path: Path) -> Optional[str]:
+        """Match path against rules
+
+        Rules are tried in priority order and the first matching pattern
+        decides the category.
+
+        Args:
+            path: Path to match
+
+        Returns:
+            Category name or None if no match
+        """
+        path_str = str(path)
+
+        # fnmatch gives "**/" no special meaning: the pattern needs a
+        # literal "/" in front of the matched component. Paths without a
+        # leading "/" ("module.pyc", ".git/config") are therefore also
+        # tried with one prepended so the "**/..." patterns can match.
+        candidates = [path_str]
+        if not path_str.startswith("/"):
+            candidates.append("/" + path_str)
+
+        for rule in self.rules:
+            for pattern in rule.patterns:
+                if any(fnmatch.fnmatch(candidate, pattern) for candidate in candidates):
+                    return rule.category
+
+        return None
+
+    def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]:
+        """Classify a file path
+
+        Args:
+            path: Path to classify
+            file_type: Optional file type hint (not used by the rules)
+
+        Returns:
+            Category name or None if no match
+        """
+        return self.match_path(path)
+
+    def get_category_rules(self, category: str) -> list[ClassificationRule]:
+        """Get all rules for a category
+
+        Args:
+            category: Category name
+
+        Returns:
+            List of rules for the category
+        """
+        return [r for r in self.rules if r.category == category]
+
+    def get_all_categories(self) -> set[str]:
+        """Get all defined categories
+
+        Returns:
+            Set of category names
+        """
+        return {r.category for r in self.rules}
+
+    def get_rules_by_priority(self, min_priority: int = 0) -> list[ClassificationRule]:
+        """Get rules at or above a minimum priority
+
+        Args:
+            min_priority: Minimum priority threshold
+
+        Returns:
+            List of rules with priority >= min_priority
+        """
+        return [r for r in self.rules if r.priority >= min_priority]