"""Rule-based classification engine""" from pathlib import Path from typing import Optional import fnmatch from ._protocols import ClassificationRule class RuleBasedClassifier: """Rule-based file classifier using pattern matching""" def __init__(self): """Initialize rule-based classifier""" self.rules: list[ClassificationRule] = [] self._load_default_rules() def _load_default_rules(self): """Load default classification rules based on ARCHITECTURE.md""" # Build artifacts and caches self.add_rule(ClassificationRule( name="maven_cache", category="artifacts/java/maven", patterns=["**/.m2/**", "**/.maven/**", "**/maven-central-cache/**"], priority=10, description="Maven repository and cache" )) self.add_rule(ClassificationRule( name="gradle_cache", category="artifacts/java/gradle", patterns=["**/.gradle/**", "**/gradle-cache/**", "**/gradle-build-cache/**"], priority=10, description="Gradle cache and artifacts" )) self.add_rule(ClassificationRule( name="python_cache", category="cache/pycache", patterns=["**/__pycache__/**", "**/*.pyc", "**/*.pyo"], priority=10, description="Python cache files" )) self.add_rule(ClassificationRule( name="python_artifacts", category="artifacts/python", patterns=["**/pip-cache/**", "**/pypi-cache/**", "**/wheelhouse/**"], priority=10, description="Python package artifacts" )) self.add_rule(ClassificationRule( name="node_modules", category="cache/node_modules-archive", patterns=["**/node_modules/**"], priority=10, description="Node.js modules" )) self.add_rule(ClassificationRule( name="node_cache", category="artifacts/node", patterns=["**/.npm/**", "**/npm-registry/**", "**/yarn-cache/**", "**/pnpm-store/**"], priority=10, description="Node.js package managers cache" )) self.add_rule(ClassificationRule( name="go_cache", category="artifacts/go", patterns=["**/goproxy-cache/**", "**/go/pkg/mod/**", "**/go-module-cache/**"], priority=10, description="Go module cache" )) # Version control self.add_rule(ClassificationRule( name="git_repos", category="development/git-infrastructure", patterns=["**/.git/**", "**/gitea/repositories/**"], priority=15, description="Git repositories and infrastructure" )) self.add_rule(ClassificationRule( name="gitea", category="development/gitea", patterns=["**/gitea/**"], priority=12, description="Gitea server data" )) # Databases self.add_rule(ClassificationRule( name="postgresql", category="databases/postgresql", patterns=["**/postgresql/**", "**/postgres/**", "**/*.sql"], priority=10, description="PostgreSQL databases" )) self.add_rule(ClassificationRule( name="mysql", category="databases/mysql", patterns=["**/mysql/**", "**/mariadb/**"], priority=10, description="MySQL/MariaDB databases" )) self.add_rule(ClassificationRule( name="mongodb", category="databases/mongodb", patterns=["**/mongodb/**", "**/mongo/**"], priority=10, description="MongoDB databases" )) self.add_rule(ClassificationRule( name="redis", category="databases/redis", patterns=["**/redis/**", "**/*.rdb"], priority=10, description="Redis databases" )) self.add_rule(ClassificationRule( name="sqlite", category="databases/sqlite", patterns=["**/*.db", "**/*.sqlite", "**/*.sqlite3"], priority=8, description="SQLite databases" )) # LLM and AI models self.add_rule(ClassificationRule( name="llm_models", category="cache/llm-models", patterns=[ "**/hugging-face/**", "**/huggingface/**", "**/.cache/huggingface/**", "**/models/**/*.bin", "**/models/**/*.onnx", "**/models/**/*.safetensors", "**/llm*/**", "**/openai-cache/**" ], priority=12, description="LLM and AI model files" )) # Docker and containers self.add_rule(ClassificationRule( name="docker_volumes", category="apps/volumes/docker-volumes", patterns=["**/docker/volumes/**", "**/var/lib/docker/volumes/**"], priority=10, description="Docker volumes" )) self.add_rule(ClassificationRule( name="app_data", category="apps/volumes/app-data", patterns=["**/app-data/**", "**/application-data/**"], priority=8, description="Application data" )) # Build outputs self.add_rule(ClassificationRule( name="build_output", category="development/build-tools", patterns=["**/target/**", "**/build/**", "**/dist/**", "**/out/**"], priority=5, description="Build output directories" )) # Backups self.add_rule(ClassificationRule( name="system_backups", category="backups/system", patterns=["**/backup/**", "**/backups/**", "**/*.bak", "**/*.backup"], priority=10, description="System backups" )) self.add_rule(ClassificationRule( name="database_backups", category="backups/database", patterns=["**/*.sql.gz", "**/*.dump", "**/db-backup/**"], priority=11, description="Database backups" )) # Archives self.add_rule(ClassificationRule( name="archives", category="backups/archive", patterns=["**/*.tar", "**/*.tar.gz", "**/*.tgz", "**/*.zip", "**/*.7z"], priority=5, description="Archive files" )) def add_rule(self, rule: ClassificationRule) -> None: """Add a classification rule Args: rule: Rule to add """ self.rules.append(rule) # Sort rules by priority (higher priority first) self.rules.sort(key=lambda r: r.priority, reverse=True) def remove_rule(self, rule_name: str) -> None: """Remove a rule by name Args: rule_name: Name of rule to remove """ self.rules = [r for r in self.rules if r.name != rule_name] def match_path(self, path: Path) -> Optional[str]: """Match path against rules Args: path: Path to match Returns: Category name or None if no match """ path_str = str(path) # Try to match each rule in priority order for rule in self.rules: for pattern in rule.patterns: if fnmatch.fnmatch(path_str, pattern): return rule.category return None def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]: """Classify a file path Args: path: Path to classify file_type: Optional file type hint Returns: Category name or None if no match """ return self.match_path(path) def get_category_rules(self, category: str) -> list[ClassificationRule]: """Get all rules for a category Args: category: Category name Returns: List of rules for the category """ return [r for r in self.rules if r.category == category] def get_all_categories(self) -> set[str]: """Get all defined categories Returns: Set of category names """ return {r.category for r in self.rules} def get_rules_by_priority(self, min_priority: int = 0) -> list[ClassificationRule]: """Get rules above a minimum priority Args: min_priority: Minimum priority threshold Returns: List of rules with priority >= min_priority """ return [r for r in self.rules if r.priority >= min_priority]