283 lines
8.7 KiB
Python
283 lines
8.7 KiB
Python
"""Rule-based classification engine"""
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
import fnmatch
|
|
|
|
from ._protocols import ClassificationRule
|
|
|
|
|
|
class RuleBasedClassifier:
    """Rule-based file classifier using glob-style pattern matching.

    Rules live in ``self.rules``, ordered by descending ``priority``. The
    category of the first rule with a matching pattern is the classification
    result. Rule objects are expected to expose ``name``, ``category``,
    ``patterns`` and ``priority`` attributes.
    """

    # Default rules as (name, category, patterns, priority, description).
    # Patterns are stored as tuples and copied into a fresh list per rule so
    # that classifier instances never share mutable pattern lists.
    _DEFAULT_RULES = (
        # Build artifacts and caches
        ("maven_cache", "artifacts/java/maven",
         ("**/.m2/**", "**/.maven/**", "**/maven-central-cache/**"),
         10, "Maven repository and cache"),
        ("gradle_cache", "artifacts/java/gradle",
         ("**/.gradle/**", "**/gradle-cache/**", "**/gradle-build-cache/**"),
         10, "Gradle cache and artifacts"),
        ("python_cache", "cache/pycache",
         ("**/__pycache__/**", "**/*.pyc", "**/*.pyo"),
         10, "Python cache files"),
        ("python_artifacts", "artifacts/python",
         ("**/pip-cache/**", "**/pypi-cache/**", "**/wheelhouse/**"),
         10, "Python package artifacts"),
        ("node_modules", "cache/node_modules-archive",
         ("**/node_modules/**",),
         10, "Node.js modules"),
        ("node_cache", "artifacts/node",
         ("**/.npm/**", "**/npm-registry/**", "**/yarn-cache/**", "**/pnpm-store/**"),
         10, "Node.js package managers cache"),
        ("go_cache", "artifacts/go",
         ("**/goproxy-cache/**", "**/go/pkg/mod/**", "**/go-module-cache/**"),
         10, "Go module cache"),
        # Version control
        ("git_repos", "development/git-infrastructure",
         ("**/.git/**", "**/gitea/repositories/**"),
         15, "Git repositories and infrastructure"),
        ("gitea", "development/gitea",
         ("**/gitea/**",),
         12, "Gitea server data"),
        # Databases
        ("postgresql", "databases/postgresql",
         ("**/postgresql/**", "**/postgres/**", "**/*.sql"),
         10, "PostgreSQL databases"),
        ("mysql", "databases/mysql",
         ("**/mysql/**", "**/mariadb/**"),
         10, "MySQL/MariaDB databases"),
        ("mongodb", "databases/mongodb",
         ("**/mongodb/**", "**/mongo/**"),
         10, "MongoDB databases"),
        ("redis", "databases/redis",
         ("**/redis/**", "**/*.rdb"),
         10, "Redis databases"),
        ("sqlite", "databases/sqlite",
         ("**/*.db", "**/*.sqlite", "**/*.sqlite3"),
         8, "SQLite databases"),
        # LLM and AI models
        ("llm_models", "cache/llm-models",
         (
             "**/hugging-face/**",
             "**/huggingface/**",
             "**/.cache/huggingface/**",
             "**/models/**/*.bin",
             "**/models/**/*.onnx",
             "**/models/**/*.safetensors",
             "**/llm*/**",
             "**/openai-cache/**",
         ),
         12, "LLM and AI model files"),
        # Docker and containers
        ("docker_volumes", "apps/volumes/docker-volumes",
         ("**/docker/volumes/**", "**/var/lib/docker/volumes/**"),
         10, "Docker volumes"),
        ("app_data", "apps/volumes/app-data",
         ("**/app-data/**", "**/application-data/**"),
         8, "Application data"),
        # Build outputs
        ("build_output", "development/build-tools",
         ("**/target/**", "**/build/**", "**/dist/**", "**/out/**"),
         5, "Build output directories"),
        # Backups
        ("system_backups", "backups/system",
         ("**/backup/**", "**/backups/**", "**/*.bak", "**/*.backup"),
         10, "System backups"),
        ("database_backups", "backups/database",
         ("**/*.sql.gz", "**/*.dump", "**/db-backup/**"),
         11, "Database backups"),
        # Archives
        ("archives", "backups/archive",
         ("**/*.tar", "**/*.tar.gz", "**/*.tgz", "**/*.zip", "**/*.7z"),
         5, "Archive files"),
    )

    def __init__(self):
        """Create a classifier pre-populated with the default rules."""
        self.rules: list[ClassificationRule] = []
        self._load_default_rules()

    def _load_default_rules(self):
        """Load default classification rules based on ARCHITECTURE.md.

        Each entry of ``_DEFAULT_RULES`` is turned into a
        ``ClassificationRule`` and registered through ``add_rule`` in table
        order, so rules with equal priority keep that relative order.
        """
        for name, category, patterns, priority, description in self._DEFAULT_RULES:
            self.add_rule(ClassificationRule(
                name=name,
                category=category,
                patterns=list(patterns),
                priority=priority,
                description=description,
            ))

    def add_rule(self, rule: "ClassificationRule") -> None:
        """Add a classification rule.

        Args:
            rule: Rule to add.
        """
        self.rules.append(rule)
        # Highest priority first. list.sort is stable, so rules sharing a
        # priority stay in insertion order.
        self.rules.sort(key=lambda r: r.priority, reverse=True)

    def remove_rule(self, rule_name: str) -> None:
        """Remove every rule with the given name.

        Unknown names are ignored.

        Args:
            rule_name: Name of rule to remove.
        """
        self.rules = [r for r in self.rules if r.name != rule_name]

    @staticmethod
    def _pattern_matches(path_str: str, pattern: str) -> bool:
        """Return True if ``path_str`` matches the glob-style ``pattern``.

        ``fnmatch`` has no notion of ``**``; it behaves like ``*`` (which
        also crosses path separators). A leading ``**/`` therefore still
        demands a literal separator, which relative paths such as
        ``mod.pyc`` or ``node_modules/x.js`` do not have. To give ``**/``
        its usual "zero or more directories" meaning, the pattern is tried
        a second time with that prefix removed.
        """
        if fnmatch.fnmatch(path_str, pattern):
            return True
        return pattern.startswith("**/") and fnmatch.fnmatch(path_str, pattern[3:])

    def match_path(self, path: Path) -> Optional[str]:
        """Match a path against the rules in priority order.

        Args:
            path: Path to match (absolute or relative).

        Returns:
            Category of the first rule with a matching pattern, or None if
            no rule matches.
        """
        path_str = str(path)

        for rule in self.rules:
            for pattern in rule.patterns:
                if self._pattern_matches(path_str, pattern):
                    return rule.category

        return None

    def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]:
        """Classify a file path.

        Args:
            path: Path to classify.
            file_type: Optional file type hint. Accepted for interface
                compatibility; it is currently not used by this classifier.

        Returns:
            Category name or None if no match.
        """
        return self.match_path(path)

    def get_category_rules(self, category: str) -> "list[ClassificationRule]":
        """Get all rules for a category.

        Args:
            category: Category name.

        Returns:
            List of rules whose category equals ``category``.
        """
        return [r for r in self.rules if r.category == category]

    def get_all_categories(self) -> set[str]:
        """Get all defined categories.

        Returns:
            Set of category names.
        """
        return {r.category for r in self.rules}

    def get_rules_by_priority(self, min_priority: int = 0) -> "list[ClassificationRule]":
        """Get rules at or above a minimum priority.

        Args:
            min_priority: Minimum priority threshold.

        Returns:
            List of rules with priority >= min_priority.
        """
        return [r for r in self.rules if r.priority >= min_priority]
|