initial commit
This commit is contained in:
@@ -0,0 +1,282 @@
|
||||
"""Rule-based classification engine"""
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
import fnmatch
|
||||
|
||||
from ._protocols import ClassificationRule
|
||||
|
||||
|
||||
class RuleBasedClassifier:
    """Rule-based file classifier using glob-style pattern matching.

    Rules live in ``self.rules`` ordered by descending priority. A path is
    assigned the category of the first rule that has a matching pattern.
    """

    # Default rule table, in registration order:
    # (name, category, patterns, priority, description)
    _DEFAULT_RULES = (
        # Build artifacts and caches
        (
            "maven_cache",
            "artifacts/java/maven",
            ("**/.m2/**", "**/.maven/**", "**/maven-central-cache/**"),
            10,
            "Maven repository and cache",
        ),
        (
            "gradle_cache",
            "artifacts/java/gradle",
            ("**/.gradle/**", "**/gradle-cache/**", "**/gradle-build-cache/**"),
            10,
            "Gradle cache and artifacts",
        ),
        (
            "python_cache",
            "cache/pycache",
            ("**/__pycache__/**", "**/*.pyc", "**/*.pyo"),
            10,
            "Python cache files",
        ),
        (
            "python_artifacts",
            "artifacts/python",
            ("**/pip-cache/**", "**/pypi-cache/**", "**/wheelhouse/**"),
            10,
            "Python package artifacts",
        ),
        (
            "node_modules",
            "cache/node_modules-archive",
            ("**/node_modules/**",),
            10,
            "Node.js modules",
        ),
        (
            "node_cache",
            "artifacts/node",
            ("**/.npm/**", "**/npm-registry/**", "**/yarn-cache/**", "**/pnpm-store/**"),
            10,
            "Node.js package managers cache",
        ),
        (
            "go_cache",
            "artifacts/go",
            ("**/goproxy-cache/**", "**/go/pkg/mod/**", "**/go-module-cache/**"),
            10,
            "Go module cache",
        ),
        # Version control
        (
            "git_repos",
            "development/git-infrastructure",
            ("**/.git/**", "**/gitea/repositories/**"),
            15,
            "Git repositories and infrastructure",
        ),
        (
            "gitea",
            "development/gitea",
            ("**/gitea/**",),
            12,
            "Gitea server data",
        ),
        # Databases
        (
            "postgresql",
            "databases/postgresql",
            ("**/postgresql/**", "**/postgres/**", "**/*.sql"),
            10,
            "PostgreSQL databases",
        ),
        (
            "mysql",
            "databases/mysql",
            ("**/mysql/**", "**/mariadb/**"),
            10,
            "MySQL/MariaDB databases",
        ),
        (
            "mongodb",
            "databases/mongodb",
            ("**/mongodb/**", "**/mongo/**"),
            10,
            "MongoDB databases",
        ),
        (
            "redis",
            "databases/redis",
            ("**/redis/**", "**/*.rdb"),
            10,
            "Redis databases",
        ),
        (
            "sqlite",
            "databases/sqlite",
            ("**/*.db", "**/*.sqlite", "**/*.sqlite3"),
            8,
            "SQLite databases",
        ),
        # LLM and AI models
        (
            "llm_models",
            "cache/llm-models",
            (
                "**/hugging-face/**",
                "**/huggingface/**",
                "**/.cache/huggingface/**",
                "**/models/**/*.bin",
                "**/models/**/*.onnx",
                "**/models/**/*.safetensors",
                "**/llm*/**",
                "**/openai-cache/**",
            ),
            12,
            "LLM and AI model files",
        ),
        # Docker and containers
        (
            "docker_volumes",
            "apps/volumes/docker-volumes",
            ("**/docker/volumes/**", "**/var/lib/docker/volumes/**"),
            10,
            "Docker volumes",
        ),
        (
            "app_data",
            "apps/volumes/app-data",
            ("**/app-data/**", "**/application-data/**"),
            8,
            "Application data",
        ),
        # Build outputs
        (
            "build_output",
            "development/build-tools",
            ("**/target/**", "**/build/**", "**/dist/**", "**/out/**"),
            5,
            "Build output directories",
        ),
        # Backups
        (
            "system_backups",
            "backups/system",
            ("**/backup/**", "**/backups/**", "**/*.bak", "**/*.backup"),
            10,
            "System backups",
        ),
        (
            "database_backups",
            "backups/database",
            ("**/*.sql.gz", "**/*.dump", "**/db-backup/**"),
            11,
            "Database backups",
        ),
        # Archives
        (
            "archives",
            "backups/archive",
            ("**/*.tar", "**/*.tar.gz", "**/*.tgz", "**/*.zip", "**/*.7z"),
            5,
            "Archive files",
        ),
    )

    def __init__(self):
        """Initialize the classifier and register the default rules."""
        self.rules: list["ClassificationRule"] = []
        self._load_default_rules()

    def _load_default_rules(self) -> None:
        """Load default classification rules based on ARCHITECTURE.md."""
        for name, category, patterns, priority, description in self._DEFAULT_RULES:
            self.add_rule(ClassificationRule(
                name=name,
                category=category,
                # Copy so every rule owns an independent, mutable pattern list.
                patterns=list(patterns),
                priority=priority,
                description=description,
            ))

    def add_rule(self, rule: "ClassificationRule") -> None:
        """Add a classification rule.

        Args:
            rule: Rule to add
        """
        self.rules.append(rule)
        # Sort rules by priority (higher priority first). The sort is stable,
        # so rules sharing a priority stay in the order they were added.
        self.rules.sort(key=lambda r: r.priority, reverse=True)

    def remove_rule(self, rule_name: str) -> None:
        """Remove every rule with the given name (no-op when none match).

        Args:
            rule_name: Name of rule to remove
        """
        self.rules = [r for r in self.rules if r.name != rule_name]

    @staticmethod
    def _pattern_variants(pattern: str) -> list[str]:
        """Expand *pattern* so each ``**/`` may also match zero directories.

        ``fnmatch`` has no notion of ``**``: it treats ``**/`` as "anything
        followed by a slash", which makes ``**/*.tar`` miss ``x.tar`` and
        ``**/models/**/*.bin`` miss ``models/w.bin``. The returned list starts
        with the unmodified pattern, followed by copies in which directory-level
        ``**/`` segments have been dropped.
        """
        head, *tail = pattern.split("**/")
        variants = [head]
        for piece in tail:
            expanded = []
            for prefix in variants:
                expanded.append(prefix + "**/" + piece)
                # Only collapse a ``**/`` that stands for a whole directory
                # level, i.e. at the pattern start or right after a slash.
                if not prefix or prefix.endswith("/"):
                    expanded.append(prefix + piece)
            variants = expanded
        return variants

    def match_path(self, path: Path) -> Optional[str]:
        """Match path against rules.

        Rules are tried in priority order and the first matching pattern wins.
        A ``**/`` in a pattern matches zero or more directories, so relative
        paths such as ``backup/db.tar`` classify the same way as
        ``/srv/backup/db.tar``.

        Args:
            path: Path to match

        Returns:
            Category name or None if no match
        """
        path_str = str(path)

        # Try to match each rule in priority order
        for rule in self.rules:
            for pattern in rule.patterns:
                if any(
                    fnmatch.fnmatch(path_str, variant)
                    for variant in self._pattern_variants(pattern)
                ):
                    return rule.category

        return None

    def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]:
        """Classify a file path.

        Args:
            path: Path to classify
            file_type: Optional file type hint (currently ignored; the result
                depends on the path alone)

        Returns:
            Category name or None if no match
        """
        return self.match_path(path)

    def get_category_rules(self, category: str) -> list["ClassificationRule"]:
        """Get all rules for a category.

        Args:
            category: Category name

        Returns:
            List of rules for the category
        """
        return [r for r in self.rules if r.category == category]

    def get_all_categories(self) -> set[str]:
        """Get all defined categories.

        Returns:
            Set of category names
        """
        return {r.category for r in self.rules}

    def get_rules_by_priority(self, min_priority: int = 0) -> list["ClassificationRule"]:
        """Get rules at or above a minimum priority.

        Args:
            min_priority: Minimum priority threshold

        Returns:
            List of rules with priority >= min_priority
        """
        return [r for r in self.rules if r.priority >= min_priority]
|
||||
|
||||
Reference in New Issue
Block a user