61 lines
5.6 KiB
Python
61 lines
5.6 KiB
Python
from pathlib import Path
|
|
from typing import Optional
|
|
import fnmatch
|
|
from ._protocols import ClassificationRule
|
|
|
|
class RuleBasedClassifier:
|
|
|
|
def __init__(self):
|
|
self.rules: list[ClassificationRule] = []
|
|
self._load_default_rules()
|
|
|
|
def _load_default_rules(self):
|
|
self.add_rule(ClassificationRule(name='maven_cache', category='artifacts/java/maven', patterns=['**/.m2/**', '**/.maven/**', '**/maven-central-cache/**'], priority=10, description='Maven repository and cache'))
|
|
self.add_rule(ClassificationRule(name='gradle_cache', category='artifacts/java/gradle', patterns=['**/.gradle/**', '**/gradle-cache/**', '**/gradle-build-cache/**'], priority=10, description='Gradle cache and artifacts'))
|
|
self.add_rule(ClassificationRule(name='python_cache', category='cache/pycache', patterns=['**/__pycache__/**', '**/*.pyc', '**/*.pyo'], priority=10, description='Python cache files'))
|
|
self.add_rule(ClassificationRule(name='python_artifacts', category='artifacts/python', patterns=['**/pip-cache/**', '**/pypi-cache/**', '**/wheelhouse/**'], priority=10, description='Python package artifacts'))
|
|
self.add_rule(ClassificationRule(name='node_modules', category='cache/node_modules-archive', patterns=['**/node_modules/**'], priority=10, description='Node.js modules'))
|
|
self.add_rule(ClassificationRule(name='node_cache', category='artifacts/node', patterns=['**/.npm/**', '**/npm-registry/**', '**/yarn-cache/**', '**/pnpm-store/**'], priority=10, description='Node.js package managers cache'))
|
|
self.add_rule(ClassificationRule(name='go_cache', category='artifacts/go', patterns=['**/goproxy-cache/**', '**/go/pkg/mod/**', '**/go-module-cache/**'], priority=10, description='Go module cache'))
|
|
self.add_rule(ClassificationRule(name='git_repos', category='development/git-infrastructure', patterns=['**/.git/**', '**/gitea/repositories/**'], priority=15, description='Git repositories and infrastructure'))
|
|
self.add_rule(ClassificationRule(name='gitea', category='development/gitea', patterns=['**/gitea/**'], priority=12, description='Gitea server data'))
|
|
self.add_rule(ClassificationRule(name='postgresql', category='databases/postgresql', patterns=['**/postgresql/**', '**/postgres/**', '**/*.sql'], priority=10, description='PostgreSQL databases'))
|
|
self.add_rule(ClassificationRule(name='mysql', category='databases/mysql', patterns=['**/mysql/**', '**/mariadb/**'], priority=10, description='MySQL/MariaDB databases'))
|
|
self.add_rule(ClassificationRule(name='mongodb', category='databases/mongodb', patterns=['**/mongodb/**', '**/mongo/**'], priority=10, description='MongoDB databases'))
|
|
self.add_rule(ClassificationRule(name='redis', category='databases/redis', patterns=['**/redis/**', '**/*.rdb'], priority=10, description='Redis databases'))
|
|
self.add_rule(ClassificationRule(name='sqlite', category='databases/sqlite', patterns=['**/*.db', '**/*.sqlite', '**/*.sqlite3'], priority=8, description='SQLite databases'))
|
|
self.add_rule(ClassificationRule(name='llm_models', category='cache/llm-models', patterns=['**/hugging-face/**', '**/huggingface/**', '**/.cache/huggingface/**', '**/models/**/*.bin', '**/models/**/*.onnx', '**/models/**/*.safetensors', '**/llm*/**', '**/openai-cache/**'], priority=12, description='LLM and AI model files'))
|
|
self.add_rule(ClassificationRule(name='docker_volumes', category='apps/volumes/docker-volumes', patterns=['**/docker/volumes/**', '**/var/lib/docker/volumes/**'], priority=10, description='Docker volumes'))
|
|
self.add_rule(ClassificationRule(name='app_data', category='apps/volumes/app-data', patterns=['**/app-data/**', '**/application-data/**'], priority=8, description='Application data'))
|
|
self.add_rule(ClassificationRule(name='build_output', category='development/build-tools', patterns=['**/target/**', '**/build/**', '**/dist/**', '**/out/**'], priority=5, description='Build output directories'))
|
|
self.add_rule(ClassificationRule(name='system_backups', category='backups/system', patterns=['**/backup/**', '**/backups/**', '**/*.bak', '**/*.backup'], priority=10, description='System backups'))
|
|
self.add_rule(ClassificationRule(name='database_backups', category='backups/database', patterns=['**/*.sql.gz', '**/*.dump', '**/db-backup/**'], priority=11, description='Database backups'))
|
|
self.add_rule(ClassificationRule(name='archives', category='backups/archive', patterns=['**/*.tar', '**/*.tar.gz', '**/*.tgz', '**/*.zip', '**/*.7z'], priority=5, description='Archive files'))
|
|
|
|
def add_rule(self, rule: ClassificationRule) -> None:
|
|
self.rules.append(rule)
|
|
self.rules.sort(key=lambda r: r.priority, reverse=True)
|
|
|
|
def remove_rule(self, rule_name: str) -> None:
|
|
self.rules = [r for r in self.rules if r.name != rule_name]
|
|
|
|
def match_path(self, path: Path) -> Optional[str]:
|
|
path_str = str(path)
|
|
for rule in self.rules:
|
|
for pattern in rule.patterns:
|
|
if fnmatch.fnmatch(path_str, pattern):
|
|
return rule.category
|
|
return None
|
|
|
|
def classify(self, path: Path, file_type: Optional[str]=None) -> Optional[str]:
|
|
return self.match_path(path)
|
|
|
|
def get_category_rules(self, category: str) -> list[ClassificationRule]:
|
|
return [r for r in self.rules if r.category == category]
|
|
|
|
def get_all_categories(self) -> set[str]:
|
|
return {r.category for r in self.rules}
|
|
|
|
def get_rules_by_priority(self, min_priority: int=0) -> list[ClassificationRule]:
|
|
return [r for r in self.rules if r.priority >= min_priority]
|