fly wa
This commit is contained in:
@@ -1,282 +1,60 @@
|
||||
"""Rule-based classification engine"""
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
import fnmatch
|
||||
|
||||
from ._protocols import ClassificationRule
|
||||
|
||||
|
||||
class RuleBasedClassifier:
    """Rule-based file classifier using glob-style pattern matching.

    Rules live in ``self.rules``, kept sorted by descending priority, so the
    first rule with a matching pattern decides the category of a path.
    """

    # Built-in rules, based on ARCHITECTURE.md, as
    # (name, category, patterns, priority, description) tuples.
    # Order matters only between rules of equal priority: the sort in
    # add_rule() is stable, so earlier entries win ties.
    _DEFAULT_RULES = (
        # Build artifacts and caches
        ("maven_cache", "artifacts/java/maven",
         ("**/.m2/**", "**/.maven/**", "**/maven-central-cache/**"),
         10, "Maven repository and cache"),
        ("gradle_cache", "artifacts/java/gradle",
         ("**/.gradle/**", "**/gradle-cache/**", "**/gradle-build-cache/**"),
         10, "Gradle cache and artifacts"),
        ("python_cache", "cache/pycache",
         ("**/__pycache__/**", "**/*.pyc", "**/*.pyo"),
         10, "Python cache files"),
        ("python_artifacts", "artifacts/python",
         ("**/pip-cache/**", "**/pypi-cache/**", "**/wheelhouse/**"),
         10, "Python package artifacts"),
        ("node_modules", "cache/node_modules-archive",
         ("**/node_modules/**",),
         10, "Node.js modules"),
        ("node_cache", "artifacts/node",
         ("**/.npm/**", "**/npm-registry/**", "**/yarn-cache/**", "**/pnpm-store/**"),
         10, "Node.js package managers cache"),
        ("go_cache", "artifacts/go",
         ("**/goproxy-cache/**", "**/go/pkg/mod/**", "**/go-module-cache/**"),
         10, "Go module cache"),
        # Version control
        ("git_repos", "development/git-infrastructure",
         ("**/.git/**", "**/gitea/repositories/**"),
         15, "Git repositories and infrastructure"),
        ("gitea", "development/gitea",
         ("**/gitea/**",),
         12, "Gitea server data"),
        # Databases
        ("postgresql", "databases/postgresql",
         ("**/postgresql/**", "**/postgres/**", "**/*.sql"),
         10, "PostgreSQL databases"),
        ("mysql", "databases/mysql",
         ("**/mysql/**", "**/mariadb/**"),
         10, "MySQL/MariaDB databases"),
        ("mongodb", "databases/mongodb",
         ("**/mongodb/**", "**/mongo/**"),
         10, "MongoDB databases"),
        ("redis", "databases/redis",
         ("**/redis/**", "**/*.rdb"),
         10, "Redis databases"),
        ("sqlite", "databases/sqlite",
         ("**/*.db", "**/*.sqlite", "**/*.sqlite3"),
         8, "SQLite databases"),
        # LLM and AI models
        ("llm_models", "cache/llm-models",
         (
             "**/hugging-face/**",
             "**/huggingface/**",
             "**/.cache/huggingface/**",
             "**/models/**/*.bin",
             "**/models/**/*.onnx",
             "**/models/**/*.safetensors",
             "**/llm*/**",
             "**/openai-cache/**",
         ),
         12, "LLM and AI model files"),
        # Docker and containers
        ("docker_volumes", "apps/volumes/docker-volumes",
         ("**/docker/volumes/**", "**/var/lib/docker/volumes/**"),
         10, "Docker volumes"),
        ("app_data", "apps/volumes/app-data",
         ("**/app-data/**", "**/application-data/**"),
         8, "Application data"),
        # Build outputs
        ("build_output", "development/build-tools",
         ("**/target/**", "**/build/**", "**/dist/**", "**/out/**"),
         5, "Build output directories"),
        # Backups
        ("system_backups", "backups/system",
         ("**/backup/**", "**/backups/**", "**/*.bak", "**/*.backup"),
         10, "System backups"),
        ("database_backups", "backups/database",
         ("**/*.sql.gz", "**/*.dump", "**/db-backup/**"),
         11, "Database backups"),
        # Archives
        ("archives", "backups/archive",
         ("**/*.tar", "**/*.tar.gz", "**/*.tgz", "**/*.zip", "**/*.7z"),
         5, "Archive files"),
    )

    def __init__(self):
        """Initialize the classifier and register the default rules."""
        self.rules: list[ClassificationRule] = []
        self._load_default_rules()

    def _load_default_rules(self):
        """Load default classification rules based on ARCHITECTURE.md.

        Each entry of ``_DEFAULT_RULES`` is registered exactly once, in table
        order, through :meth:`add_rule`.
        """
        for name, category, patterns, priority, description in self._DEFAULT_RULES:
            self.add_rule(ClassificationRule(
                name=name,
                category=category,
                # Fresh list per rule so mutating a rule never alters the table.
                patterns=list(patterns),
                priority=priority,
                description=description,
            ))

    def add_rule(self, rule: ClassificationRule) -> None:
        """Add a classification rule.

        Args:
            rule: Rule to add
        """
        self.rules.append(rule)
        # Keep rules ordered by priority (higher first). list.sort is stable,
        # so rules of equal priority stay in insertion order.
        self.rules.sort(key=lambda r: r.priority, reverse=True)

    def remove_rule(self, rule_name: str) -> None:
        """Remove every rule with the given name.

        Unknown names are ignored.

        Args:
            rule_name: Name of rule to remove
        """
        self.rules = [r for r in self.rules if r.name != rule_name]

    def match_path(self, path: Path) -> Optional[str]:
        """Match a path against the rules in priority order.

        Patterns are evaluated with :func:`fnmatch.fnmatch`, where ``*``
        (and therefore ``**``) also matches path separators. A leading
        ``**/`` is additionally allowed to match zero directories, so
        ``**/*.db`` matches the bare name ``x.db`` and ``**/gitea/**``
        matches the relative path ``gitea/data``.

        Args:
            path: Path to match

        Returns:
            Category of the first matching rule, or None if no rule matches
        """
        path_str = str(path)

        for rule in self.rules:
            for pattern in rule.patterns:
                if fnmatch.fnmatch(path_str, pattern):
                    return rule.category
                # fnmatch gives "**" no special meaning, so "**/x" demands a
                # separator before "x". Retry without the prefix so the
                # pattern also matches at the very start of a relative path.
                if pattern.startswith("**/") and fnmatch.fnmatch(path_str, pattern[3:]):
                    return rule.category

        return None

    def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]:
        """Classify a file path.

        Args:
            path: Path to classify
            file_type: Optional file type hint; accepted for interface
                compatibility but not used by this classifier

        Returns:
            Category name or None if no match
        """
        return self.match_path(path)

    def get_category_rules(self, category: str) -> list[ClassificationRule]:
        """Get all rules for a category.

        Args:
            category: Category name

        Returns:
            List of rules for the category, in priority order
        """
        return [r for r in self.rules if r.category == category]

    def get_all_categories(self) -> set[str]:
        """Get all defined categories.

        Returns:
            Set of category names
        """
        return {r.category for r in self.rules}

    def get_rules_by_priority(self, min_priority: int = 0) -> list[ClassificationRule]:
        """Get rules at or above a minimum priority.

        Args:
            min_priority: Minimum priority threshold

        Returns:
            List of rules with priority >= min_priority
        """
        return [r for r in self.rules if r.priority >= min_priority]
|
||||
|
||||
Reference in New Issue
Block a user