app/classification/__init__.py — new file, 17 lines
@@ -0,0 +1,17 @@
"""Classification package exports"""
from .rules import RuleBasedClassifier
from .ml import create_ml_classifier, train_from_database, MLClassifier, DummyMLClassifier
from .engine import ClassificationEngine
from ._protocols import ClassificationRule, IClassifier, IRuleEngine

__all__ = [
    'RuleBasedClassifier',
    'MLClassifier',
    'DummyMLClassifier',
    'create_ml_classifier',
    'train_from_database',
    'ClassificationEngine',
    'ClassificationRule',
    'IClassifier',
    'IRuleEngine',
]
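For orientation, a minimal sketch of how downstream code would consume these exports; the `app.classification` import path follows the repository layout in this commit, but the sample path is invented:

from pathlib import Path
from app.classification import RuleBasedClassifier

# Rule-only classification needs no database connection
rules = RuleBasedClassifier()
print(rules.classify(Path("/data/backups/2024/site.tar.gz")))
# -> "backups/system" (the priority-10 system_backups rule wins over the priority-5 archives rule)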
app/classification/_protocols.py — new file, 72 lines
@@ -0,0 +1,72 @@
"""Protocol definitions for the classification package"""
from typing import Protocol, Optional
from pathlib import Path
from dataclasses import dataclass


@dataclass
class ClassificationRule:
    """Rule for classifying files"""
    name: str
    category: str
    patterns: list[str]
    priority: int = 0
    description: str = ""


class IClassifier(Protocol):
    """Protocol for classification operations"""

    def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]:
        """Classify a file path

        Args:
            path: Path to classify
            file_type: Optional file type hint

        Returns:
            Category name or None if no match
        """
        ...

    def get_category_rules(self, category: str) -> list[ClassificationRule]:
        """Get all rules for a category

        Args:
            category: Category name

        Returns:
            List of rules for the category
        """
        ...


class IRuleEngine(Protocol):
    """Protocol for rule-based classification"""

    def add_rule(self, rule: ClassificationRule) -> None:
        """Add a classification rule

        Args:
            rule: Rule to add
        """
        ...

    def remove_rule(self, rule_name: str) -> None:
        """Remove a rule by name

        Args:
            rule_name: Name of rule to remove
        """
        ...

    def match_path(self, path: Path) -> Optional[str]:
        """Match path against rules

        Args:
            path: Path to match

        Returns:
            Category name or None if no match
        """
        ...
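Because IClassifier and IRuleEngine are typing.Protocols, any class with matching method signatures satisfies them structurally; no subclassing is needed. A hypothetical sketch (ExtensionClassifier and its mapping are illustrative only); note that isinstance() checks against these protocols would additionally require @runtime_checkable, which this commit does not add:

from pathlib import Path
from typing import Optional

from app.classification import ClassificationRule

class ExtensionClassifier:
    """Hypothetical classifier that satisfies IClassifier purely by shape."""

    def __init__(self, mapping: dict[str, str]):
        self._mapping = mapping  # e.g. {".log": "logs/misc"}

    def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]:
        return self._mapping.get(path.suffix)

    def get_category_rules(self, category: str) -> list[ClassificationRule]:
        return []  # this classifier carries no ClassificationRule objects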
app/classification/engine.py — new file, 350 lines
@@ -0,0 +1,350 @@
"""Main classification engine"""
from pathlib import Path
from typing import Optional, Callable
import psycopg2

from .rules import RuleBasedClassifier
from .ml import create_ml_classifier, DummyMLClassifier
from ..shared.models import ProcessingStats
from ..shared.config import DatabaseConfig
from ..shared.logger import ProgressLogger


class ClassificationEngine:
    """Engine for classifying files"""

    def __init__(
        self,
        db_config: DatabaseConfig,
        logger: ProgressLogger,
        use_ml: bool = False
    ):
        """Initialize classification engine

        Args:
            db_config: Database configuration
            logger: Progress logger
            use_ml: Whether to use ML classification in addition to rules
        """
        self.db_config = db_config
        self.logger = logger
        self.rule_classifier = RuleBasedClassifier()
        self.ml_classifier = create_ml_classifier() if use_ml else None
        self.use_ml = use_ml and not isinstance(self.ml_classifier, DummyMLClassifier)
        self._connection = None

    def _get_connection(self):
        """Get or create database connection"""
        if self._connection is None or self._connection.closed:
            self._connection = psycopg2.connect(
                host=self.db_config.host,
                port=self.db_config.port,
                database=self.db_config.database,
                user=self.db_config.user,
                password=self.db_config.password
            )
        return self._connection

    def classify_all(
        self,
        disk: Optional[str] = None,
        batch_size: int = 1000,
        progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
    ) -> ProcessingStats:
        """Classify all files in the database

        Args:
            disk: Optional disk filter
            batch_size: Number of files to process per batch
            progress_callback: Optional callback for progress updates

        Returns:
            ProcessingStats with classification statistics
        """
        self.logger.section("Starting Classification")

        conn = self._get_connection()
        cursor = conn.cursor()

        # Get files without categories
        if disk:
            cursor.execute("""
                SELECT path, checksum
                FROM files_bak
                WHERE disk = %s AND category IS NULL
            """, (disk,))
        else:
            cursor.execute("""
                SELECT path, checksum
                FROM files_bak
                WHERE category IS NULL
            """)

        files_to_classify = cursor.fetchall()
        total_files = len(files_to_classify)

        self.logger.info(f"Found {total_files} files to classify")

        stats = ProcessingStats()
        batch = []

        for path_str, checksum in files_to_classify:
            path = Path(path_str)

            # Classify using rules first
            category = self.rule_classifier.classify(path)

            # If no rule match and ML is available, try ML
            if category is None and self.use_ml and self.ml_classifier:
                category = self.ml_classifier.classify(path)

            # If still no category, assign default
            if category is None:
                category = "temp/processing"

            batch.append((category, str(path)))
            stats.files_processed += 1

            # Batch update
            if len(batch) >= batch_size:
                self._update_categories(cursor, batch)
                conn.commit()
                batch.clear()

            # Progress callback
            if progress_callback:
                progress_callback(stats.files_processed, total_files, stats)

            # Log progress
            if stats.files_processed % (batch_size * 10) == 0:
                self.logger.progress(
                    stats.files_processed,
                    total_files,
                    prefix="Files classified",
                    elapsed_seconds=stats.elapsed_seconds
                )

        # Update remaining batch
        if batch:
            self._update_categories(cursor, batch)
            conn.commit()

        stats.files_succeeded = stats.files_processed

        cursor.close()

        self.logger.info(
            f"Classification complete: {stats.files_processed} files in {stats.elapsed_seconds:.1f}s"
        )

        return stats

    def _update_categories(self, cursor, batch: list[tuple[str, str]]):
        """Update categories in batch

        Args:
            cursor: Database cursor
            batch: List of (category, path) tuples
        """
        from psycopg2.extras import execute_batch

        query = """
            UPDATE files_bak
            SET category = %s
            WHERE path = %s
        """

        execute_batch(cursor, query, batch)

    def classify_path(self, path: Path) -> Optional[str]:
        """Classify a single path

        Args:
            path: Path to classify

        Returns:
            Category name or None
        """
        # Try rules first
        category = self.rule_classifier.classify(path)

        # Try ML if available
        if category is None and self.use_ml and self.ml_classifier:
            category = self.ml_classifier.classify(path)

        return category

    def get_category_stats(self) -> dict[str, dict]:
        """Get statistics by category

        Returns:
            Dictionary mapping category to statistics
        """
        conn = self._get_connection()
        cursor = conn.cursor()

        cursor.execute("""
            SELECT
                category,
                COUNT(*) as file_count,
                SUM(size) as total_size
            FROM files_bak
            WHERE category IS NOT NULL
            GROUP BY category
            ORDER BY total_size DESC
        """)

        stats = {}
        for category, file_count, total_size in cursor.fetchall():
            stats[category] = {
                'file_count': file_count,
                'total_size': total_size
            }

        cursor.close()

        return stats

    def get_uncategorized_count(self) -> int:
        """Get count of uncategorized files

        Returns:
            Number of files without category
        """
        conn = self._get_connection()
        cursor = conn.cursor()

        cursor.execute("SELECT COUNT(*) FROM files_bak WHERE category IS NULL")
        count = cursor.fetchone()[0]

        cursor.close()

        return count

    def reclassify_category(
        self,
        old_category: str,
        new_category: str
    ) -> int:
        """Reclassify all files in a category

        Args:
            old_category: Current category
            new_category: New category

        Returns:
            Number of files reclassified
        """
        self.logger.info(f"Reclassifying {old_category} -> {new_category}")

        conn = self._get_connection()
        cursor = conn.cursor()

        cursor.execute("""
            UPDATE files_bak
            SET category = %s
            WHERE category = %s
        """, (new_category, old_category))

        count = cursor.rowcount
        conn.commit()
        cursor.close()

        self.logger.info(f"Reclassified {count} files")

        return count

    def train_ml_classifier(
        self,
        min_samples: int = 10
    ) -> bool:
        """Train ML classifier from existing categorized data

        Args:
            min_samples: Minimum samples per category

        Returns:
            True if training successful
        """
        if not self.use_ml or self.ml_classifier is None:
            self.logger.warning("ML classifier not available")
            return False

        self.logger.subsection("Training ML Classifier")

        conn = self._get_connection()
        cursor = conn.cursor()

        # Get categorized files
        cursor.execute("""
            SELECT path, category
            FROM files_bak
            WHERE category IS NOT NULL
        """)

        training_data = [(Path(path), category) for path, category in cursor.fetchall()]
        cursor.close()

        if not training_data:
            self.logger.warning("No training data available")
            return False

        # Count samples per category
        category_counts = {}
        for _, category in training_data:
            category_counts[category] = category_counts.get(category, 0) + 1

        # Filter categories with enough samples
        filtered_data = [
            (path, category)
            for path, category in training_data
            if category_counts[category] >= min_samples
        ]

        if not filtered_data:
            self.logger.warning(f"No categories with >= {min_samples} samples")
            return False

        self.logger.info(f"Training with {len(filtered_data)} samples")

        try:
            self.ml_classifier.train(filtered_data)
            self.logger.info("ML classifier trained successfully")
            return True
        except Exception as e:
            self.logger.error(f"Failed to train ML classifier: {e}")
            return False

    def get_all_categories(self) -> list[str]:
        """Get all categories from database

        Returns:
            List of category names
        """
        conn = self._get_connection()
        cursor = conn.cursor()

        cursor.execute("""
            SELECT DISTINCT category
            FROM files_bak
            WHERE category IS NOT NULL
            ORDER BY category
        """)

        categories = [row[0] for row in cursor.fetchall()]
        cursor.close()

        return categories

    def close(self):
        """Close database connection"""
        if self._connection and not self._connection.closed:
            self._connection.close()

    def __enter__(self):
        """Context manager entry"""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit"""
        self.close()
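A usage sketch of the engine as a context manager. The attribute names on DatabaseConfig match what _get_connection reads, but its constructor signature and a zero-argument ProgressLogger() are assumptions about the shared package, which is not part of this diff:

from app.classification import ClassificationEngine
from app.shared.config import DatabaseConfig
from app.shared.logger import ProgressLogger

db = DatabaseConfig(host="localhost", port=5432, database="files",
                    user="app", password="secret")  # assumed constructor

with ClassificationEngine(db, ProgressLogger(), use_ml=True) as engine:
    stats = engine.classify_all(
        disk="disk1",                                   # classify only one disk
        batch_size=1000,
        progress_callback=lambda done, total, s: None,  # e.g. drive a progress bar
    )
    print(engine.get_category_stats())
# __exit__ closes the psycopg2 connection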
app/classification/ml.py — new file, 269 lines
@@ -0,0 +1,269 @@
"""ML-based classification (optional, using sklearn if available)"""
from pathlib import Path
from typing import Optional, List, Tuple
import pickle

try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import Pipeline
    SKLEARN_AVAILABLE = True
except ImportError:
    SKLEARN_AVAILABLE = False


class MLClassifier:
    """Machine learning-based file classifier

    Uses path-based features and optional metadata to classify files.
    Requires scikit-learn to be installed.
    """

    def __init__(self):
        """Initialize ML classifier"""
        if not SKLEARN_AVAILABLE:
            raise ImportError(
                "scikit-learn is required for ML classification. "
                "Install with: pip install scikit-learn"
            )

        self.model: Optional[Pipeline] = None
        self.categories: List[str] = []
        self._is_trained = False

    def _extract_features(self, path: Path) -> str:
        """Extract features from path

        Args:
            path: Path to extract features from

        Returns:
            Feature string
        """
        # Convert path to feature string
        # Include: path parts, extension, filename
        parts = path.parts
        extension = path.suffix
        filename = path.name

        features = []

        # Add path components
        features.extend(parts)

        # Add extension
        if extension:
            features.append(f"ext:{extension}")

        # Add filename components (split on common separators)
        name_parts = filename.replace('-', ' ').replace('_', ' ').replace('.', ' ').split()
        features.extend([f"name:{part}" for part in name_parts])

        return ' '.join(features)

    def train(self, training_data: List[Tuple[Path, str]]) -> None:
        """Train the classifier

        Args:
            training_data: List of (path, category) tuples
        """
        if not training_data:
            raise ValueError("Training data cannot be empty")

        # Extract features and labels
        X = [self._extract_features(path) for path, _ in training_data]
        y = [category for _, category in training_data]

        # Store unique categories
        self.categories = sorted(set(y))

        # Create and train pipeline
        self.model = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=1000,
                ngram_range=(1, 2),
                min_df=1
            )),
            ('classifier', MultinomialNB())
        ])

        self.model.fit(X, y)
        self._is_trained = True

    def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]:
        """Classify a file path

        Args:
            path: Path to classify
            file_type: Optional file type hint (not used in ML classifier)

        Returns:
            Category name or None if not trained
        """
        if not self._is_trained or self.model is None:
            return None

        features = self._extract_features(path)

        try:
            prediction = self.model.predict([features])[0]
            return prediction
        except Exception:
            return None

    def predict_proba(self, path: Path) -> dict[str, float]:
        """Get prediction probabilities for all categories

        Args:
            path: Path to classify

        Returns:
            Dictionary mapping category to probability
        """
        if not self._is_trained or self.model is None:
            return {}

        features = self._extract_features(path)

        try:
            probabilities = self.model.predict_proba([features])[0]
            return {
                category: float(prob)
                for category, prob in zip(self.categories, probabilities)
            }
        except Exception:
            return {}

    def save_model(self, model_path: Path) -> None:
        """Save trained model to disk

        Args:
            model_path: Path to save model
        """
        if not self._is_trained:
            raise ValueError("Cannot save untrained model")

        model_data = {
            'model': self.model,
            'categories': self.categories,
            'is_trained': self._is_trained
        }

        with open(model_path, 'wb') as f:
            pickle.dump(model_data, f)

    def load_model(self, model_path: Path) -> None:
        """Load trained model from disk

        Args:
            model_path: Path to model file
        """
        with open(model_path, 'rb') as f:
            model_data = pickle.load(f)

        self.model = model_data['model']
        self.categories = model_data['categories']
        self._is_trained = model_data['is_trained']

    @property
    def is_trained(self) -> bool:
        """Check if model is trained"""
        return self._is_trained


class DummyMLClassifier:
    """Dummy ML classifier for when sklearn is not available"""

    def __init__(self):
        """Initialize dummy classifier"""
        pass

    def train(self, training_data: List[Tuple[Path, str]]) -> None:
        """Dummy train method"""
        raise NotImplementedError(
            "ML classification requires scikit-learn. "
            "Install with: pip install scikit-learn"
        )

    def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]:
        """Dummy classify method"""
        return None

    def predict_proba(self, path: Path) -> dict[str, float]:
        """Dummy predict_proba method"""
        return {}

    def save_model(self, model_path: Path) -> None:
        """Dummy save_model method"""
        raise NotImplementedError("ML classification not available")

    def load_model(self, model_path: Path) -> None:
        """Dummy load_model method"""
        raise NotImplementedError("ML classification not available")

    @property
    def is_trained(self) -> bool:
        """Check if model is trained"""
        return False


def create_ml_classifier() -> MLClassifier | DummyMLClassifier:
    """Create ML classifier if sklearn is available, otherwise return dummy

    Returns:
        MLClassifier or DummyMLClassifier
    """
    if SKLEARN_AVAILABLE:
        return MLClassifier()
    else:
        return DummyMLClassifier()


def train_from_database(
    db_connection,
    min_samples_per_category: int = 10
) -> MLClassifier | DummyMLClassifier:
    """Train ML classifier from database

    Args:
        db_connection: Database connection
        min_samples_per_category: Minimum samples required per category

    Returns:
        Trained classifier
    """
    classifier = create_ml_classifier()

    if isinstance(classifier, DummyMLClassifier):
        return classifier

    # Query classified files from database
    cursor = db_connection.cursor()
    cursor.execute("""
        SELECT path, category
        FROM files_bak
        WHERE category IS NOT NULL
    """)

    training_data = [(Path(path), category) for path, category in cursor.fetchall()]
    cursor.close()

    if not training_data:
        return classifier

    # Count samples per category
    category_counts = {}
    for _, category in training_data:
        category_counts[category] = category_counts.get(category, 0) + 1

    # Filter to categories with enough samples
    filtered_data = [
        (path, category)
        for path, category in training_data
        if category_counts[category] >= min_samples_per_category
    ]

    if filtered_data:
        classifier.train(filtered_data)

    return classifier
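A toy training sketch; the paths and category labels are invented. If scikit-learn is missing, create_ml_classifier() returns the dummy and train() would raise NotImplementedError. Also worth flagging for review: load_model unpickles arbitrary files, so model files must come from a trusted source.

from pathlib import Path
from app.classification.ml import create_ml_classifier, DummyMLClassifier

clf = create_ml_classifier()
if not isinstance(clf, DummyMLClassifier):  # scikit-learn is installed
    clf.train([
        (Path("/srv/media/photos/2021/beach.jpg"), "media/photos"),
        (Path("/srv/media/photos/2022/hike.png"), "media/photos"),
        (Path("/srv/docs/reports/q1-summary.pdf"), "documents/reports"),
        (Path("/srv/docs/reports/q2-summary.pdf"), "documents/reports"),
    ])
    print(clf.classify(Path("/srv/media/photos/2023/sunset.jpg")))    # likely "media/photos"
    print(clf.predict_proba(Path("/srv/docs/reports/q3-summary.pdf")))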
app/classification/rules.py — new file, 282 lines
@@ -0,0 +1,282 @@
"""Rule-based classification engine"""
from pathlib import Path
from typing import Optional
import fnmatch

from ._protocols import ClassificationRule


class RuleBasedClassifier:
    """Rule-based file classifier using pattern matching"""

    def __init__(self):
        """Initialize rule-based classifier"""
        self.rules: list[ClassificationRule] = []
        self._load_default_rules()

    def _load_default_rules(self):
        """Load default classification rules based on ARCHITECTURE.md"""

        # Build artifacts and caches
        self.add_rule(ClassificationRule(
            name="maven_cache",
            category="artifacts/java/maven",
            patterns=["**/.m2/**", "**/.maven/**", "**/maven-central-cache/**"],
            priority=10,
            description="Maven repository and cache"
        ))

        self.add_rule(ClassificationRule(
            name="gradle_cache",
            category="artifacts/java/gradle",
            patterns=["**/.gradle/**", "**/gradle-cache/**", "**/gradle-build-cache/**"],
            priority=10,
            description="Gradle cache and artifacts"
        ))

        self.add_rule(ClassificationRule(
            name="python_cache",
            category="cache/pycache",
            patterns=["**/__pycache__/**", "**/*.pyc", "**/*.pyo"],
            priority=10,
            description="Python cache files"
        ))

        self.add_rule(ClassificationRule(
            name="python_artifacts",
            category="artifacts/python",
            patterns=["**/pip-cache/**", "**/pypi-cache/**", "**/wheelhouse/**"],
            priority=10,
            description="Python package artifacts"
        ))

        self.add_rule(ClassificationRule(
            name="node_modules",
            category="cache/node_modules-archive",
            patterns=["**/node_modules/**"],
            priority=10,
            description="Node.js modules"
        ))

        self.add_rule(ClassificationRule(
            name="node_cache",
            category="artifacts/node",
            patterns=["**/.npm/**", "**/npm-registry/**", "**/yarn-cache/**", "**/pnpm-store/**"],
            priority=10,
            description="Node.js package manager caches"
        ))

        self.add_rule(ClassificationRule(
            name="go_cache",
            category="artifacts/go",
            patterns=["**/goproxy-cache/**", "**/go/pkg/mod/**", "**/go-module-cache/**"],
            priority=10,
            description="Go module cache"
        ))

        # Version control
        self.add_rule(ClassificationRule(
            name="git_repos",
            category="development/git-infrastructure",
            patterns=["**/.git/**", "**/gitea/repositories/**"],
            priority=15,
            description="Git repositories and infrastructure"
        ))

        self.add_rule(ClassificationRule(
            name="gitea",
            category="development/gitea",
            patterns=["**/gitea/**"],
            priority=12,
            description="Gitea server data"
        ))

        # Databases
        self.add_rule(ClassificationRule(
            name="postgresql",
            category="databases/postgresql",
            patterns=["**/postgresql/**", "**/postgres/**", "**/*.sql"],
            priority=10,
            description="PostgreSQL databases"
        ))

        self.add_rule(ClassificationRule(
            name="mysql",
            category="databases/mysql",
            patterns=["**/mysql/**", "**/mariadb/**"],
            priority=10,
            description="MySQL/MariaDB databases"
        ))

        self.add_rule(ClassificationRule(
            name="mongodb",
            category="databases/mongodb",
            patterns=["**/mongodb/**", "**/mongo/**"],
            priority=10,
            description="MongoDB databases"
        ))

        self.add_rule(ClassificationRule(
            name="redis",
            category="databases/redis",
            patterns=["**/redis/**", "**/*.rdb"],
            priority=10,
            description="Redis databases"
        ))

        self.add_rule(ClassificationRule(
            name="sqlite",
            category="databases/sqlite",
            patterns=["**/*.db", "**/*.sqlite", "**/*.sqlite3"],
            priority=8,
            description="SQLite databases"
        ))

        # LLM and AI models
        self.add_rule(ClassificationRule(
            name="llm_models",
            category="cache/llm-models",
            patterns=[
                "**/hugging-face/**",
                "**/huggingface/**",
                "**/.cache/huggingface/**",
                "**/models/**/*.bin",
                "**/models/**/*.onnx",
                "**/models/**/*.safetensors",
                "**/llm*/**",
                "**/openai-cache/**"
            ],
            priority=12,
            description="LLM and AI model files"
        ))

        # Docker and containers
        self.add_rule(ClassificationRule(
            name="docker_volumes",
            category="apps/volumes/docker-volumes",
            patterns=["**/docker/volumes/**", "**/var/lib/docker/volumes/**"],
            priority=10,
            description="Docker volumes"
        ))

        self.add_rule(ClassificationRule(
            name="app_data",
            category="apps/volumes/app-data",
            patterns=["**/app-data/**", "**/application-data/**"],
            priority=8,
            description="Application data"
        ))

        # Build outputs
        self.add_rule(ClassificationRule(
            name="build_output",
            category="development/build-tools",
            patterns=["**/target/**", "**/build/**", "**/dist/**", "**/out/**"],
            priority=5,
            description="Build output directories"
        ))

        # Backups
        self.add_rule(ClassificationRule(
            name="system_backups",
            category="backups/system",
            patterns=["**/backup/**", "**/backups/**", "**/*.bak", "**/*.backup"],
            priority=10,
            description="System backups"
        ))

        self.add_rule(ClassificationRule(
            name="database_backups",
            category="backups/database",
            patterns=["**/*.sql.gz", "**/*.dump", "**/db-backup/**"],
            priority=11,
            description="Database backups"
        ))

        # Archives
        self.add_rule(ClassificationRule(
            name="archives",
            category="backups/archive",
            patterns=["**/*.tar", "**/*.tar.gz", "**/*.tgz", "**/*.zip", "**/*.7z"],
            priority=5,
            description="Archive files"
        ))

    def add_rule(self, rule: ClassificationRule) -> None:
        """Add a classification rule

        Args:
            rule: Rule to add
        """
        self.rules.append(rule)
        # Sort rules by priority (higher priority first)
        self.rules.sort(key=lambda r: r.priority, reverse=True)

    def remove_rule(self, rule_name: str) -> None:
        """Remove a rule by name

        Args:
            rule_name: Name of rule to remove
        """
        self.rules = [r for r in self.rules if r.name != rule_name]

    def match_path(self, path: Path) -> Optional[str]:
        """Match path against rules

        Args:
            path: Path to match

        Returns:
            Category name or None if no match
        """
        path_str = str(path)

        # Try to match each rule in priority order
        for rule in self.rules:
            for pattern in rule.patterns:
                if fnmatch.fnmatch(path_str, pattern):
                    return rule.category

        return None

    def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]:
        """Classify a file path

        Args:
            path: Path to classify
            file_type: Optional file type hint

        Returns:
            Category name or None if no match
        """
        return self.match_path(path)

    def get_category_rules(self, category: str) -> list[ClassificationRule]:
        """Get all rules for a category

        Args:
            category: Category name

        Returns:
            List of rules for the category
        """
        return [r for r in self.rules if r.category == category]

    def get_all_categories(self) -> set[str]:
        """Get all defined categories

        Returns:
            Set of category names
        """
        return {r.category for r in self.rules}

    def get_rules_by_priority(self, min_priority: int = 0) -> list[ClassificationRule]:
        """Get rules above a minimum priority

        Args:
            min_priority: Minimum priority threshold

        Returns:
            List of rules with priority >= min_priority
        """
        return [r for r in self.rules if r.priority >= min_priority]
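A sketch of extending the default rule set; the Terraform rule is a made-up example. One behavior worth knowing when writing patterns: fnmatch treats `*` as matching any characters including `/`, so the `**` in these patterns behaves identically to `*` — they are substring/suffix globs over the full path string, not true recursive globs.

from pathlib import Path
from app.classification import ClassificationRule
from app.classification.rules import RuleBasedClassifier

clf = RuleBasedClassifier()
clf.add_rule(ClassificationRule(
    name="terraform_state",          # hypothetical custom rule
    category="infrastructure/terraform",
    patterns=["**/*.tfstate", "**/.terraform/**"],
    priority=20,                     # checked before every default rule (highest default is 15)
    description="Terraform state and provider caches",
))
print(clf.classify(Path("/srv/iac/prod/.terraform/providers/registry.zip")))
# -> "infrastructure/terraform"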