From 5098f5b29151dd3568cc2957fd2143153e49649b Mon Sep 17 00:00:00 2001 From: mike Date: Sat, 13 Dec 2025 11:53:29 +0100 Subject: [PATCH] fly wa --- app/analysis/folder_analyzer.py | 73 +---- app/classification/__init__.py | 1 - app/classification/_protocols.py | 46 +--- app/classification/classifier.py | 80 ++---- app/classification/engine.py | 264 +++--------------- app/classification/ml.py | 174 ++---------- app/classification/rules.py | 268 ++----------------- sql/{setup_database.sql => legacy_setup.sql} | 0 8 files changed, 100 insertions(+), 806 deletions(-) rename sql/{setup_database.sql => legacy_setup.sql} (100%) diff --git a/app/analysis/folder_analyzer.py b/app/analysis/folder_analyzer.py index 74d4e1c..6a3d48c 100644 --- a/app/analysis/folder_analyzer.py +++ b/app/analysis/folder_analyzer.py @@ -3,83 +3,42 @@ from typing import Dict, Set, List from collections import Counter class FolderAnalyzer: - def __init__(self): - self.manifest_files = { - 'java': ['pom.xml', 'build.gradle', 'build.gradle.kts'], - 'javascript': ['package.json', 'yarn.lock', 'package-lock.json'], - 'python': ['pyproject.toml', 'setup.py', 'requirements.txt', 'Pipfile'], - 'go': ['go.mod', 'go.sum'], - 'rust': ['Cargo.toml', 'Cargo.lock'], - 'docker': ['Dockerfile', 'docker-compose.yml', 'docker-compose.yaml'], - 'k8s': ['helm', 'kustomization.yaml', 'deployment.yaml'] - } - self.intent_keywords = { - 'infrastructure': ['infra', 'deploy', 'k8s', 'docker', 'terraform', 'ansible'], - 'application': ['app', 'service', 'api', 'server', 'client'], - 'data': ['data', 'dataset', 'models', 'training', 'ml'], - 'documentation': ['docs', 'documentation', 'wiki', 'readme'], - 'testing': ['test', 'tests', 'spec', 'e2e', 'integration'], - 'build': ['build', 'dist', 'target', 'out', 'bin'], - 'config': ['config', 'conf', 'settings', 'env'] - } + def __init__(self): + self.manifest_files = {'java': ['pom.xml', 'build.gradle', 'build.gradle.kts'], 'javascript': ['package.json', 'yarn.lock', 'package-lock.json'], 'python': ['pyproject.toml', 'setup.py', 'requirements.txt', 'Pipfile'], 'go': ['go.mod', 'go.sum'], 'rust': ['Cargo.toml', 'Cargo.lock'], 'docker': ['Dockerfile', 'docker-compose.yml', 'docker-compose.yaml'], 'k8s': ['helm', 'kustomization.yaml', 'deployment.yaml']} + self.intent_keywords = {'infrastructure': ['infra', 'deploy', 'k8s', 'docker', 'terraform', 'ansible'], 'application': ['app', 'service', 'api', 'server', 'client'], 'data': ['data', 'dataset', 'models', 'training', 'ml'], 'documentation': ['docs', 'documentation', 'wiki', 'readme'], 'testing': ['test', 'tests', 'spec', 'e2e', 'integration'], 'build': ['build', 'dist', 'target', 'out', 'bin'], 'config': ['config', 'conf', 'settings', 'env']} def analyze_folder(self, folder_path: Path, files: List[Dict]) -> Dict: files_list = [Path(f['path']) for f in files] - - has_readme = any('readme' in f.name.lower() for f in files_list) - has_git = any('.git' in str(f) for f in files_list) - + has_readme = any(('readme' in f.name.lower() for f in files_list)) + has_git = any(('.git' in str(f) for f in files_list)) manifest_types = self._detect_manifests(files_list) has_manifest = len(manifest_types) > 0 - - file_types = Counter(f.suffix.lower() for f in files_list if f.suffix) + file_types = Counter((f.suffix.lower() for f in files_list if f.suffix)) dominant_types = dict(file_types.most_common(10)) - intent = self._infer_intent(folder_path.name.lower(), files_list) project_type = self._infer_project_type(manifest_types, dominant_types) - - structure = { - 
'depth': len(folder_path.parts), - 'has_src': any('src' in str(f) for f in files_list[:20]), - 'has_tests': any('test' in str(f) for f in files_list[:20]), - 'has_docs': any('doc' in str(f) for f in files_list[:20]) - } - - return { - 'has_readme': has_readme, - 'has_git': has_git, - 'has_manifest': has_manifest, - 'manifest_types': manifest_types, - 'dominant_file_types': dominant_types, - 'project_type': project_type, - 'intent': intent, - 'structure': structure - } + structure = {'depth': len(folder_path.parts), 'has_src': any(('src' in str(f) for f in files_list[:20])), 'has_tests': any(('test' in str(f) for f in files_list[:20])), 'has_docs': any(('doc' in str(f) for f in files_list[:20]))} + return {'has_readme': has_readme, 'has_git': has_git, 'has_manifest': has_manifest, 'manifest_types': manifest_types, 'dominant_file_types': dominant_types, 'project_type': project_type, 'intent': intent, 'structure': structure} def _detect_manifests(self, files: List[Path]) -> List[str]: detected = [] file_names = {f.name for f in files} - for tech, manifests in self.manifest_files.items(): - if any(m in file_names for m in manifests): + if any((m in file_names for m in manifests)): detected.append(tech) - return detected def _infer_intent(self, folder_name: str, files: List[Path]) -> str: - file_str = ' '.join(str(f) for f in files[:50]) - + file_str = ' '.join((str(f) for f in files[:50])) for intent, keywords in self.intent_keywords.items(): - if any(kw in folder_name or kw in file_str.lower() for kw in keywords): + if any((kw in folder_name or kw in file_str.lower() for kw in keywords)): return intent - return 'unknown' def _infer_project_type(self, manifests: List[str], file_types: Dict) -> str: if manifests: return manifests[0] - if '.py' in file_types and file_types.get('.py', 0) > 5: return 'python' if '.js' in file_types or '.ts' in file_types: @@ -88,23 +47,17 @@ class FolderAnalyzer: return 'java' if '.go' in file_types: return 'go' - return 'mixed' - def generate_summary(self, folder_analysis: Dict, readme_text: str = None) -> str: + def generate_summary(self, folder_analysis: Dict, readme_text: str=None) -> str: parts = [] - if folder_analysis.get('project_type'): parts.append(f"{folder_analysis['project_type']} project") - if folder_analysis.get('intent'): parts.append(f"for {folder_analysis['intent']}") - if folder_analysis.get('manifest_types'): parts.append(f"using {', '.join(folder_analysis['manifest_types'])}") - if readme_text: first_para = readme_text.split('\n\n')[0][:200] - parts.append(f"Description: {first_para}") - + parts.append(f'Description: {first_para}') return ' '.join(parts) if parts else 'Mixed content folder' diff --git a/app/classification/__init__.py b/app/classification/__init__.py index 4b32f89..f298997 100644 --- a/app/classification/__init__.py +++ b/app/classification/__init__.py @@ -1,3 +1,2 @@ from .classifier import FileClassifier - __all__ = ['FileClassifier'] diff --git a/app/classification/_protocols.py b/app/classification/_protocols.py index 4623b3f..d2d43b7 100644 --- a/app/classification/_protocols.py +++ b/app/classification/_protocols.py @@ -1,72 +1,30 @@ -"""Protocol definitions for the classification package""" from typing import Protocol, Optional from pathlib import Path from dataclasses import dataclass - @dataclass class ClassificationRule: - """Rule for classifying files""" name: str category: str patterns: list[str] priority: int = 0 - description: str = "" - + description: str = '' class IClassifier(Protocol): - """Protocol for 
classification operations""" - def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]: - """Classify a file path - - Args: - path: Path to classify - file_type: Optional file type hint - - Returns: - Category name or None if no match - """ + def classify(self, path: Path, file_type: Optional[str]=None) -> Optional[str]: ... def get_category_rules(self, category: str) -> list[ClassificationRule]: - """Get all rules for a category - - Args: - category: Category name - - Returns: - List of rules for the category - """ ... - class IRuleEngine(Protocol): - """Protocol for rule-based classification""" def add_rule(self, rule: ClassificationRule) -> None: - """Add a classification rule - - Args: - rule: Rule to add - """ ... def remove_rule(self, rule_name: str) -> None: - """Remove a rule by name - - Args: - rule_name: Name of rule to remove - """ ... def match_path(self, path: Path) -> Optional[str]: - """Match path against rules - - Args: - path: Path to match - - Returns: - Category name or None if no match - """ ... diff --git a/app/classification/classifier.py b/app/classification/classifier.py index f870c3f..5eed3f4 100644 --- a/app/classification/classifier.py +++ b/app/classification/classifier.py @@ -3,122 +3,72 @@ from typing import List, Set, Dict, Tuple import re class FileClassifier: + def __init__(self): - self.build_patterns = { - 'node_modules', '__pycache__', '.pytest_cache', 'target', 'build', 'dist', - '.gradle', 'bin', 'obj', '.next', '.nuxt', 'vendor', '.venv', 'venv', - 'site-packages', 'bower_components', 'jspm_packages' - } + self.build_patterns = {'node_modules', '__pycache__', '.pytest_cache', 'target', 'build', 'dist', '.gradle', 'bin', 'obj', '.next', '.nuxt', 'vendor', '.venv', 'venv', 'site-packages', 'bower_components', 'jspm_packages'} + self.artifact_patterns = {'java': {'.jar', '.war', '.ear', '.class'}, 'python': {'.pyc', '.pyo', '.whl', '.egg'}, 'node': {'node_modules'}, 'go': {'vendor', 'pkg'}, 'rust': {'target'}, 'docker': {'.dockerignore', 'Dockerfile'}} + self.category_keywords = {'apps': {'app', 'application', 'service', 'api', 'server', 'client'}, 'infra': {'infrastructure', 'devops', 'docker', 'kubernetes', 'terraform', 'ansible', 'gitea', 'jenkins'}, 'dev': {'project', 'workspace', 'repo', 'src', 'code', 'dev'}, 'cache': {'cache', 'temp', 'tmp', '.cache'}, 'databases': {'postgres', 'mysql', 'redis', 'mongo', 'db', 'database'}, 'backups': {'backup', 'bak', 'snapshot', 'archive'}, 'user': {'documents', 'pictures', 'videos', 'downloads', 'desktop', 'music'}, 'artifacts': {'build', 'dist', 'release', 'output'}, 'temp': {'tmp', 'temp', 'staging', 'processing'}} + self.media_extensions = {'video': {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv', '.webm'}, 'audio': {'.mp3', '.flac', '.wav', '.ogg', '.m4a', '.aac'}, 'image': {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'}, 'document': {'.pdf', '.doc', '.docx', '.txt', '.md', '.odt'}, 'spreadsheet': {'.xls', '.xlsx', '.csv', '.ods'}, 'presentation': {'.ppt', '.pptx', '.odp'}} + self.code_extensions = {'.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h', '.cs', '.rb', '.php', '.swift', '.kt', '.scala', '.clj', '.r'} - self.artifact_patterns = { - 'java': {'.jar', '.war', '.ear', '.class'}, - 'python': {'.pyc', '.pyo', '.whl', '.egg'}, - 'node': {'node_modules'}, - 'go': {'vendor', 'pkg'}, - 'rust': {'target'}, - 'docker': {'.dockerignore', 'Dockerfile'} - } - - self.category_keywords = { - 'apps': {'app', 'application', 'service', 'api', 'server', 'client'}, - 
'infra': {'infrastructure', 'devops', 'docker', 'kubernetes', 'terraform', 'ansible', 'gitea', 'jenkins'}, - 'dev': {'project', 'workspace', 'repo', 'src', 'code', 'dev'}, - 'cache': {'cache', 'temp', 'tmp', '.cache'}, - 'databases': {'postgres', 'mysql', 'redis', 'mongo', 'db', 'database'}, - 'backups': {'backup', 'bak', 'snapshot', 'archive'}, - 'user': {'documents', 'pictures', 'videos', 'downloads', 'desktop', 'music'}, - 'artifacts': {'build', 'dist', 'release', 'output'}, - 'temp': {'tmp', 'temp', 'staging', 'processing'} - } - - self.media_extensions = { - 'video': {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv', '.webm'}, - 'audio': {'.mp3', '.flac', '.wav', '.ogg', '.m4a', '.aac'}, - 'image': {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'}, - 'document': {'.pdf', '.doc', '.docx', '.txt', '.md', '.odt'}, - 'spreadsheet': {'.xls', '.xlsx', '.csv', '.ods'}, - 'presentation': {'.ppt', '.pptx', '.odp'} - } - - self.code_extensions = { - '.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h', - '.cs', '.rb', '.php', '.swift', '.kt', '.scala', '.clj', '.r' - } - - def classify_path(self, path: str, size: int = 0) -> Tuple[Set[str], str, bool]: + def classify_path(self, path: str, size: int=0) -> Tuple[Set[str], str, bool]: p = Path(path) labels = set() primary_category = 'misc' is_build_artifact = False - parts = p.parts name_lower = p.name.lower() - for part in parts: part_lower = part.lower() if part_lower in self.build_patterns: is_build_artifact = True labels.add('build-artifact') break - if is_build_artifact: for artifact_type, patterns in self.artifact_patterns.items(): - if any(part.lower() in patterns for part in parts) or p.suffix in patterns: + if any((part.lower() in patterns for part in parts)) or p.suffix in patterns: primary_category = f'artifacts/{artifact_type}' labels.add('artifact') - return labels, primary_category, is_build_artifact - + return (labels, primary_category, is_build_artifact) if '.git' in parts: labels.add('vcs') primary_category = 'infra/git-infrastructure' - return labels, primary_category, False - + return (labels, primary_category, False) for category, keywords in self.category_keywords.items(): - if any(kw in name_lower or any(kw in part.lower() for part in parts) for kw in keywords): + if any((kw in name_lower or any((kw in part.lower() for part in parts)) for kw in keywords)): labels.add(category) primary_category = category break - for media_type, extensions in self.media_extensions.items(): if p.suffix.lower() in extensions: labels.add(media_type) labels.add('media') primary_category = f'user/{media_type}' break - if p.suffix.lower() in self.code_extensions: labels.add('code') if primary_category == 'misc': primary_category = 'dev' - if size > 100 * 1024 * 1024: labels.add('large-file') - - if any(kw in name_lower for kw in ['test', 'spec', 'mock']): + if any((kw in name_lower for kw in ['test', 'spec', 'mock'])): labels.add('test') - - if any(kw in name_lower for kw in ['config', 'settings', 'env']): + if any((kw in name_lower for kw in ['config', 'settings', 'env'])): labels.add('config') - - return labels, primary_category, is_build_artifact + return (labels, primary_category, is_build_artifact) def suggest_target_path(self, source_path: str, category: str, labels: Set[str]) -> str: p = Path(source_path) - if 'build-artifact' in labels: return f'trash/build-artifacts/{source_path}' - if category.startswith('artifacts/'): artifact_type = category.split('/')[-1] return f'artifacts/{artifact_type}/{p.name}' - if 
category.startswith('user/'): media_type = category.split('/')[-1] return f'user/{media_type}/{p.name}' - parts = [part for part in p.parts if part not in self.build_patterns] if len(parts) > 3: project_name = parts[0] if parts else 'misc' - return f'{category}/{project_name}/{"/".join(parts[1:])}' - + return f"{category}/{project_name}/{'/'.join(parts[1:])}" return f'{category}/{source_path}' diff --git a/app/classification/engine.py b/app/classification/engine.py index fbf105d..226b566 100644 --- a/app/classification/engine.py +++ b/app/classification/engine.py @@ -1,350 +1,148 @@ -"""Main classification engine""" from pathlib import Path from typing import Optional, Callable import psycopg2 - from .rules import RuleBasedClassifier from .ml import create_ml_classifier, DummyMLClassifier from ..shared.models import ProcessingStats from ..shared.config import DatabaseConfig from ..shared.logger import ProgressLogger - class ClassificationEngine: - """Engine for classifying files""" - def __init__( - self, - db_config: DatabaseConfig, - logger: ProgressLogger, - use_ml: bool = False - ): - """Initialize classification engine - - Args: - db_config: Database configuration - logger: Progress logger - use_ml: Whether to use ML classification in addition to rules - """ + def __init__(self, db_config: DatabaseConfig, logger: ProgressLogger, use_ml: bool=False): self.db_config = db_config self.logger = logger self.rule_classifier = RuleBasedClassifier() self.ml_classifier = create_ml_classifier() if use_ml else None - self.use_ml = use_ml and not isinstance(self.ml_classifier, DummyMLClassifier) + self.use_ml = use_ml and (not isinstance(self.ml_classifier, DummyMLClassifier)) self._connection = None def _get_connection(self): - """Get or create database connection""" if self._connection is None or self._connection.closed: - self._connection = psycopg2.connect( - host=self.db_config.host, - port=self.db_config.port, - database=self.db_config.database, - user=self.db_config.user, - password=self.db_config.password - ) + self._connection = psycopg2.connect(host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password) return self._connection - def classify_all( - self, - disk: Optional[str] = None, - batch_size: int = 1000, - progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None - ) -> ProcessingStats: - """Classify all files in database - - Args: - disk: Optional disk filter - batch_size: Number of files to process per batch - progress_callback: Optional callback for progress updates - - Returns: - ProcessingStats with classification statistics - """ - self.logger.section("Starting Classification") - + def classify_all(self, disk: Optional[str]=None, batch_size: int=1000, progress_callback: Optional[Callable[[int, int, ProcessingStats], None]]=None) -> ProcessingStats: + self.logger.section('Starting Classification') conn = self._get_connection() cursor = conn.cursor() - - # Get files without categories if disk: - cursor.execute(""" - SELECT path, checksum - FROM files - WHERE disk_label = %s AND category IS NULL - """, (disk,)) + cursor.execute('\n SELECT path, checksum\n FROM files\n WHERE disk_label = %s AND category IS NULL\n ', (disk,)) else: - cursor.execute(""" - SELECT path, checksum - FROM files - WHERE category IS NULL - """) - + cursor.execute('\n SELECT path, checksum\n FROM files\n WHERE category IS NULL\n ') files_to_classify = cursor.fetchall() total_files = 
len(files_to_classify) - - self.logger.info(f"Found {total_files} files to classify") - + self.logger.info(f'Found {total_files} files to classify') stats = ProcessingStats() batch = [] - for path_str, checksum in files_to_classify: path = Path(path_str) - - # Classify using rules first category = self.rule_classifier.classify(path) - - # If no rule match and ML is available, try ML if category is None and self.use_ml and self.ml_classifier: category = self.ml_classifier.classify(path) - - # If still no category, assign default if category is None: - category = "temp/processing" - + category = 'temp/processing' batch.append((category, str(path))) stats.files_processed += 1 - - # Batch update if len(batch) >= batch_size: self._update_categories(cursor, batch) conn.commit() batch.clear() - - # Progress callback if progress_callback: progress_callback(stats.files_processed, total_files, stats) - - # Log progress if stats.files_processed % (batch_size * 10) == 0: - self.logger.progress( - stats.files_processed, - total_files, - prefix="Files classified", - elapsed_seconds=stats.elapsed_seconds - ) - - # Update remaining batch + self.logger.progress(stats.files_processed, total_files, prefix='Files classified', elapsed_seconds=stats.elapsed_seconds) if batch: self._update_categories(cursor, batch) conn.commit() - stats.files_succeeded = stats.files_processed - cursor.close() - - self.logger.info( - f"Classification complete: {stats.files_processed} files in {stats.elapsed_seconds:.1f}s" - ) - + self.logger.info(f'Classification complete: {stats.files_processed} files in {stats.elapsed_seconds:.1f}s') return stats def _update_categories(self, cursor, batch: list[tuple[str, str]]): - """Update categories in batch - - Args: - cursor: Database cursor - batch: List of (category, path) tuples - """ from psycopg2.extras import execute_batch - - query = """ - UPDATE files - SET category = %s - WHERE path = %s - """ - + query = '\n UPDATE files\n SET category = %s\n WHERE path = %s\n ' execute_batch(cursor, query, batch) def classify_path(self, path: Path) -> Optional[str]: - """Classify a single path - - Args: - path: Path to classify - - Returns: - Category name or None - """ - # Try rules first category = self.rule_classifier.classify(path) - - # Try ML if available if category is None and self.use_ml and self.ml_classifier: category = self.ml_classifier.classify(path) - return category def get_category_stats(self) -> dict[str, dict]: - """Get statistics by category - - Returns: - Dictionary mapping category to statistics - """ conn = self._get_connection() cursor = conn.cursor() - - cursor.execute(""" - SELECT - category, - COUNT(*) as file_count, - SUM(size) as total_size - FROM files - WHERE category IS NOT NULL - GROUP BY category - ORDER BY total_size DESC - """) - + cursor.execute('\n SELECT\n category,\n COUNT(*) as file_count,\n SUM(size) as total_size\n FROM files\n WHERE category IS NOT NULL\n GROUP BY category\n ORDER BY total_size DESC\n ') stats = {} for category, file_count, total_size in cursor.fetchall(): - stats[category] = { - 'file_count': file_count, - 'total_size': total_size - } - + stats[category] = {'file_count': file_count, 'total_size': total_size} cursor.close() - return stats def get_uncategorized_count(self) -> int: - """Get count of uncategorized files - - Returns: - Number of files without category - """ conn = self._get_connection() cursor = conn.cursor() - - cursor.execute("SELECT COUNT(*) FROM files WHERE category IS NULL") + cursor.execute('SELECT COUNT(*) FROM 
files WHERE category IS NULL') count = cursor.fetchone()[0] - cursor.close() - return count - def reclassify_category( - self, - old_category: str, - new_category: str - ) -> int: - """Reclassify all files in a category - - Args: - old_category: Current category - new_category: New category - - Returns: - Number of files reclassified - """ - self.logger.info(f"Reclassifying {old_category} -> {new_category}") - + def reclassify_category(self, old_category: str, new_category: str) -> int: + self.logger.info(f'Reclassifying {old_category} -> {new_category}') conn = self._get_connection() cursor = conn.cursor() - - cursor.execute(""" - UPDATE files - SET category = %s - WHERE category = %s - """, (new_category, old_category)) - + cursor.execute('\n UPDATE files\n SET category = %s\n WHERE category = %s\n ', (new_category, old_category)) count = cursor.rowcount conn.commit() cursor.close() - - self.logger.info(f"Reclassified {count} files") - + self.logger.info(f'Reclassified {count} files') return count - def train_ml_classifier( - self, - min_samples: int = 10 - ) -> bool: - """Train ML classifier from existing categorized data - - Args: - min_samples: Minimum samples per category - - Returns: - True if training successful - """ + def train_ml_classifier(self, min_samples: int=10) -> bool: if not self.use_ml or self.ml_classifier is None: - self.logger.warning("ML classifier not available") + self.logger.warning('ML classifier not available') return False - - self.logger.subsection("Training ML Classifier") - + self.logger.subsection('Training ML Classifier') conn = self._get_connection() cursor = conn.cursor() - - # Get categorized files - cursor.execute(""" - SELECT path, category - FROM files - WHERE category IS NOT NULL - """) - + cursor.execute('\n SELECT path, category\n FROM files\n WHERE category IS NOT NULL\n ') training_data = [(Path(path), category) for path, category in cursor.fetchall()] cursor.close() - if not training_data: - self.logger.warning("No training data available") + self.logger.warning('No training data available') return False - - # Count samples per category category_counts = {} for _, category in training_data: category_counts[category] = category_counts.get(category, 0) + 1 - - # Filter categories with enough samples - filtered_data = [ - (path, category) - for path, category in training_data - if category_counts[category] >= min_samples - ] - + filtered_data = [(path, category) for path, category in training_data if category_counts[category] >= min_samples] if not filtered_data: - self.logger.warning(f"No categories with >= {min_samples} samples") + self.logger.warning(f'No categories with >= {min_samples} samples') return False - - self.logger.info(f"Training with {len(filtered_data)} samples") - + self.logger.info(f'Training with {len(filtered_data)} samples') try: self.ml_classifier.train(filtered_data) - self.logger.info("ML classifier trained successfully") + self.logger.info('ML classifier trained successfully') return True except Exception as e: - self.logger.error(f"Failed to train ML classifier: {e}") + self.logger.error(f'Failed to train ML classifier: {e}') return False def get_all_categories(self) -> list[str]: - """Get all categories from database - - Returns: - List of category names - """ conn = self._get_connection() cursor = conn.cursor() - - cursor.execute(""" - SELECT DISTINCT category - FROM files - WHERE category IS NOT NULL - ORDER BY category - """) - + cursor.execute('\n SELECT DISTINCT category\n FROM files\n WHERE category IS NOT NULL\n 
ORDER BY category\n ') categories = [row[0] for row in cursor.fetchall()] cursor.close() - return categories def close(self): - """Close database connection""" - if self._connection and not self._connection.closed: + if self._connection and (not self._connection.closed): self._connection.close() def __enter__(self): - """Context manager entry""" return self def __exit__(self, exc_type, exc_val, exc_tb): - """Context manager exit""" self.close() diff --git a/app/classification/ml.py b/app/classification/ml.py index d334d90..942fa7e 100644 --- a/app/classification/ml.py +++ b/app/classification/ml.py @@ -1,8 +1,6 @@ -"""ML-based classification (optional, using sklearn if available)""" from pathlib import Path from typing import Optional, List, Tuple import pickle - try: from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.naive_bayes import MultinomialNB @@ -11,100 +9,41 @@ try: except ImportError: SKLEARN_AVAILABLE = False - class MLClassifier: - """Machine learning-based file classifier - - Uses path-based features and optional metadata to classify files. - Requires scikit-learn to be installed. - """ def __init__(self): - """Initialize ML classifier""" if not SKLEARN_AVAILABLE: - raise ImportError( - "scikit-learn is required for ML classification. " - "Install with: pip install scikit-learn" - ) - + raise ImportError('scikit-learn is required for ML classification. Install with: pip install scikit-learn') self.model: Optional[Pipeline] = None self.categories: List[str] = [] self._is_trained = False def _extract_features(self, path: Path) -> str: - """Extract features from path - - Args: - path: Path to extract features from - - Returns: - Feature string - """ - # Convert path to feature string - # Include: path parts, extension, filename parts = path.parts extension = path.suffix filename = path.name - features = [] - - # Add path components features.extend(parts) - - # Add extension if extension: - features.append(f"ext:{extension}") - - # Add filename components (split on common separators) + features.append(f'ext:{extension}') name_parts = filename.replace('-', ' ').replace('_', ' ').replace('.', ' ').split() - features.extend([f"name:{part}" for part in name_parts]) - + features.extend([f'name:{part}' for part in name_parts]) return ' '.join(features) def train(self, training_data: List[Tuple[Path, str]]) -> None: - """Train the classifier - - Args: - training_data: List of (path, category) tuples - """ if not training_data: - raise ValueError("Training data cannot be empty") - - # Extract features and labels + raise ValueError('Training data cannot be empty') X = [self._extract_features(path) for path, _ in training_data] y = [category for _, category in training_data] - - # Store unique categories self.categories = sorted(set(y)) - - # Create and train pipeline - self.model = Pipeline([ - ('tfidf', TfidfVectorizer( - max_features=1000, - ngram_range=(1, 2), - min_df=1 - )), - ('classifier', MultinomialNB()) - ]) - + self.model = Pipeline([('tfidf', TfidfVectorizer(max_features=1000, ngram_range=(1, 2), min_df=1)), ('classifier', MultinomialNB())]) self.model.fit(X, y) self._is_trained = True - def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]: - """Classify a file path - - Args: - path: Path to classify - file_type: Optional file type hint (not used in ML classifier) - - Returns: - Category name or None if not trained - """ + def classify(self, path: Path, file_type: Optional[str]=None) -> Optional[str]: if not self._is_trained or 
self.model is None: return None - features = self._extract_features(path) - try: prediction = self.model.predict([features])[0] return prediction @@ -112,158 +51,77 @@ class MLClassifier: return None def predict_proba(self, path: Path) -> dict[str, float]: - """Get prediction probabilities for all categories - - Args: - path: Path to classify - - Returns: - Dictionary mapping category to probability - """ if not self._is_trained or self.model is None: return {} - features = self._extract_features(path) - try: probabilities = self.model.predict_proba([features])[0] - return { - category: float(prob) - for category, prob in zip(self.categories, probabilities) - } + return {category: float(prob) for category, prob in zip(self.categories, probabilities)} except Exception: return {} def save_model(self, model_path: Path) -> None: - """Save trained model to disk - - Args: - model_path: Path to save model - """ if not self._is_trained: - raise ValueError("Cannot save untrained model") - - model_data = { - 'model': self.model, - 'categories': self.categories, - 'is_trained': self._is_trained - } - + raise ValueError('Cannot save untrained model') + model_data = {'model': self.model, 'categories': self.categories, 'is_trained': self._is_trained} with open(model_path, 'wb') as f: pickle.dump(model_data, f) def load_model(self, model_path: Path) -> None: - """Load trained model from disk - - Args: - model_path: Path to model file - """ with open(model_path, 'rb') as f: model_data = pickle.load(f) - self.model = model_data['model'] self.categories = model_data['categories'] self._is_trained = model_data['is_trained'] @property def is_trained(self) -> bool: - """Check if model is trained""" return self._is_trained - class DummyMLClassifier: - """Dummy ML classifier for when sklearn is not available""" def __init__(self): - """Initialize dummy classifier""" pass def train(self, training_data: List[Tuple[Path, str]]) -> None: - """Dummy train method""" - raise NotImplementedError( - "ML classification requires scikit-learn. " - "Install with: pip install scikit-learn" - ) + raise NotImplementedError('ML classification requires scikit-learn. 
Install with: pip install scikit-learn') - def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]: - """Dummy classify method""" + def classify(self, path: Path, file_type: Optional[str]=None) -> Optional[str]: return None def predict_proba(self, path: Path) -> dict[str, float]: - """Dummy predict_proba method""" return {} def save_model(self, model_path: Path) -> None: - """Dummy save_model method""" - raise NotImplementedError("ML classification not available") + raise NotImplementedError('ML classification not available') def load_model(self, model_path: Path) -> None: - """Dummy load_model method""" - raise NotImplementedError("ML classification not available") + raise NotImplementedError('ML classification not available') @property def is_trained(self) -> bool: - """Check if model is trained""" return False - def create_ml_classifier() -> MLClassifier | DummyMLClassifier: - """Create ML classifier if sklearn is available, otherwise return dummy - - Returns: - MLClassifier or DummyMLClassifier - """ if SKLEARN_AVAILABLE: return MLClassifier() else: return DummyMLClassifier() - -def train_from_database( - db_connection, - min_samples_per_category: int = 10 -) -> MLClassifier | DummyMLClassifier: - """Train ML classifier from database - - Args: - db_connection: Database connection - min_samples_per_category: Minimum samples required per category - - Returns: - Trained classifier - """ +def train_from_database(db_connection, min_samples_per_category: int=10) -> MLClassifier | DummyMLClassifier: classifier = create_ml_classifier() - if isinstance(classifier, DummyMLClassifier): return classifier - - # Query classified files from database cursor = db_connection.cursor() - cursor.execute(""" - SELECT path, category - FROM files - WHERE category IS NOT NULL - """) - + cursor.execute('\n SELECT path, category\n FROM files\n WHERE category IS NOT NULL\n ') training_data = [(Path(path), category) for path, category in cursor.fetchall()] cursor.close() - if not training_data: return classifier - - # Count samples per category category_counts = {} for _, category in training_data: category_counts[category] = category_counts.get(category, 0) + 1 - - # Filter to categories with enough samples - filtered_data = [ - (path, category) - for path, category in training_data - if category_counts[category] >= min_samples_per_category - ] - + filtered_data = [(path, category) for path, category in training_data if category_counts[category] >= min_samples_per_category] if filtered_data: classifier.train(filtered_data) - return classifier diff --git a/app/classification/rules.py b/app/classification/rules.py index b194c18..f7dbb76 100644 --- a/app/classification/rules.py +++ b/app/classification/rules.py @@ -1,282 +1,60 @@ -"""Rule-based classification engine""" from pathlib import Path from typing import Optional import fnmatch - from ._protocols import ClassificationRule - class RuleBasedClassifier: - """Rule-based file classifier using pattern matching""" def __init__(self): - """Initialize rule-based classifier""" self.rules: list[ClassificationRule] = [] self._load_default_rules() def _load_default_rules(self): - """Load default classification rules based on ARCHITECTURE.md""" - - # Build artifacts and caches - self.add_rule(ClassificationRule( - name="maven_cache", - category="artifacts/java/maven", - patterns=["**/.m2/**", "**/.maven/**", "**/maven-central-cache/**"], - priority=10, - description="Maven repository and cache" - )) - - self.add_rule(ClassificationRule( - 
name="gradle_cache", - category="artifacts/java/gradle", - patterns=["**/.gradle/**", "**/gradle-cache/**", "**/gradle-build-cache/**"], - priority=10, - description="Gradle cache and artifacts" - )) - - self.add_rule(ClassificationRule( - name="python_cache", - category="cache/pycache", - patterns=["**/__pycache__/**", "**/*.pyc", "**/*.pyo"], - priority=10, - description="Python cache files" - )) - - self.add_rule(ClassificationRule( - name="python_artifacts", - category="artifacts/python", - patterns=["**/pip-cache/**", "**/pypi-cache/**", "**/wheelhouse/**"], - priority=10, - description="Python package artifacts" - )) - - self.add_rule(ClassificationRule( - name="node_modules", - category="cache/node_modules-archive", - patterns=["**/node_modules/**"], - priority=10, - description="Node.js modules" - )) - - self.add_rule(ClassificationRule( - name="node_cache", - category="artifacts/node", - patterns=["**/.npm/**", "**/npm-registry/**", "**/yarn-cache/**", "**/pnpm-store/**"], - priority=10, - description="Node.js package managers cache" - )) - - self.add_rule(ClassificationRule( - name="go_cache", - category="artifacts/go", - patterns=["**/goproxy-cache/**", "**/go/pkg/mod/**", "**/go-module-cache/**"], - priority=10, - description="Go module cache" - )) - - # Version control - self.add_rule(ClassificationRule( - name="git_repos", - category="development/git-infrastructure", - patterns=["**/.git/**", "**/gitea/repositories/**"], - priority=15, - description="Git repositories and infrastructure" - )) - - self.add_rule(ClassificationRule( - name="gitea", - category="development/gitea", - patterns=["**/gitea/**"], - priority=12, - description="Gitea server data" - )) - - # Databases - self.add_rule(ClassificationRule( - name="postgresql", - category="databases/postgresql", - patterns=["**/postgresql/**", "**/postgres/**", "**/*.sql"], - priority=10, - description="PostgreSQL databases" - )) - - self.add_rule(ClassificationRule( - name="mysql", - category="databases/mysql", - patterns=["**/mysql/**", "**/mariadb/**"], - priority=10, - description="MySQL/MariaDB databases" - )) - - self.add_rule(ClassificationRule( - name="mongodb", - category="databases/mongodb", - patterns=["**/mongodb/**", "**/mongo/**"], - priority=10, - description="MongoDB databases" - )) - - self.add_rule(ClassificationRule( - name="redis", - category="databases/redis", - patterns=["**/redis/**", "**/*.rdb"], - priority=10, - description="Redis databases" - )) - - self.add_rule(ClassificationRule( - name="sqlite", - category="databases/sqlite", - patterns=["**/*.db", "**/*.sqlite", "**/*.sqlite3"], - priority=8, - description="SQLite databases" - )) - - # LLM and AI models - self.add_rule(ClassificationRule( - name="llm_models", - category="cache/llm-models", - patterns=[ - "**/hugging-face/**", - "**/huggingface/**", - "**/.cache/huggingface/**", - "**/models/**/*.bin", - "**/models/**/*.onnx", - "**/models/**/*.safetensors", - "**/llm*/**", - "**/openai-cache/**" - ], - priority=12, - description="LLM and AI model files" - )) - - # Docker and containers - self.add_rule(ClassificationRule( - name="docker_volumes", - category="apps/volumes/docker-volumes", - patterns=["**/docker/volumes/**", "**/var/lib/docker/volumes/**"], - priority=10, - description="Docker volumes" - )) - - self.add_rule(ClassificationRule( - name="app_data", - category="apps/volumes/app-data", - patterns=["**/app-data/**", "**/application-data/**"], - priority=8, - description="Application data" - )) - - # Build outputs - 
self.add_rule(ClassificationRule( - name="build_output", - category="development/build-tools", - patterns=["**/target/**", "**/build/**", "**/dist/**", "**/out/**"], - priority=5, - description="Build output directories" - )) - - # Backups - self.add_rule(ClassificationRule( - name="system_backups", - category="backups/system", - patterns=["**/backup/**", "**/backups/**", "**/*.bak", "**/*.backup"], - priority=10, - description="System backups" - )) - - self.add_rule(ClassificationRule( - name="database_backups", - category="backups/database", - patterns=["**/*.sql.gz", "**/*.dump", "**/db-backup/**"], - priority=11, - description="Database backups" - )) - - # Archives - self.add_rule(ClassificationRule( - name="archives", - category="backups/archive", - patterns=["**/*.tar", "**/*.tar.gz", "**/*.tgz", "**/*.zip", "**/*.7z"], - priority=5, - description="Archive files" - )) + self.add_rule(ClassificationRule(name='maven_cache', category='artifacts/java/maven', patterns=['**/.m2/**', '**/.maven/**', '**/maven-central-cache/**'], priority=10, description='Maven repository and cache')) + self.add_rule(ClassificationRule(name='gradle_cache', category='artifacts/java/gradle', patterns=['**/.gradle/**', '**/gradle-cache/**', '**/gradle-build-cache/**'], priority=10, description='Gradle cache and artifacts')) + self.add_rule(ClassificationRule(name='python_cache', category='cache/pycache', patterns=['**/__pycache__/**', '**/*.pyc', '**/*.pyo'], priority=10, description='Python cache files')) + self.add_rule(ClassificationRule(name='python_artifacts', category='artifacts/python', patterns=['**/pip-cache/**', '**/pypi-cache/**', '**/wheelhouse/**'], priority=10, description='Python package artifacts')) + self.add_rule(ClassificationRule(name='node_modules', category='cache/node_modules-archive', patterns=['**/node_modules/**'], priority=10, description='Node.js modules')) + self.add_rule(ClassificationRule(name='node_cache', category='artifacts/node', patterns=['**/.npm/**', '**/npm-registry/**', '**/yarn-cache/**', '**/pnpm-store/**'], priority=10, description='Node.js package managers cache')) + self.add_rule(ClassificationRule(name='go_cache', category='artifacts/go', patterns=['**/goproxy-cache/**', '**/go/pkg/mod/**', '**/go-module-cache/**'], priority=10, description='Go module cache')) + self.add_rule(ClassificationRule(name='git_repos', category='development/git-infrastructure', patterns=['**/.git/**', '**/gitea/repositories/**'], priority=15, description='Git repositories and infrastructure')) + self.add_rule(ClassificationRule(name='gitea', category='development/gitea', patterns=['**/gitea/**'], priority=12, description='Gitea server data')) + self.add_rule(ClassificationRule(name='postgresql', category='databases/postgresql', patterns=['**/postgresql/**', '**/postgres/**', '**/*.sql'], priority=10, description='PostgreSQL databases')) + self.add_rule(ClassificationRule(name='mysql', category='databases/mysql', patterns=['**/mysql/**', '**/mariadb/**'], priority=10, description='MySQL/MariaDB databases')) + self.add_rule(ClassificationRule(name='mongodb', category='databases/mongodb', patterns=['**/mongodb/**', '**/mongo/**'], priority=10, description='MongoDB databases')) + self.add_rule(ClassificationRule(name='redis', category='databases/redis', patterns=['**/redis/**', '**/*.rdb'], priority=10, description='Redis databases')) + self.add_rule(ClassificationRule(name='sqlite', category='databases/sqlite', patterns=['**/*.db', '**/*.sqlite', '**/*.sqlite3'], priority=8, 
description='SQLite databases')) + self.add_rule(ClassificationRule(name='llm_models', category='cache/llm-models', patterns=['**/hugging-face/**', '**/huggingface/**', '**/.cache/huggingface/**', '**/models/**/*.bin', '**/models/**/*.onnx', '**/models/**/*.safetensors', '**/llm*/**', '**/openai-cache/**'], priority=12, description='LLM and AI model files')) + self.add_rule(ClassificationRule(name='docker_volumes', category='apps/volumes/docker-volumes', patterns=['**/docker/volumes/**', '**/var/lib/docker/volumes/**'], priority=10, description='Docker volumes')) + self.add_rule(ClassificationRule(name='app_data', category='apps/volumes/app-data', patterns=['**/app-data/**', '**/application-data/**'], priority=8, description='Application data')) + self.add_rule(ClassificationRule(name='build_output', category='development/build-tools', patterns=['**/target/**', '**/build/**', '**/dist/**', '**/out/**'], priority=5, description='Build output directories')) + self.add_rule(ClassificationRule(name='system_backups', category='backups/system', patterns=['**/backup/**', '**/backups/**', '**/*.bak', '**/*.backup'], priority=10, description='System backups')) + self.add_rule(ClassificationRule(name='database_backups', category='backups/database', patterns=['**/*.sql.gz', '**/*.dump', '**/db-backup/**'], priority=11, description='Database backups')) + self.add_rule(ClassificationRule(name='archives', category='backups/archive', patterns=['**/*.tar', '**/*.tar.gz', '**/*.tgz', '**/*.zip', '**/*.7z'], priority=5, description='Archive files')) def add_rule(self, rule: ClassificationRule) -> None: - """Add a classification rule - - Args: - rule: Rule to add - """ self.rules.append(rule) - # Sort rules by priority (higher priority first) self.rules.sort(key=lambda r: r.priority, reverse=True) def remove_rule(self, rule_name: str) -> None: - """Remove a rule by name - - Args: - rule_name: Name of rule to remove - """ self.rules = [r for r in self.rules if r.name != rule_name] def match_path(self, path: Path) -> Optional[str]: - """Match path against rules - - Args: - path: Path to match - - Returns: - Category name or None if no match - """ path_str = str(path) - - # Try to match each rule in priority order for rule in self.rules: for pattern in rule.patterns: if fnmatch.fnmatch(path_str, pattern): return rule.category - return None - def classify(self, path: Path, file_type: Optional[str] = None) -> Optional[str]: - """Classify a file path - - Args: - path: Path to classify - file_type: Optional file type hint - - Returns: - Category name or None if no match - """ + def classify(self, path: Path, file_type: Optional[str]=None) -> Optional[str]: return self.match_path(path) def get_category_rules(self, category: str) -> list[ClassificationRule]: - """Get all rules for a category - - Args: - category: Category name - - Returns: - List of rules for the category - """ return [r for r in self.rules if r.category == category] def get_all_categories(self) -> set[str]: - """Get all defined categories - - Returns: - Set of category names - """ return {r.category for r in self.rules} - def get_rules_by_priority(self, min_priority: int = 0) -> list[ClassificationRule]: - """Get rules above a minimum priority - - Args: - min_priority: Minimum priority threshold - - Returns: - List of rules with priority >= min_priority - """ + def get_rules_by_priority(self, min_priority: int=0) -> list[ClassificationRule]: return [r for r in self.rules if r.priority >= min_priority] diff --git a/sql/setup_database.sql 
b/sql/legacy_setup.sql
similarity index 100%
rename from sql/setup_database.sql
rename to sql/legacy_setup.sql
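
Not part of the patch: a minimal sketch of how the condensed rule-based classifier from app/classification/rules.py could be exercised after this change, as a quick sanity check that the one-line rule definitions still behave as before. It assumes the `app` package is importable from the repository root; the `terraform_state` rule and the sample paths are hypothetical, for illustration only.

```python
from pathlib import Path

from app.classification.rules import RuleBasedClassifier
from app.classification._protocols import ClassificationRule

classifier = RuleBasedClassifier()

# Default rules match fnmatch-style glob patterns, highest priority first.
print(classifier.classify(Path('/srv/ci/project/.gradle/caches/modules-2/files')))
# expected: 'artifacts/java/gradle'

# A custom rule can be layered on top of the defaults.
classifier.add_rule(ClassificationRule(
    name='terraform_state',          # hypothetical rule, not in the patch
    category='infra/terraform',
    patterns=['**/*.tfstate', '**/.terraform/**'],
    priority=11,
))
print(classifier.classify(Path('/home/mike/infra/prod/terraform.tfstate')))
# expected: 'infra/terraform'
```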