from pathlib import Path
from typing import List, Set, Dict, Tuple
import re


class FileClassifier:
    """Classify file paths into categories and labels using name heuristics.

    Classification relies only on the path string (its components, file name
    and suffix) and an optional size; the filesystem is never touched.
    """

    def __init__(self):
        # Directory names that mark everything beneath them as build output.
        self.build_patterns = {
            'node_modules', '__pycache__', '.pytest_cache', 'target', 'build',
            'dist', '.gradle', 'bin', 'obj', '.next', '.nuxt', 'vendor',
            '.venv', 'venv', 'site-packages', 'bower_components',
            'jspm_packages',
        }
        # Per-ecosystem suffixes / component names used to type an artifact.
        self.artifact_patterns = {
            'java': {'.jar', '.war', '.ear', '.class'},
            'python': {'.pyc', '.pyo', '.whl', '.egg'},
            'node': {'node_modules'},
            'go': {'vendor', 'pkg'},
            'rust': {'target'},
            'docker': {'.dockerignore', 'Dockerfile'},
        }
        # Keywords matched as substrings of lowercased path components.
        # Insertion order matters: the first category with a hit wins.
        self.category_keywords = {
            'apps': {'app', 'application', 'service', 'api', 'server', 'client'},
            'infra': {'infrastructure', 'devops', 'docker', 'kubernetes',
                      'terraform', 'ansible', 'gitea', 'jenkins'},
            'dev': {'project', 'workspace', 'repo', 'src', 'code', 'dev'},
            'cache': {'cache', 'temp', 'tmp', '.cache'},
            'databases': {'postgres', 'mysql', 'redis', 'mongo', 'db', 'database'},
            'backups': {'backup', 'bak', 'snapshot', 'archive'},
            'user': {'documents', 'pictures', 'videos', 'downloads', 'desktop',
                     'music'},
            'artifacts': {'build', 'dist', 'release', 'output'},
            'temp': {'tmp', 'temp', 'staging', 'processing'},
        }
        # File suffixes grouped by media type.
        self.media_extensions = {
            'video': {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv', '.webm'},
            'audio': {'.mp3', '.flac', '.wav', '.ogg', '.m4a', '.aac'},
            'image': {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'},
            'document': {'.pdf', '.doc', '.docx', '.txt', '.md', '.odt'},
            'spreadsheet': {'.xls', '.xlsx', '.csv', '.ods'},
            'presentation': {'.ppt', '.pptx', '.odp'},
        }
        # File suffixes treated as source code.
        self.code_extensions = {
            '.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h',
            '.cs', '.rb', '.php', '.swift', '.kt', '.scala', '.clj', '.r',
        }

    def classify_path(self, path: str, size: int = 0) -> Tuple[Set[str], str, bool]:
        """Classify a path into labels and a primary category.

        Args:
            path: The path to classify; only the string is inspected.
            size: File size in bytes, used solely for the 'large-file' label.

        Returns:
            A ``(labels, primary_category, is_build_artifact)`` tuple.
            ``primary_category`` defaults to ``'misc'`` when nothing matches.
        """
        p = Path(path)
        labels: Set[str] = set()
        primary_category = 'misc'
        parts = p.parts
        lower_parts = [part.lower() for part in parts]
        name_lower = p.name.lower()
        suffix_lower = p.suffix.lower()

        is_build_artifact = any(
            part in self.build_patterns for part in lower_parts
        )
        if is_build_artifact:
            labels.add('build-artifact')
            for artifact_type, patterns in self.artifact_patterns.items():
                # Compare case-insensitively on both sides so upper-case
                # suffixes ('.JAR') and mixed-case patterns ('Dockerfile')
                # are recognised.
                lowered = {pattern.lower() for pattern in patterns}
                if suffix_lower in lowered or any(
                    part in lowered for part in lower_parts
                ):
                    # No break: when several types match, the last one in
                    # artifact_patterns order determines the category.
                    primary_category = f'artifacts/{artifact_type}'
                    labels.add('artifact')
            return (labels, primary_category, is_build_artifact)

        # Anything inside a '.git' directory is version-control plumbing.
        if '.git' in parts:
            labels.add('vcs')
            primary_category = 'infra/git-infrastructure'
            return (labels, primary_category, False)

        # First category (in declaration order) with a keyword occurring as
        # a substring of any lowercased component wins. The file name is the
        # last component, so it needs no separate check.
        for category, keywords in self.category_keywords.items():
            if any(kw in part for kw in keywords for part in lower_parts):
                labels.add(category)
                primary_category = category
                break

        # A media suffix overrides whatever keyword category was chosen.
        for media_type, extensions in self.media_extensions.items():
            if suffix_lower in extensions:
                labels.add(media_type)
                labels.add('media')
                primary_category = f'user/{media_type}'
                break

        if suffix_lower in self.code_extensions:
            labels.add('code')
            # Code only claims the category when nothing else matched.
            if primary_category == 'misc':
                primary_category = 'dev'

        if size > 100 * 1024 * 1024:  # strictly larger than 100 MiB
            labels.add('large-file')
        if any(kw in name_lower for kw in ('test', 'spec', 'mock')):
            labels.add('test')
        if any(kw in name_lower for kw in ('config', 'settings', 'env')):
            labels.add('config')

        return (labels, primary_category, is_build_artifact)

    def suggest_target_path(self, source_path: str, category: str, labels: Set[str]) -> str:
        """Suggest a destination path for a classified file.

        Args:
            source_path: The original path of the file.
            category: The primary category assigned to the file.
            labels: The labels assigned to the file.

        Returns:
            A '/'-joined target path string rooted at ``trash/``,
            ``artifacts/``, ``user/`` or the given category.
        """
        p = Path(source_path)

        if 'build-artifact' in labels:
            return f'trash/build-artifacts/{source_path}'

        if category.startswith('artifacts/'):
            artifact_type = category.split('/')[-1]
            return f'artifacts/{artifact_type}/{p.name}'

        if category.startswith('user/'):
            media_type = category.split('/')[-1]
            return f'user/{media_type}/{p.name}'

        # Drop build directories (exact, case-sensitive match) before
        # deciding whether the path is deep enough to carry a project name.
        parts = [part for part in p.parts if part not in self.build_patterns]
        if len(parts) > 3:
            # NOTE: for an absolute source_path the first component is the
            # root itself, so the result contains consecutive slashes.
            project_name = parts[0]
            return f"{category}/{project_name}/{'/'.join(parts[1:])}"

        return f'{category}/{source_path}'