125 lines
4.9 KiB
Python
125 lines
4.9 KiB
Python
from pathlib import Path
|
|
from typing import List, Set, Dict, Tuple
|
|
import re
|
|
|
|
class FileClassifier:
    """Heuristically classify filesystem paths into categories and labels.

    Classification is purely name-based: only the path components, the file
    name and the file extension are inspected. The filesystem is never
    accessed.
    """

    # Files strictly larger than this many bytes receive the 'large-file' label.
    _LARGE_FILE_BYTES = 100 * 1024 * 1024
    # Substrings of the (lowercased) file name that trigger extra labels.
    _TEST_KEYWORDS = ('test', 'spec', 'mock')
    _CONFIG_KEYWORDS = ('config', 'settings', 'env')

    def __init__(self):
        """Set up the lookup tables used by the classification heuristics."""
        # Directory names that mark everything beneath them as build output.
        self.build_patterns = {
            'node_modules', '__pycache__', '.pytest_cache', 'target', 'build', 'dist',
            '.gradle', 'bin', 'obj', '.next', '.nuxt', 'vendor', '.venv', 'venv',
            'site-packages', 'bower_components', 'jspm_packages'
        }

        # Ecosystem -> path components or file suffixes identifying its artifacts.
        self.artifact_patterns = {
            'java': {'.jar', '.war', '.ear', '.class'},
            'python': {'.pyc', '.pyo', '.whl', '.egg'},
            'node': {'node_modules'},
            'go': {'vendor', 'pkg'},
            'rust': {'target'},
            'docker': {'.dockerignore', 'Dockerfile'}
        }

        # Category -> substrings searched for in every path component.
        # Insertion order matters: the first category with a hit wins.
        # NOTE: 'tmp' and 'temp' appear under both 'cache' and 'temp'; since
        # 'cache' comes first, those two keywords always resolve to 'cache'.
        self.category_keywords = {
            'apps': {'app', 'application', 'service', 'api', 'server', 'client'},
            'infra': {'infrastructure', 'devops', 'docker', 'kubernetes', 'terraform', 'ansible', 'gitea', 'jenkins'},
            'dev': {'project', 'workspace', 'repo', 'src', 'code', 'dev'},
            'cache': {'cache', 'temp', 'tmp', '.cache'},
            'databases': {'postgres', 'mysql', 'redis', 'mongo', 'db', 'database'},
            'backups': {'backup', 'bak', 'snapshot', 'archive'},
            'user': {'documents', 'pictures', 'videos', 'downloads', 'desktop', 'music'},
            'artifacts': {'build', 'dist', 'release', 'output'},
            'temp': {'tmp', 'temp', 'staging', 'processing'}
        }

        # Media type -> lowercase file suffixes.
        self.media_extensions = {
            'video': {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv', '.webm'},
            'audio': {'.mp3', '.flac', '.wav', '.ogg', '.m4a', '.aac'},
            'image': {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'},
            'document': {'.pdf', '.doc', '.docx', '.txt', '.md', '.odt'},
            'spreadsheet': {'.xls', '.xlsx', '.csv', '.ods'},
            'presentation': {'.ppt', '.pptx', '.odp'}
        }

        # Lowercase suffixes treated as source code.
        self.code_extensions = {
            '.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h',
            '.cs', '.rb', '.php', '.swift', '.kt', '.scala', '.clj', '.r'
        }

    def _match_artifact_type(self, lowered_parts: List[str], suffix_lower: str):
        """Return the first artifact type matching the path, or None.

        Patterns are lowercased before comparison so that mixed-case entries
        such as 'Dockerfile' can match the already-lowercased path parts.
        """
        for artifact_type, patterns in self.artifact_patterns.items():
            normalized = {pattern.lower() for pattern in patterns}
            if suffix_lower in normalized or any(part in normalized for part in lowered_parts):
                return artifact_type
        return None

    @staticmethod
    def _strip_anchor(source_path: str) -> str:
        """Return source_path without its root/drive so it can be nested.

        Relative paths are returned verbatim; absolute paths are returned as
        POSIX-style relative paths (e.g. '/a/b' -> 'a/b').
        """
        p = Path(source_path)
        if not p.anchor:
            return source_path
        return p.relative_to(p.anchor).as_posix()

    def classify_path(self, path: str, size: int = 0) -> Tuple[Set[str], str, bool]:
        """Classify a path by its components, file name and extension.

        Args:
            path: File or directory path to classify (not required to exist).
            size: File size in bytes; only used for the 'large-file' label.

        Returns:
            A tuple ``(labels, primary_category, is_build_artifact)`` where
            ``labels`` is a set of descriptive tags, ``primary_category`` is
            the chosen category ('misc' when nothing matched) and
            ``is_build_artifact`` tells whether any path component is a known
            build directory.
        """
        p = Path(path)
        parts = p.parts
        # Lowercase once up front; every case-insensitive check reuses these.
        lowered_parts = [part.lower() for part in parts]
        name_lower = p.name.lower()
        suffix_lower = p.suffix.lower()

        labels: Set[str] = set()
        primary_category = 'misc'

        is_build_artifact = any(part in self.build_patterns for part in lowered_parts)
        if is_build_artifact:
            labels.add('build-artifact')
            artifact_type = self._match_artifact_type(lowered_parts, suffix_lower)
            if artifact_type is not None:
                # A recognised artifact needs no further classification.
                labels.add('artifact')
                return labels, f'artifacts/{artifact_type}', is_build_artifact

        if '.git' in parts:
            labels.add('vcs')
            # Report the real flag so it never contradicts the
            # 'build-artifact' label that may already be present.
            return labels, 'infra/git-infrastructure', is_build_artifact

        # First category (in insertion order) with a keyword contained in any
        # path component wins. The file name is the last component, so it is
        # covered by the same scan.
        for category, keywords in self.category_keywords.items():
            if any(kw in part for kw in keywords for part in lowered_parts):
                labels.add(category)
                primary_category = category
                break

        # A media extension overrides the keyword-based category.
        for media_type, extensions in self.media_extensions.items():
            if suffix_lower in extensions:
                labels.add(media_type)
                labels.add('media')
                primary_category = f'user/{media_type}'
                break

        if suffix_lower in self.code_extensions:
            labels.add('code')
            # Code only decides the category when nothing else did.
            if primary_category == 'misc':
                primary_category = 'dev'

        if size > self._LARGE_FILE_BYTES:
            labels.add('large-file')

        if any(kw in name_lower for kw in self._TEST_KEYWORDS):
            labels.add('test')

        if any(kw in name_lower for kw in self._CONFIG_KEYWORDS):
            labels.add('config')

        return labels, primary_category, is_build_artifact

    def suggest_target_path(self, source_path: str, category: str, labels: Set[str]) -> str:
        """Suggest a relative destination path for an already classified file.

        Args:
            source_path: Original path of the file.
            category: Primary category, e.g. as returned by ``classify_path``.
            labels: Labels, e.g. as returned by ``classify_path``.

        Returns:
            A '/'-separated relative target path. Absolute source paths have
            their root stripped so the result never contains '//' segments.
        """
        p = Path(source_path)
        relative = self._strip_anchor(source_path)

        if 'build-artifact' in labels:
            return f'trash/build-artifacts/{relative}'

        if category.startswith('artifacts/'):
            artifact_type = category.split('/')[-1]
            return f'artifacts/{artifact_type}/{p.name}'

        if category.startswith('user/'):
            media_type = category.split('/')[-1]
            return f'user/{media_type}/{p.name}'

        # Drop build directories, matched case-insensitively like classify_path.
        parts = [
            part for part in Path(relative).parts
            if part.lower() not in self.build_patterns
        ]
        if len(parts) > 3:
            # Deep paths are rebuilt from the filtered components.
            return f'{category}/{"/".join(parts)}'

        return f'{category}/{relative}'
|