75 lines
4.5 KiB
Python
75 lines
4.5 KiB
Python
from pathlib import Path
|
|
from typing import List, Set, Dict, Tuple
|
|
import re
|
|
|
|
class FileClassifier:
    """Heuristically label filesystem paths and suggest a relocation target.

    All decisions are based purely on the textual path (directory names,
    file name, suffix) and an optional size; the filesystem is never touched.
    """

    def __init__(self):
        """Populate the lookup tables consulted by the classification methods."""
        # Directory/file names that, when equal (case-insensitively) to any
        # path component, mark the whole path as build output.
        self.build_patterns = {
            'node_modules', '__pycache__', '.pytest_cache', 'target', 'build',
            'dist', '.gradle', 'bin', 'obj', '.next', '.nuxt', 'vendor',
            '.venv', 'venv', 'site-packages', 'bower_components',
            'jspm_packages',
        }
        # Per-ecosystem markers: entries are either suffixes (leading dot) or
        # path-component names; both are matched case-insensitively.
        self.artifact_patterns = {
            'java': {'.jar', '.war', '.ear', '.class'},
            'python': {'.pyc', '.pyo', '.whl', '.egg'},
            'node': {'node_modules'},
            'go': {'vendor', 'pkg'},
            'rust': {'target'},
            'docker': {'.dockerignore', 'Dockerfile'},
        }
        # Category -> keywords searched as *substrings* of every lowercased
        # path component. Dict order matters: the first matching category wins.
        self.category_keywords = {
            'apps': {'app', 'application', 'service', 'api', 'server', 'client'},
            'infra': {
                'infrastructure', 'devops', 'docker', 'kubernetes',
                'terraform', 'ansible', 'gitea', 'jenkins',
            },
            'dev': {'project', 'workspace', 'repo', 'src', 'code', 'dev'},
            'cache': {'cache', 'temp', 'tmp', '.cache'},
            'databases': {'postgres', 'mysql', 'redis', 'mongo', 'db', 'database'},
            'backups': {'backup', 'bak', 'snapshot', 'archive'},
            'user': {
                'documents', 'pictures', 'videos', 'downloads', 'desktop',
                'music',
            },
            'artifacts': {'build', 'dist', 'release', 'output'},
            'temp': {'tmp', 'temp', 'staging', 'processing'},
        }
        # Media type -> lowercase file suffixes.
        self.media_extensions = {
            'video': {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv', '.webm'},
            'audio': {'.mp3', '.flac', '.wav', '.ogg', '.m4a', '.aac'},
            'image': {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'},
            'document': {'.pdf', '.doc', '.docx', '.txt', '.md', '.odt'},
            'spreadsheet': {'.xls', '.xlsx', '.csv', '.ods'},
            'presentation': {'.ppt', '.pptx', '.odp'},
        }
        # Lowercase suffixes treated as source code.
        self.code_extensions = {
            '.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h',
            '.cs', '.rb', '.php', '.swift', '.kt', '.scala', '.clj', '.r',
        }

    def classify_path(self, path: str, size: int = 0) -> Tuple[Set[str], str, bool]:
        """Classify *path* into labels and a primary category.

        Args:
            path: The path to classify; only its text is inspected.
            size: Size in bytes; values above 100 MiB add the ``large-file``
                label. Ignored for build artifacts and ``.git`` paths, which
                return early.

        Returns:
            A ``(labels, primary_category, is_build_artifact)`` tuple.
            ``primary_category`` defaults to ``'misc'`` when nothing matches.
        """
        p = Path(path)
        labels: Set[str] = set()
        primary_category = 'misc'
        parts_lower = [part.lower() for part in p.parts]
        name_lower = p.name.lower()
        suffix_lower = p.suffix.lower()

        # Build output short-circuits every other heuristic.
        if any(part in self.build_patterns for part in parts_lower):
            labels.add('build-artifact')
            for artifact_type, patterns in self.artifact_patterns.items():
                # Lowercase both sides so 'Dockerfile' and '.JAR' can match.
                lowered = {pattern.lower() for pattern in patterns}
                if suffix_lower in lowered or not lowered.isdisjoint(parts_lower):
                    # No break: when several types match, the last one in
                    # dict order determines the category.
                    primary_category = f'artifacts/{artifact_type}'
                    labels.add('artifact')
            return (labels, primary_category, True)

        # The '.git' check is an exact, case-sensitive component match.
        if '.git' in p.parts:
            labels.add('vcs')
            return (labels, 'infra/git-infrastructure', False)

        # The file name is the last path component, so scanning the
        # components also covers it.
        for category, keywords in self.category_keywords.items():
            if any(kw in part for kw in keywords for part in parts_lower):
                labels.add(category)
                primary_category = category
                break

        # A media suffix overrides any keyword-derived primary category.
        for media_type, extensions in self.media_extensions.items():
            if suffix_lower in extensions:
                labels.update((media_type, 'media'))
                primary_category = f'user/{media_type}'
                break

        if suffix_lower in self.code_extensions:
            labels.add('code')
            # Code only decides the category when nothing else has.
            if primary_category == 'misc':
                primary_category = 'dev'

        if size > 100 * 1024 * 1024:
            labels.add('large-file')

        if any(kw in name_lower for kw in ('test', 'spec', 'mock')):
            labels.add('test')

        if any(kw in name_lower for kw in ('config', 'settings', 'env')):
            labels.add('config')

        return (labels, primary_category, False)

    def suggest_target_path(self, source_path: str, category: str, labels: Set[str]) -> str:
        """Suggest a '/'-joined relative destination for *source_path*.

        Args:
            source_path: The original path of the item.
            category: Primary category, e.g. as returned by ``classify_path``.
            labels: Labels attached to the item; ``'build-artifact'`` routes
                the item under ``trash/build-artifacts/``.

        Returns:
            A relative target path. For absolute inputs the drive/root is
            dropped so the result never contains an empty path segment;
            relative inputs are embedded unchanged in the trash and fallback
            cases.
        """
        p = Path(source_path)
        # Strip the anchor (root and/or drive): keeping it would inject '/'
        # as a path component and yield targets such as 'dev///home/...'.
        rel_parts = p.parts[1:] if p.anchor else p.parts
        relative_source = '/'.join(rel_parts) if p.anchor else source_path

        if 'build-artifact' in labels:
            return f'trash/build-artifacts/{relative_source}'

        if category.startswith('artifacts/'):
            artifact_type = category.split('/')[-1]
            return f'artifacts/{artifact_type}/{p.name}'

        if category.startswith('user/'):
            media_type = category.split('/')[-1]
            return f'user/{media_type}/{p.name}'

        # Deep paths are rebuilt without build directories (exact,
        # case-sensitive match); shallow paths are kept as given.
        kept = [part for part in rel_parts if part not in self.build_patterns]
        if len(kept) > 3:
            return f"{category}/{'/'.join(kept)}"

        return f'{category}/{relative_source}'
|