Files
defrag/app/classification/classifier.py
2025-12-13 03:56:14 +01:00

125 lines
4.9 KiB
Python

from pathlib import Path
from typing import List, Set, Dict, Tuple
import re
class FileClassifier:
    """Heuristically classify file paths into categories and labels.

    Classification looks only at the path string (directory names, file
    name, extension) and an optional size; the file system is never touched.
    """

    def __init__(self):
        """Set up the lookup tables used by the classifier."""
        # Directory names that mark a path as build output / installed
        # dependencies. Compared against lowercased path components.
        self.build_patterns: Set[str] = {
            'node_modules', '__pycache__', '.pytest_cache', 'target', 'build', 'dist',
            '.gradle', 'bin', 'obj', '.next', '.nuxt', 'vendor', '.venv', 'venv',
            'site-packages', 'bower_components', 'jspm_packages'
        }
        # Per-ecosystem markers (file suffixes or path component names) used
        # to pick an ``artifacts/<type>`` category for build artifacts.
        self.artifact_patterns: Dict[str, Set[str]] = {
            'java': {'.jar', '.war', '.ear', '.class'},
            'python': {'.pyc', '.pyo', '.whl', '.egg'},
            'node': {'node_modules'},
            'go': {'vendor', 'pkg'},
            'rust': {'target'},
            'docker': {'.dockerignore', 'Dockerfile'}
        }
        # Keywords matched as substrings of lowercased path components.
        # Dict order matters: the first category with a hit wins.
        self.category_keywords: Dict[str, Set[str]] = {
            'apps': {'app', 'application', 'service', 'api', 'server', 'client'},
            'infra': {'infrastructure', 'devops', 'docker', 'kubernetes', 'terraform', 'ansible', 'gitea', 'jenkins'},
            'dev': {'project', 'workspace', 'repo', 'src', 'code', 'dev'},
            'cache': {'cache', 'temp', 'tmp', '.cache'},
            'databases': {'postgres', 'mysql', 'redis', 'mongo', 'db', 'database'},
            'backups': {'backup', 'bak', 'snapshot', 'archive'},
            'user': {'documents', 'pictures', 'videos', 'downloads', 'desktop', 'music'},
            'artifacts': {'build', 'dist', 'release', 'output'},
            'temp': {'tmp', 'temp', 'staging', 'processing'}
        }
        # File suffixes (lowercase, with leading dot) grouped by media type.
        self.media_extensions: Dict[str, Set[str]] = {
            'video': {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv', '.webm'},
            'audio': {'.mp3', '.flac', '.wav', '.ogg', '.m4a', '.aac'},
            'image': {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'},
            'document': {'.pdf', '.doc', '.docx', '.txt', '.md', '.odt'},
            'spreadsheet': {'.xls', '.xlsx', '.csv', '.ods'},
            'presentation': {'.ppt', '.pptx', '.odp'}
        }
        # File suffixes (lowercase, with leading dot) treated as source code.
        self.code_extensions: Set[str] = {
            '.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h',
            '.cs', '.rb', '.php', '.swift', '.kt', '.scala', '.clj', '.r'
        }

    def classify_path(self, path: str, size: int = 0) -> Tuple[Set[str], str, bool]:
        """Classify a single path.

        Args:
            path: File or directory path to classify (not accessed on disk).
            size: Size in bytes; values above 100 MiB add the
                ``'large-file'`` label.

        Returns:
            A ``(labels, primary_category, is_build_artifact)`` tuple.
            ``primary_category`` defaults to ``'misc'`` when nothing matches.
        """
        p = Path(path)
        labels: Set[str] = set()
        primary_category = 'misc'
        # Lowercase once; every comparison below is case-insensitive.
        parts_lower = [part.lower() for part in p.parts]
        name_lower = p.name.lower()
        suffix_lower = p.suffix.lower()

        is_build_artifact = any(part in self.build_patterns for part in parts_lower)
        if is_build_artifact:
            labels.add('build-artifact')
            for artifact_type, patterns in self.artifact_patterns.items():
                # Patterns are lowercased here so that mixed-case entries such
                # as 'Dockerfile' can match the lowercased path components.
                wanted = {pattern.lower() for pattern in patterns}
                if suffix_lower in wanted or not wanted.isdisjoint(parts_lower):
                    # No break: when several types match, the last one in
                    # dict order determines the category.
                    primary_category = f'artifacts/{artifact_type}'
                    labels.add('artifact')
            return labels, primary_category, is_build_artifact

        # Anything inside a '.git' directory is version-control data.
        if '.git' in p.parts:
            labels.add('vcs')
            primary_category = 'infra/git-infrastructure'
            return labels, primary_category, False

        # First category whose keyword is a substring of any path component.
        for category, keywords in self.category_keywords.items():
            if any(kw in part for kw in keywords for part in parts_lower):
                labels.add(category)
                primary_category = category
                break

        # A media extension overrides the keyword-based category.
        for media_type, extensions in self.media_extensions.items():
            if suffix_lower in extensions:
                labels.add(media_type)
                labels.add('media')
                primary_category = f'user/{media_type}'
                break

        if suffix_lower in self.code_extensions:
            labels.add('code')
            # Code only decides the category when nothing else did.
            if primary_category == 'misc':
                primary_category = 'dev'

        if size > 100 * 1024 * 1024:
            labels.add('large-file')
        if any(kw in name_lower for kw in ('test', 'spec', 'mock')):
            labels.add('test')
        if any(kw in name_lower for kw in ('config', 'settings', 'env')):
            labels.add('config')

        return labels, primary_category, is_build_artifact

    def suggest_target_path(self, source_path: str, category: str, labels: Set[str]) -> str:
        """Suggest a '/'-separated relative destination for ``source_path``.

        Args:
            source_path: Original path of the file.
            category: Primary category, e.g. as returned by ``classify_path``.
            labels: Labels, e.g. as returned by ``classify_path``.

        Returns:
            A relative target path. Build artifacts go under
            ``trash/build-artifacts/``; ``artifacts/*`` and ``user/*``
            categories keep only the file name; everything else is placed
            under ``<category>/`` with its directory structure.
        """
        p = Path(source_path)
        if p.anchor:
            # Drop the root/drive so absolute inputs do not inject empty or
            # root segments ('//') into the suggested relative path.
            relative_parts = list(p.parts[1:])
            relative_source = '/'.join(relative_parts)
        else:
            relative_parts = list(p.parts)
            relative_source = source_path

        if 'build-artifact' in labels:
            return f'trash/build-artifacts/{relative_source}'

        if category.startswith('artifacts/'):
            artifact_type = category.split('/')[-1]
            return f'artifacts/{artifact_type}/{p.name}'

        if category.startswith('user/'):
            media_type = category.split('/')[-1]
            return f'user/{media_type}/{p.name}'

        # Case-insensitive, matching how classify_path detects build dirs.
        kept = [part for part in relative_parts if part.lower() not in self.build_patterns]
        if len(kept) > 3:
            # Deep paths are rebuilt without any build-directory components.
            return f'{category}/{"/".join(kept)}'
        return f'{category}/{relative_source}'