app/classification/__init__.py
@@ -1,17 +1,3 @@
 """Classification package exports"""
-from .rules import RuleBasedClassifier
-from .ml import create_ml_classifier, train_from_database, MLClassifier, DummyMLClassifier
-from .engine import ClassificationEngine
-from ._protocols import ClassificationRule, IClassifier, IRuleEngine
+from .classifier import FileClassifier
-
-__all__ = [
-    'RuleBasedClassifier',
-    'MLClassifier',
-    'DummyMLClassifier',
-    'create_ml_classifier',
-    'train_from_database',
-    'ClassificationEngine',
-    'ClassificationRule',
-    'IClassifier',
-    'IRuleEngine',
-]
+__all__ = ['FileClassifier']
app/classification/classifier.py (Normal file, 124 lines)
@@ -0,0 +1,124 @@
from pathlib import Path
from typing import List, Set, Dict, Tuple
import re

class FileClassifier:
    def __init__(self):
        self.build_patterns = {
            'node_modules', '__pycache__', '.pytest_cache', 'target', 'build', 'dist',
            '.gradle', 'bin', 'obj', '.next', '.nuxt', 'vendor', '.venv', 'venv',
            'site-packages', 'bower_components', 'jspm_packages'
        }

        self.artifact_patterns = {
            'java': {'.jar', '.war', '.ear', '.class'},
            'python': {'.pyc', '.pyo', '.whl', '.egg'},
            'node': {'node_modules'},
            'go': {'vendor', 'pkg'},
            'rust': {'target'},
            'docker': {'.dockerignore', 'Dockerfile'}
        }

        self.category_keywords = {
            'apps': {'app', 'application', 'service', 'api', 'server', 'client'},
            'infra': {'infrastructure', 'devops', 'docker', 'kubernetes', 'terraform', 'ansible', 'gitea', 'jenkins'},
            'dev': {'project', 'workspace', 'repo', 'src', 'code', 'dev'},
            'cache': {'cache', 'temp', 'tmp', '.cache'},
            'databases': {'postgres', 'mysql', 'redis', 'mongo', 'db', 'database'},
            'backups': {'backup', 'bak', 'snapshot', 'archive'},
            'user': {'documents', 'pictures', 'videos', 'downloads', 'desktop', 'music'},
            'artifacts': {'build', 'dist', 'release', 'output'},
            'temp': {'tmp', 'temp', 'staging', 'processing'}
        }

        self.media_extensions = {
            'video': {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv', '.webm'},
            'audio': {'.mp3', '.flac', '.wav', '.ogg', '.m4a', '.aac'},
            'image': {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'},
            'document': {'.pdf', '.doc', '.docx', '.txt', '.md', '.odt'},
            'spreadsheet': {'.xls', '.xlsx', '.csv', '.ods'},
            'presentation': {'.ppt', '.pptx', '.odp'}
        }

        self.code_extensions = {
            '.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h',
            '.cs', '.rb', '.php', '.swift', '.kt', '.scala', '.clj', '.r'
        }

    def classify_path(self, path: str, size: int = 0) -> Tuple[Set[str], str, bool]:
        p = Path(path)
        labels = set()
        primary_category = 'misc'
        is_build_artifact = False

        parts = p.parts
        name_lower = p.name.lower()

        for part in parts:
            part_lower = part.lower()
            if part_lower in self.build_patterns:
                is_build_artifact = True
                labels.add('build-artifact')
                break

        if is_build_artifact:
            for artifact_type, patterns in self.artifact_patterns.items():
                if any(part.lower() in patterns for part in parts) or p.suffix in patterns:
                    primary_category = f'artifacts/{artifact_type}'
                    labels.add('artifact')
                    return labels, primary_category, is_build_artifact

        if '.git' in parts:
            labels.add('vcs')
            primary_category = 'infra/git-infrastructure'
            return labels, primary_category, False

        for category, keywords in self.category_keywords.items():
            if any(kw in name_lower or any(kw in part.lower() for part in parts) for kw in keywords):
                labels.add(category)
                primary_category = category
                break

        for media_type, extensions in self.media_extensions.items():
            if p.suffix.lower() in extensions:
                labels.add(media_type)
                labels.add('media')
                primary_category = f'user/{media_type}'
                break

        if p.suffix.lower() in self.code_extensions:
            labels.add('code')
            if primary_category == 'misc':
                primary_category = 'dev'

        if size > 100 * 1024 * 1024:
            labels.add('large-file')

        if any(kw in name_lower for kw in ['test', 'spec', 'mock']):
            labels.add('test')

        if any(kw in name_lower for kw in ['config', 'settings', 'env']):
            labels.add('config')

        return labels, primary_category, is_build_artifact

    def suggest_target_path(self, source_path: str, category: str, labels: Set[str]) -> str:
        p = Path(source_path)

        if 'build-artifact' in labels:
            return f'trash/build-artifacts/{source_path}'

        if category.startswith('artifacts/'):
            artifact_type = category.split('/')[-1]
            return f'artifacts/{artifact_type}/{p.name}'

        if category.startswith('user/'):
            media_type = category.split('/')[-1]
            return f'user/{media_type}/{p.name}'

        parts = [part for part in p.parts if part not in self.build_patterns]
        if len(parts) > 3:
            project_name = parts[0] if parts else 'misc'
            return f'{category}/{project_name}/{"/".join(parts[1:])}'

        return f'{category}/{source_path}'
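A minimal usage sketch of the new FileClassifier (not part of the commit). The import mirrors how app/main.py imports it, so it assumes the app directory is on sys.path; the sample paths are hypothetical.

# Illustrative usage only; sample paths are made up
from classification.classifier import FileClassifier

classifier = FileClassifier()

# A path under node_modules is flagged as a build artifact and routed to artifacts/node
labels, category, is_build = classifier.classify_path('projects/webapp/node_modules/lodash/index.js')
# labels include 'build-artifact'; is_build is True

# A large video is labelled as media and suggested a user/video target
labels, category, is_build = classifier.classify_path('camera/holiday.mp4', size=2 * 1024 ** 3)
target = classifier.suggest_target_path('camera/holiday.mp4', category, labels)
# category == 'user/video', target == 'user/video/holiday.mp4'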
app/content/extractors.py (Normal file, 104 lines)
@@ -0,0 +1,104 @@
from pathlib import Path
from typing import Dict, Optional
import json

class ContentExtractor:
    def __init__(self):
        self.extractors = {
            'pdf_text': self._extract_pdf,
            'ocr+caption': self._extract_image,
            'transcribe': self._extract_audio,
            'transcribe+scenes': self._extract_video,
            'office_text': self._extract_document,
            'read': self._extract_text,
            'read+syntax': self._extract_code
        }

    def extract(self, file_path: Path, extractor_type: str) -> Dict:
        extractor = self.extractors.get(extractor_type)
        if not extractor:
            return {'error': f'Unknown extractor: {extractor_type}'}

        try:
            return extractor(file_path)
        except Exception as e:
            return {'error': str(e)}

    def _extract_text(self, file_path: Path) -> Dict:
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read(1024 * 1024)
            return {
                'text': content,
                'char_count': len(content),
                'needs_llm': False
            }
        except Exception as e:
            return {'error': str(e)}

    def _extract_code(self, file_path: Path) -> Dict:
        result = self._extract_text(file_path)
        if 'error' not in result:
            result['type'] = 'code'
            result['needs_llm'] = True
        return result

    def _extract_pdf(self, file_path: Path) -> Dict:
        try:
            import PyPDF2
            text_parts = []
            with open(file_path, 'rb') as f:
                pdf = PyPDF2.PdfReader(f)
                for page in pdf.pages[:10]:
                    text_parts.append(page.extract_text())

            text = '\n'.join(text_parts)
            return {
                'text': text,
                'pages_extracted': len(text_parts),
                'needs_llm': len(text.strip()) > 100,
                'type': 'document'
            }
        except Exception as e:
            return {'error': str(e), 'needs_ocr': True}

    def _extract_image(self, file_path: Path) -> Dict:
        return {
            'type': 'image',
            'needs_ocr': True,
            'needs_caption': True,
            'needs_llm': True,
            'pipeline': ['ocr', 'caption', 'embedding'],
            'status': 'pending'
        }

    def _extract_audio(self, file_path: Path) -> Dict:
        return {
            'type': 'audio',
            'needs_transcription': True,
            'needs_llm': True,
            'pipeline': ['transcribe', 'summarize'],
            'status': 'pending'
        }

    def _extract_video(self, file_path: Path) -> Dict:
        return {
            'type': 'video',
            'needs_transcription': True,
            'needs_scene_detection': True,
            'needs_llm': True,
            'pipeline': ['transcribe', 'scenes', 'summarize'],
            'status': 'pending'
        }

    def _extract_document(self, file_path: Path) -> Dict:
        try:
            import textract
            text = textract.process(str(file_path)).decode('utf-8')
            return {
                'text': text,
                'type': 'document',
                'needs_llm': len(text.strip()) > 100
            }
        except:
            return {'error': 'textract failed', 'needs_llm': True}
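A minimal usage sketch of ContentExtractor (not part of the commit); import path and sample files are illustrative.

# Illustrative usage only; sample files are made up
from pathlib import Path
from content.extractors import ContentExtractor

extractor = ContentExtractor()

# Text kinds return the content directly
result = extractor.extract(Path('notes/meeting.txt'), 'read')
if 'error' not in result:
    print(result['char_count'], result['needs_llm'])

# Media kinds return a deferred pipeline description instead of text
result = extractor.extract(Path('recordings/call.mp3'), 'transcribe')
# result['pipeline'] == ['transcribe', 'summarize'], result['status'] == 'pending'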
app/content/profiler.py (Normal file, 155 lines)
@@ -0,0 +1,155 @@
from pathlib import Path
from typing import Dict, Optional, Tuple
import mimetypes
import magic
import json
from datetime import datetime

class ContentProfiler:
    def __init__(self):
        self.mime_detector = magic.Magic(mime=True)

        self.kind_mapping = {
            'text': ['text/plain', 'text/html', 'text/css', 'text/javascript', 'text/markdown'],
            'code': ['application/x-python', 'application/javascript', 'text/x-java', 'text/x-c'],
            'pdf': ['application/pdf'],
            'image': ['image/jpeg', 'image/png', 'image/gif', 'image/webp', 'image/svg+xml'],
            'audio': ['audio/mpeg', 'audio/wav', 'audio/ogg', 'audio/flac'],
            'video': ['video/mp4', 'video/x-matroska', 'video/avi', 'video/webm'],
            'archive': ['application/zip', 'application/x-tar', 'application/gzip', 'application/x-7z-compressed'],
            'document': ['application/msword', 'application/vnd.openxmlformats-officedocument'],
            'spreadsheet': ['application/vnd.ms-excel', 'text/csv']
        }

        self.text_exts = {'.txt', '.md', '.rst', '.log', '.json', '.xml', '.yaml', '.yml', '.toml', '.ini', '.cfg'}
        self.code_exts = {'.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h', '.cs', '.rb', '.php'}
        self.processable_kinds = {'text', 'code', 'pdf', 'image', 'audio', 'video', 'document'}

    def profile_file(self, file_path: Path) -> Dict:
        try:
            stat = file_path.stat()
            size = stat.st_size
            mtime = datetime.fromtimestamp(stat.st_mtime)

            mime_type = self._detect_mime(file_path)
            kind = self._determine_kind(file_path, mime_type)

            profile = {
                'path': str(file_path),
                'size': size,
                'mtime': mtime.isoformat(),
                'mime': mime_type,
                'kind': kind,
                'processable': kind in self.processable_kinds,
                'extractor': self._suggest_extractor(kind, mime_type),
                'hints': self._extract_hints(file_path, kind, mime_type, size)
            }

            return profile

        except Exception as e:
            return {
                'path': str(file_path),
                'error': str(e),
                'processable': False
            }

    def _detect_mime(self, file_path: Path) -> str:
        try:
            return self.mime_detector.from_file(str(file_path))
        except:
            guess = mimetypes.guess_type(str(file_path))[0]
            return guess or 'application/octet-stream'

    def _determine_kind(self, file_path: Path, mime_type: str) -> str:
        for kind, mimes in self.kind_mapping.items():
            if any(mime in mime_type for mime in mimes):
                return kind

        suffix = file_path.suffix.lower()
        if suffix in self.text_exts:
            return 'text'
        if suffix in self.code_exts:
            return 'code'

        return 'unknown'

    def _suggest_extractor(self, kind: str, mime_type: str) -> Optional[str]:
        extractors = {
            'pdf': 'pdf_text',
            'image': 'ocr+caption',
            'audio': 'transcribe',
            'video': 'transcribe+scenes',
            'document': 'office_text',
            'text': 'read',
            'code': 'read+syntax'
        }
        return extractors.get(kind)

    def _extract_hints(self, file_path: Path, kind: str, mime_type: str, size: int) -> Dict:
        hints = {}

        if kind == 'text' or kind == 'code':
            hints['language'] = self._guess_language(file_path)
            if size < 1024 * 1024:
                hints['lines'] = self._count_lines(file_path)

        if kind == 'pdf':
            hints['page_count'] = self._get_pdf_pages(file_path)

        if kind in ['audio', 'video']:
            hints['duration'] = self._get_media_duration(file_path)

        if kind == 'image':
            hints['has_exif'] = self._has_exif(file_path)
            hints['dimensions'] = self._get_image_dimensions(file_path)

        return hints

    def _guess_language(self, file_path: Path) -> Optional[str]:
        lang_map = {
            '.py': 'python', '.js': 'javascript', '.ts': 'typescript',
            '.java': 'java', '.go': 'go', '.rs': 'rust', '.c': 'c',
            '.cpp': 'cpp', '.cs': 'csharp', '.rb': 'ruby', '.php': 'php'
        }
        return lang_map.get(file_path.suffix.lower())

    def _count_lines(self, file_path: Path) -> Optional[int]:
        try:
            with open(file_path, 'rb') as f:
                return sum(1 for _ in f)
        except:
            return None

    def _get_pdf_pages(self, file_path: Path) -> Optional[int]:
        try:
            import PyPDF2
            with open(file_path, 'rb') as f:
                pdf = PyPDF2.PdfReader(f)
                return len(pdf.pages)
        except:
            return None

    def _get_media_duration(self, file_path: Path) -> Optional[float]:
        try:
            import ffmpeg
            probe = ffmpeg.probe(str(file_path))
            return float(probe['format']['duration'])
        except:
            return None

    def _has_exif(self, file_path: Path) -> bool:
        try:
            from PIL import Image
            img = Image.open(file_path)
            return hasattr(img, '_getexif') and img._getexif() is not None
        except:
            return False

    def _get_image_dimensions(self, file_path: Path) -> Optional[Tuple[int, int]]:
        try:
            from PIL import Image
            with Image.open(file_path) as img:
                return img.size
        except:
            return None
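A minimal usage sketch of ContentProfiler (not part of the commit). It assumes the python-magic package is installed for the `magic` import above; the sample path is hypothetical.

# Illustrative usage only; sample path is made up
from pathlib import Path
from content.profiler import ContentProfiler

profiler = ContentProfiler()
profile = profiler.profile_file(Path('/media/mike/SMT/docs/report.pdf'))
if profile.get('processable'):
    # e.g. kind == 'pdf', extractor == 'pdf_text', hints may include 'page_count'
    print(profile['kind'], profile['extractor'], profile['hints'])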
app/main.py (278 lines changed)
@@ -703,9 +703,252 @@ class DiskReorganizer:
            cursor.close()
            conn.close()

    def profile_content(self, disk: Optional[str] = None, update_db: bool = False, limit: Optional[int] = None):
        from content.profiler import ContentProfiler

        profiler = ContentProfiler()
        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            query = "SELECT path, size, disk_label FROM files WHERE 1=1"
            params = []
            if disk:
                query += " AND disk_label = %s"
                params.append(disk)
            if limit:
                query += f" LIMIT {limit}"

            cursor.execute(query, params)
            files = cursor.fetchall()
            total = len(files)
            logger.info(f"Profiling {total:,} files...")

            kind_stats = {}
            processable = 0
            batch = []

            for idx, (path, size, disk_label) in enumerate(files, 1):
                mount_point = disk_mount_map.get(disk_label, disk_label)
                full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)

                if not full_path.exists():
                    continue

                profile = profiler.profile_file(full_path)

                if 'error' not in profile:
                    kind = profile['kind']
                    if kind not in kind_stats:
                        kind_stats[kind] = {'count': 0, 'processable': 0}
                    kind_stats[kind]['count'] += 1
                    if profile['processable']:
                        kind_stats[kind]['processable'] += 1
                        processable += 1

                if update_db:
                    profile_json = json.dumps(profile)
                    batch.append((kind, profile_json, path))

                    if len(batch) >= 500:
                        cursor.executemany(
                            "UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s",
                            [(pj, p) for k, pj, p in batch]
                        )
                        conn.commit()
                        batch.clear()

                if idx % 100 == 0:
                    print(f"\rProfiled: {idx:,}/{total:,}", end='', flush=True)

            if update_db and batch:
                cursor.executemany(
                    "UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s",
                    [(pj, p) for k, pj, p in batch]
                )
                conn.commit()

            print()
            print(f"\n=== CONTENT PROFILE SUMMARY ===")
            print(f"Total files: {total:,}")
            print(f"Processable: {processable:,}\n")
            print(f"{'Kind':<15} {'Total':<10} {'Processable':<12} {'Extractor'}")
            print("-" * 60)
            for kind in sorted(kind_stats.keys()):
                stats = kind_stats[kind]
                extractor = profiler._suggest_extractor(kind, '')
                print(f"{kind:<15} {stats['count']:<10,} {stats['processable']:<12,} {extractor or 'none'}")

        finally:
            cursor.close()
            conn.close()

    def extract_content(self, kind: Optional[str] = None, limit: int = 10):
        from content.profiler import ContentProfiler
        from content.extractors import ContentExtractor

        profiler = ContentProfiler()
        extractor = ContentExtractor()
        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            query = "SELECT path, size, disk_label, metadata FROM files WHERE metadata->'profile'->>'processable' = 'true'"
            params = []
            if kind:
                query += " AND metadata->'profile'->>'kind' = %s"
                params.append(kind)
            query += f" LIMIT {limit}"

            cursor.execute(query, params)
            files = cursor.fetchall()

            print(f"\n=== EXTRACTING CONTENT ===")
            print(f"Processing {len(files)} files\n")

            for path, size, disk_label, metadata in files:
                mount_point = disk_mount_map.get(disk_label, disk_label)
                full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)

                if not full_path.exists():
                    continue

                profile = metadata.get('profile', {}) if metadata else {}
                extractor_type = profile.get('extractor')

                if not extractor_type:
                    continue

                print(f"Extracting: {path}")
                print(f" Type: {profile.get('kind')} | Extractor: {extractor_type}")

                result = extractor.extract(full_path, extractor_type)

                if 'text' in result:
                    preview = result['text'][:200]
                    print(f" Preview: {preview}...")
                elif 'pipeline' in result:
                    print(f" Pipeline: {' → '.join(result['pipeline'])}")
                    print(f" Status: {result.get('status', 'pending')}")

                print()

        finally:
            cursor.close()
            conn.close()

    def classify_files(self, disk: Optional[str] = None, update_db: bool = False):
        from classification.classifier import FileClassifier

        classifier = FileClassifier()
        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            if disk:
                cursor.execute("SELECT path, size, disk_label FROM files WHERE disk_label = %s", (disk,))
            else:
                cursor.execute("SELECT path, size, disk_label FROM files")

            files = cursor.fetchall()
            total = len(files)
            logger.info(f"Classifying {total:,} files...")

            categories = {}
            build_artifacts = 0
            batch = []

            for idx, (path, size, disk_label) in enumerate(files, 1):
                labels, category, is_build = classifier.classify_path(path, int(size))

                if is_build:
                    build_artifacts += 1

                if category not in categories:
                    categories[category] = {'count': 0, 'size': 0}
                categories[category]['count'] += 1
                categories[category]['size'] += int(size)

                if update_db:
                    labels_str = ','.join(labels)
                    batch.append((category, labels_str, path))

                    if len(batch) >= 1000:
                        cursor.executemany("UPDATE files SET category = %s WHERE path = %s", [(cat, p) for cat, lbl, p in batch])
                        conn.commit()
                        batch.clear()

                if idx % 1000 == 0:
                    print(f"\rClassified: {idx:,}/{total:,}", end='', flush=True)

            if update_db and batch:
                cursor.executemany("UPDATE files SET category = %s WHERE path = %s", [(cat, p) for cat, lbl, p in batch])
                conn.commit()

            print()
            print(f"\n=== CLASSIFICATION SUMMARY ===")
            print(f"Total files: {total:,}")
            print(f"Build artifacts: {build_artifacts:,}")
            print(f"\nCategories:")
            for category in sorted(categories.keys()):
                info = categories[category]
                print(f" {category:30}: {info['count']:8,} files, {self.format_size(info['size'])}")

        finally:
            cursor.close()
            conn.close()

    def review_migration(self, category: Optional[str] = None, show_build: bool = False):
        from classification.classifier import FileClassifier

        classifier = FileClassifier()
        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            query = "SELECT path, size, category FROM files WHERE 1=1"
            params = []

            if category:
                query += " AND category = %s"
                params.append(category)

            if not show_build:
                query += " AND (metadata->>'labels' IS NULL OR metadata->>'labels' NOT LIKE '%build-artifact%')"

            query += " ORDER BY category, size DESC LIMIT 100"

            cursor.execute(query, params)
            files = cursor.fetchall()

            if not files:
                print("No files found matching criteria")
                return

            print(f"\n=== MIGRATION PREVIEW ===")
            print(f"Showing {len(files)} files\n")

            current_category = None
            for path, size, cat in files:
                if cat != current_category:
                    current_category = cat
                    print(f"\n{cat}:")

                labels, suggested_cat, is_build = classifier.classify_path(path, int(size))
                target = classifier.suggest_target_path(path, suggested_cat, labels)
                print(f" {path}")
                print(f" → {target} ({self.format_size(int(size))})")

        finally:
            cursor.close()
            conn.close()

    @staticmethod
    def format_size(size: int) -> str:
        """Format bytes to human readable string"""
        for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
            if size < 1024:
                return f"{size:.1f}{unit}"
@@ -744,6 +987,27 @@ def main():
    merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
    merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')

    # Profile command
    profile_parser = subparsers.add_parser('profile', help='Create content profiles (inventory + triage)')
    profile_parser.add_argument('--disk', help='Profile specific disk')
    profile_parser.add_argument('--update', action='store_true', help='Update database with profiles')
    profile_parser.add_argument('--limit', type=int, help='Limit number of files')

    # Extract command
    extract_parser = subparsers.add_parser('extract', help='Extract content from files')
    extract_parser.add_argument('--kind', help='Extract specific kind (pdf, image, audio, video)')
    extract_parser.add_argument('--limit', type=int, default=10, help='Limit extraction batch')

    # Classify command
    classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization')
    classify_parser.add_argument('--disk', help='Classify specific disk')
    classify_parser.add_argument('--update', action='store_true', help='Update database with classifications')

    # Review command
    review_parser = subparsers.add_parser('review', help='Review proposed migration structure')
    review_parser.add_argument('--category', help='Review specific category')
    review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts')

    # Report command
    report_parser = subparsers.add_parser('report', help='Show current status')
    report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
@@ -772,6 +1036,18 @@ def main():
    elif args.command == 'execute':
        tool.execute_migration(args.plan_file, dry_run=args.dry_run)

    elif args.command == 'profile':
        tool.profile_content(disk=args.disk, update_db=args.update, limit=args.limit)

    elif args.command == 'extract':
        tool.extract_content(kind=args.kind, limit=args.limit)

    elif args.command == 'classify':
        tool.classify_files(disk=args.disk, update_db=args.update)

    elif args.command == 'review':
        tool.review_migration(category=args.category, show_build=args.show_build)

    elif args.command == 'report':
        tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)