app/main.py | 278
@@ -703,9 +703,252 @@ class DiskReorganizer:
            cursor.close()
            conn.close()

    def profile_content(self, disk: Optional[str] = None, update_db: bool = False, limit: Optional[int] = None):
        from content.profiler import ContentProfiler

        profiler = ContentProfiler()
        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            query = "SELECT path, size, disk_label FROM files WHERE 1=1"
            params = []
            if disk:
                query += " AND disk_label = %s"
                params.append(disk)
            if limit:
                query += f" LIMIT {limit}"

            cursor.execute(query, params)
            files = cursor.fetchall()
            total = len(files)
            logger.info(f"Profiling {total:,} files...")

            kind_stats = {}
            processable = 0
            batch = []

            for idx, (path, size, disk_label) in enumerate(files, 1):
                mount_point = disk_mount_map.get(disk_label, disk_label)
                full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)

                if not full_path.exists():
                    continue

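                # NOTE: profiler.profile_file() is assumed here (from how its result is
                # consumed below and in extract_content) to return a dict with 'kind',
                # 'processable' and 'extractor' keys, or an 'error' key on failure.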
                profile = profiler.profile_file(full_path)

                if 'error' not in profile:
                    kind = profile['kind']
                    if kind not in kind_stats:
                        kind_stats[kind] = {'count': 0, 'processable': 0}
                    kind_stats[kind]['count'] += 1
                    if profile['processable']:
                        kind_stats[kind]['processable'] += 1
                        processable += 1

                    if update_db:
                        profile_json = json.dumps(profile)
                        batch.append((kind, profile_json, path))

                        if len(batch) >= 500:
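                            # jsonb_set() merges the profile under metadata->'profile';
                            # COALESCE creates the metadata object when it is still NULL.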
                            cursor.executemany(
                                "UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s",
                                [(pj, p) for k, pj, p in batch]
                            )
                            conn.commit()
                            batch.clear()

                if idx % 100 == 0:
                    print(f"\rProfiled: {idx:,}/{total:,}", end='', flush=True)

            if update_db and batch:
                cursor.executemany(
                    "UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s",
                    [(pj, p) for k, pj, p in batch]
                )
                conn.commit()

            print()
            print(f"\n=== CONTENT PROFILE SUMMARY ===")
            print(f"Total files: {total:,}")
            print(f"Processable: {processable:,}\n")
            print(f"{'Kind':<15} {'Total':<10} {'Processable':<12} {'Extractor'}")
            print("-" * 60)
            for kind in sorted(kind_stats.keys()):
                stats = kind_stats[kind]
                extractor = profiler._suggest_extractor(kind, '')
                print(f"{kind:<15} {stats['count']:<10,} {stats['processable']:<12,} {extractor or 'none'}")

        finally:
            cursor.close()
            conn.close()

    def extract_content(self, kind: Optional[str] = None, limit: int = 10):
        from content.profiler import ContentProfiler
        from content.extractors import ContentExtractor

        profiler = ContentProfiler()
        extractor = ContentExtractor()
        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
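            # Only rows already profiled via `profile --update` carry metadata->'profile';
            # ->> returns text, hence the comparison against the string 'true'.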
            query = "SELECT path, size, disk_label, metadata FROM files WHERE metadata->'profile'->>'processable' = 'true'"
            params = []
            if kind:
                query += " AND metadata->'profile'->>'kind' = %s"
                params.append(kind)
            query += f" LIMIT {limit}"

            cursor.execute(query, params)
            files = cursor.fetchall()

            print(f"\n=== EXTRACTING CONTENT ===")
            print(f"Processing {len(files)} files\n")

            for path, size, disk_label, metadata in files:
                mount_point = disk_mount_map.get(disk_label, disk_label)
                full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)

                if not full_path.exists():
                    continue

                profile = metadata.get('profile', {}) if metadata else {}
                extractor_type = profile.get('extractor')

                if not extractor_type:
                    continue

                print(f"Extracting: {path}")
                print(f" Type: {profile.get('kind')} | Extractor: {extractor_type}")

                result = extractor.extract(full_path, extractor_type)

                if 'text' in result:
                    preview = result['text'][:200]
                    print(f" Preview: {preview}...")
                elif 'pipeline' in result:
                    print(f" Pipeline: {' → '.join(result['pipeline'])}")
                    print(f" Status: {result.get('status', 'pending')}")

                print()

        finally:
            cursor.close()
            conn.close()

    def classify_files(self, disk: Optional[str] = None, update_db: bool = False):
        from classification.classifier import FileClassifier

        classifier = FileClassifier()
        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            if disk:
                cursor.execute("SELECT path, size, disk_label FROM files WHERE disk_label = %s", (disk,))
            else:
                cursor.execute("SELECT path, size, disk_label FROM files")

            files = cursor.fetchall()
            total = len(files)
            logger.info(f"Classifying {total:,} files...")

            categories = {}
            build_artifacts = 0
            batch = []

            for idx, (path, size, disk_label) in enumerate(files, 1):
                labels, category, is_build = classifier.classify_path(path, int(size))

                if is_build:
                    build_artifacts += 1

                if category not in categories:
                    categories[category] = {'count': 0, 'size': 0}
                categories[category]['count'] += 1
                categories[category]['size'] += int(size)

                if update_db:
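                    # The joined labels ride along in the batch, but note that the
                    # UPDATE below only persists the category column.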
                    labels_str = ','.join(labels)
                    batch.append((category, labels_str, path))

                    if len(batch) >= 1000:
                        cursor.executemany("UPDATE files SET category = %s WHERE path = %s", [(cat, p) for cat, lbl, p in batch])
                        conn.commit()
                        batch.clear()

                if idx % 1000 == 0:
                    print(f"\rClassified: {idx:,}/{total:,}", end='', flush=True)

            if update_db and batch:
                cursor.executemany("UPDATE files SET category = %s WHERE path = %s", [(cat, p) for cat, lbl, p in batch])
                conn.commit()

            print()
            print(f"\n=== CLASSIFICATION SUMMARY ===")
            print(f"Total files: {total:,}")
            print(f"Build artifacts: {build_artifacts:,}")
            print(f"\nCategories:")
            for category in sorted(categories.keys()):
                info = categories[category]
                print(f" {category:30}: {info['count']:8,} files, {self.format_size(info['size'])}")

        finally:
            cursor.close()
            conn.close()

    def review_migration(self, category: Optional[str] = None, show_build: bool = False):
        from classification.classifier import FileClassifier

        classifier = FileClassifier()
        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            query = "SELECT path, size, category FROM files WHERE 1=1"
            params = []

            if category:
                query += " AND category = %s"
                params.append(category)

            if not show_build:
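                # A literal % is doubled to %% so psycopg-style drivers do not read it
                # as a parameter placeholder when params are passed with the query.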
                query += " AND (metadata->>'labels' IS NULL OR metadata->>'labels' NOT LIKE '%%build-artifact%%')"

            query += " ORDER BY category, size DESC LIMIT 100"

            cursor.execute(query, params)
            files = cursor.fetchall()

            if not files:
                print("No files found matching criteria")
                return

            print(f"\n=== MIGRATION PREVIEW ===")
            print(f"Showing {len(files)} files\n")

            current_category = None
            for path, size, cat in files:
                if cat != current_category:
                    current_category = cat
                    print(f"\n{cat}:")

                labels, suggested_cat, is_build = classifier.classify_path(path, int(size))
                target = classifier.suggest_target_path(path, suggested_cat, labels)
                print(f" {path}")
                print(f" → {target} ({self.format_size(int(size))})")

        finally:
            cursor.close()
            conn.close()

    @staticmethod
    def format_size(size: int) -> str:
        """Format bytes to human readable string"""
        for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
            if size < 1024:
                return f"{size:.1f}{unit}"
@@ -744,6 +987,27 @@ def main():
    merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
    merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')

    # Profile command
    profile_parser = subparsers.add_parser('profile', help='Create content profiles (inventory + triage)')
    profile_parser.add_argument('--disk', help='Profile specific disk')
    profile_parser.add_argument('--update', action='store_true', help='Update database with profiles')
    profile_parser.add_argument('--limit', type=int, help='Limit number of files')

    # Extract command
    extract_parser = subparsers.add_parser('extract', help='Extract content from files')
    extract_parser.add_argument('--kind', help='Extract specific kind (pdf, image, audio, video)')
    extract_parser.add_argument('--limit', type=int, default=10, help='Limit extraction batch')

    # Classify command
    classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization')
    classify_parser.add_argument('--disk', help='Classify specific disk')
    classify_parser.add_argument('--update', action='store_true', help='Update database with classifications')

    # Review command
    review_parser = subparsers.add_parser('review', help='Review proposed migration structure')
    review_parser.add_argument('--category', help='Review specific category')
    review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts')

    # Report command
    report_parser = subparsers.add_parser('report', help='Show current status')
    report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
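
    # Example invocations of the new subcommands (illustrative only; the category value
    # is a placeholder and the entry point is assumed to be run as `python app/main.py`):
    #   python app/main.py profile --disk SMT --update --limit 1000
    #   python app/main.py extract --kind pdf --limit 10
    #   python app/main.py classify --disk DISK1 --update
    #   python app/main.py review --category documents --show-build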
@@ -772,6 +1036,18 @@ def main():
    elif args.command == 'execute':
        tool.execute_migration(args.plan_file, dry_run=args.dry_run)

    elif args.command == 'profile':
        tool.profile_content(disk=args.disk, update_db=args.update, limit=args.limit)

    elif args.command == 'extract':
        tool.extract_content(kind=args.kind, limit=args.limit)

    elif args.command == 'classify':
        tool.classify_files(disk=args.disk, update_db=args.update)

    elif args.command == 'review':
        tool.review_migration(category=args.category, show_build=args.show_build)

    elif args.command == 'report':
        tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)