import os
import sys
import shutil
import hashlib
import argparse
import json
import logging
import time
from dataclasses import dataclass
from pathlib import Path
from typing import List, Dict, Optional
from datetime import datetime

import psycopg2

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('disk_reorganizer.log'), logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)


@dataclass
class FileRecord:
    path: str
    size: int
    modified_time: float
    disk_label: str
    checksum: Optional[str] = None
    status: str = 'indexed'


class DiskReorganizer:

    def __init__(self, db_config: Optional[Dict] = None):
        if db_config is None:
            db_config = {
                'host': os.getenv('DB_HOST', '192.168.1.159'),
                'port': int(os.getenv('DB_PORT', 5432)),
                'database': os.getenv('DB_NAME', 'disk_reorganizer_db'),
                'user': os.getenv('DB_USER', 'disk_reorg_user'),
                'password': os.getenv('DB_PASSWORD', 'heel-goed-wachtwoord'),
            }
        self.db_config = db_config
        self.init_database()

    def get_connection(self):
        return psycopg2.connect(**self.db_config)

    def init_database(self):
        try:
            conn = self.get_connection()
            cursor = conn.cursor()
            cursor.execute("""
                SELECT table_name FROM information_schema.tables
                WHERE table_schema = 'public' AND table_name IN ('files', 'operations')
            """)
            tables = cursor.fetchall()
            if len(tables) < 2:
                logger.error('Database tables not found! Please run setup_database.sh first.')
                raise Exception('Database not properly initialized. Run setup_database.sh')
            cursor.close()
            conn.close()
            logger.info('Database connection verified successfully')
        except psycopg2.Error as e:
            logger.error(f'Database connection failed: {e}')
            raise

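    # The queries in this class assume roughly the following schema, created by
    # setup_database.sh. This is a minimal sketch inferred from the columns
    # referenced below, not the authoritative DDL:
    #
    #   files(path TEXT PRIMARY KEY, size BIGINT, modified_time TIMESTAMP,
    #         disk_label TEXT, checksum TEXT, status TEXT, duplicate_of TEXT,
    #         category TEXT, metadata JSONB, extracted_text TEXT,
    #         text_quality TEXT, enrichment JSONB)
    #   operations(source_path TEXT, target_path TEXT, operation_type TEXT,
    #              status TEXT, executed INT, executed_at TIMESTAMP,
    #              verified INT, error TEXT)
    #   processing_checkpoints(task_name TEXT PRIMARY KEY, last_processed_path TEXT,
    #                          processed_count INT, updated_at TIMESTAMP)
    #   folders(path TEXT PRIMARY KEY, disk_label TEXT, file_count INT,
    #           total_size BIGINT, project_type TEXT, intent TEXT, summary TEXT, ...)
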
    def index_disk(self, disk_root: str, disk_name: str):
        logger.info(f'Indexing disk: {disk_name} at {disk_root}')
        disk_path = Path(disk_root)
        if not disk_path.exists():
            logger.error(f'Disk path {disk_root} does not exist!')
            return
        files_count = 0
        total_size = 0
        start_time = time.time()
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            for root, dirs, files in os.walk(disk_path):
                dirs[:] = [d for d in dirs if not d.startswith(('$', 'System Volume Information', 'Recovery'))]
                for file in files:
                    try:
                        file_path = Path(root) / file
                        if not file_path.is_file():
                            continue
                        stat = file_path.stat()
                        size = stat.st_size
                        mtime = datetime.fromtimestamp(stat.st_mtime)
                        rel_path = str(file_path.relative_to(disk_path))
                        cursor.execute("""
                            INSERT INTO files (path, size, modified_time, disk_label, checksum, status)
                            VALUES (%s, %s, %s, %s, %s, %s)
                            ON CONFLICT (path) DO UPDATE SET
                                size = EXCLUDED.size,
                                modified_time = EXCLUDED.modified_time,
                                disk_label = EXCLUDED.disk_label,
                                status = EXCLUDED.status
                        """, (rel_path, size, mtime, disk_name, None, 'indexed'))
                        files_count += 1
                        total_size += size
                        if files_count % 100 == 0:
                            elapsed = time.time() - start_time
                            rate = files_count / elapsed if elapsed > 0 else 0
                            display_path = str(file_path)
                            if len(display_path) > 60:
                                display_path = '...' + display_path[-57:]
                            print(f'\rIndexing: {files_count:,} files | {self.format_size(total_size)} | {rate:.0f} files/s | {display_path}', end='', flush=True)
                        if files_count % 1000 == 0:
                            conn.commit()
                    except Exception as e:
                        conn.rollback()
                        logger.warning(f'\nSkipping {file_path}: {e}')
                        continue
            conn.commit()
            print()
            logger.info(f'Completed indexing {disk_name}: {files_count} files, {self.format_size(total_size)}')
        finally:
            cursor.close()
            conn.close()

    def calculate_disk_usage(self) -> Dict[str, Dict]:
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute("""
                SELECT disk_label, SUM(size) as total_size, COUNT(*) as file_count
                FROM files
                GROUP BY disk_label
            """)
            usage = {}
            for row in cursor.fetchall():
                disk = row[0]
                size = int(row[1] or 0)
                count = int(row[2])
                usage[disk] = {'size': size, 'count': count, 'formatted_size': self.format_size(size)}
            return usage
        finally:
            cursor.close()
            conn.close()

    def plan_migration(self, target_disk: str, destination_disks: List[str]) -> Dict:
        logger.info(f'Planning migration to free up {target_disk}')
        usage = self.calculate_disk_usage()
        if target_disk not in usage:
            logger.error(f'Target disk {target_disk} not found in index!')
            return {}
        conn = self.get_connection()
        cursor = conn.cursor()
        cursor.execute('SELECT path, size, modified_time FROM files WHERE disk_label = %s ORDER BY size DESC', (target_disk,))
        files_to_move = cursor.fetchall()
        cursor.close()
        conn.close()
        target_disk_usage = usage[target_disk]['size']
        logger.info(f'Need to move {len(files_to_move)} files, {self.format_size(target_disk_usage)}')
        # Destination capacity is not checked yet: every destination is treated as
        # having unlimited space, and files are distributed round-robin below.
        dest_availability = []
        for disk in destination_disks:
            available = float('inf')
            dest_availability.append({'disk': disk, 'available': available, 'planned_usage': 0})
        plan = {
            'target_disk': target_disk,
            'total_size': target_disk_usage,
            'file_count': len(files_to_move),
            'operations': [],
            'destination_disks': destination_disks,
        }
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            for file_info in files_to_move:
                rel_path, size, mtime = file_info
                dest_disk = destination_disks[len(plan['operations']) % len(destination_disks)]
                op = {
                    'source_disk': target_disk,
                    'source_path': rel_path,
                    'dest_disk': dest_disk,
                    'target_path': rel_path,
                    'size': int(size),
                }
                plan['operations'].append(op)
                cursor.execute(
                    'INSERT INTO operations (source_path, target_path, operation_type, status) VALUES (%s, %s, %s, %s)',
                    (f'{target_disk}:{rel_path}', f'{dest_disk}:{rel_path}', 'move', 'pending'))
            conn.commit()
        finally:
            cursor.close()
            conn.close()
        plan_file = f"migration_plan_{target_disk}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(plan_file, 'w') as f:
            json.dump(plan, f, indent=2)
        logger.info(f"Plan created with {len(plan['operations'])} operations")
        logger.info(f'Plan saved to {plan_file}')
        return plan

    def verify_operation(self, source: Path, dest: Path) -> bool:
        if not dest.exists():
            return False
        try:
            source_stat = source.stat()
            dest_stat = dest.stat()
            if source_stat.st_size != dest_stat.st_size:
                return False
            return True
        except Exception as e:
            logger.error(f'Verification error: {e}')
            return False

    @staticmethod
    def file_checksum(path: Path) -> str:
        hash_md5 = hashlib.md5()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(4096), b''):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def execute_migration(self, plan_file: str, dry_run: bool = True):
        logger.info(f"{('DRY RUN' if dry_run else 'EXECUTING')} migration from {plan_file}")
        with open(plan_file, 'r') as f:
            plan = json.load(f)
        operations = plan['operations']
        logger.info(f'Processing {len(operations)} operations...')
        success_count = 0
        error_count = 0
        start_time = time.time()
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            for i, op in enumerate(operations, 1):
                source_disk = op['source_disk']
                source_path = op['source_path']
                dest_disk = op['dest_disk']
                target_path = op['target_path']
                source_full = Path(source_disk) / source_path
                dest_full = Path(dest_disk) / target_path
                elapsed = time.time() - start_time
                rate = i / elapsed if elapsed > 0 else 0
                eta = (len(operations) - i) / rate if rate > 0 else 0
                display_path = str(source_path)
                if len(display_path) > 50:
                    display_path = '...' + display_path[-47:]
                print(f'\r[{i}/{len(operations)}] {success_count} OK, {error_count} ERR | {rate:.1f} files/s | ETA: {int(eta)}s | {display_path}', end='', flush=True)
                if dry_run:
                    if source_full.exists():
                        success_count += 1
                    else:
                        logger.warning(f'\n Source does not exist: {source_full}')
                        error_count += 1
                    continue
                try:
                    dest_full.parent.mkdir(parents=True, exist_ok=True)
                    if source_full.exists():
                        shutil.copy2(source_full, dest_full)
                        if self.verify_operation(source_full, dest_full):
                            cursor.execute(
                                "UPDATE files SET disk_label = %s, status = 'moved' WHERE path = %s AND disk_label = %s",
                                (dest_disk, source_path, source_disk))
                            cursor.execute(
                                'UPDATE operations SET executed = 1, executed_at = CURRENT_TIMESTAMP WHERE source_path = %s',
                                (f'{source_disk}:{source_path}',))
                            success_count += 1
                        else:
                            raise Exception('Verification failed')
                    else:
                        logger.warning(f'\n Source missing: {source_full}')
                        error_count += 1
                except Exception as e:
                    logger.error(f'\n Error processing {source_path}: {e}')
                    cursor.execute(
                        'UPDATE operations SET error = %s WHERE source_path = %s',
                        (str(e), f'{source_disk}:{source_path}'))
                    error_count += 1
                if i % 10 == 0:
                    conn.commit()
            conn.commit()
            print()
        finally:
            cursor.close()
            conn.close()
        logger.info(f'Migration complete: {success_count} success, {error_count} errors')
        if not dry_run and error_count == 0:
            logger.info(f"✓ Disk {plan['target_disk']} is ready for Linux installation!")
            logger.info(f" Remember to safely delete original files from {plan['target_disk']}")

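    # Typical migration flow (illustrative only; the disk labels and the generated
    # plan filename shown here are hypothetical):
    #
    #   tool = DiskReorganizer()
    #   plan = tool.plan_migration('OLD_DISK', ['DISK1', 'LLM'])
    #   tool.execute_migration('migration_plan_OLD_DISK_20250101_120000.json', dry_run=True)
    #   tool.execute_migration('migration_plan_OLD_DISK_20250101_120000.json', dry_run=False)
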
    def run_deduplication(self, disk: Optional[str] = None, use_chunks: bool = True):
        logger.info(f"Starting deduplication{(' for disk ' + disk if disk else '')}")
        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
        conn = self.get_connection()
        cursor = conn.cursor()

        def hash_file_local(file_path: Path) -> str:
            hasher = hashlib.sha256()
            with open(file_path, 'rb') as f:
                while (chunk := f.read(65536)):
                    hasher.update(chunk)
            return hasher.hexdigest()

        try:
            if disk:
                cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s AND checksum IS NULL ORDER BY size DESC', (disk,))
            else:
                cursor.execute('SELECT path, size, disk_label FROM files WHERE checksum IS NULL ORDER BY size DESC')
            files_to_process = cursor.fetchall()
            total = len(files_to_process)
            logger.info(f'Found {total} files to hash')
            processed = 0
            skipped = 0
            start_time = time.time()
            batch = []
            print('Phase 1: Computing checksums...')
            for idx, (path_str, size, disk_label) in enumerate(files_to_process, 1):
                try:
                    mount_point = disk_mount_map.get(disk_label, disk_label)
                    full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)
                    if not full_path.exists():
                        skipped += 1
                        if idx % 100 == 0:
                            elapsed = time.time() - start_time
                            rate = (processed + skipped) / elapsed if elapsed > 0 else 0
                            remaining = (total - idx) / rate if rate > 0 else 0
                            pct = 100 * idx / total
                            print(f'\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining / 60)}m{int(remaining % 60):02d}s | Skip: {skipped:,}', end='', flush=True)
                        continue
                    checksum = hash_file_local(full_path)
                    batch.append((checksum, path_str))
                    processed += 1
                    if len(batch) >= 1000:
                        try:
                            cursor.executemany('UPDATE files SET checksum = %s WHERE path = %s', batch)
                            conn.commit()
                            batch.clear()
                        except Exception as e:
                            conn.rollback()
                            batch.clear()
                            print(f'\nBatch update failed: {e}')
                    if idx % 100 == 0:
                        elapsed = time.time() - start_time
                        rate = (processed + skipped) / elapsed if elapsed > 0 else 0
                        remaining = (total - idx) / rate if rate > 0 else 0
                        pct = 100 * idx / total
                        print(f'\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining / 60)}m{int(remaining % 60):02d}s | Skip: {skipped:,}', end='', flush=True)
                except Exception as e:
                    skipped += 1
                    if idx <= 5:
                        print(f'\nDebug: {full_path} - {e}')
            if batch:
                try:
                    cursor.executemany('UPDATE files SET checksum = %s WHERE path = %s', batch)
                    conn.commit()
                except Exception as e:
                    conn.rollback()
                    print(f'\nFinal batch failed: {e}')
            print()
            elapsed = time.time() - start_time
            logger.info(f'Phase 1 done: {processed:,} files in {int(elapsed / 60)}m{int(elapsed % 60):02d}s ({skipped:,} skipped)')
            print('Phase 2: Finding duplicates...')
            cursor.execute("""
                UPDATE files f1 SET duplicate_of = (
                    SELECT MIN(path) FROM files f2
                    WHERE f2.checksum = f1.checksum AND f2.path < f1.path
                )
                WHERE checksum IS NOT NULL
            """)
            conn.commit()
            cursor.execute('SELECT COUNT(*) FROM files WHERE duplicate_of IS NOT NULL')
            dup_count = cursor.fetchone()[0]
            logger.info(f'Phase 2 done: Found {dup_count:,} duplicates')
        finally:
            cursor.close()
            conn.close()

    def plan_merge(self, sources: List[str], target: str, output_file: str, filter_system: bool = False, network_target: str = None):
        logger.info(f"Planning merge: {', '.join(sources)} → {target or network_target}")
        if filter_system:
            sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            from filters import GitignoreFilter
            file_filter = GitignoreFilter()
            logger.info('System/build file filtering enabled')
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            placeholders = ','.join(['%s'] * len(sources))
            cursor.execute(f"""
                SELECT path, size, checksum, disk_label, duplicate_of
                FROM files
                WHERE disk_label IN ({placeholders})
                ORDER BY size DESC
            """, tuple(sources))
            files = cursor.fetchall()
            total_files = len(files)
            total_size = sum((int(f[1]) for f in files))
            unique_files = {}
            duplicate_count = 0
            duplicate_size = 0
            filtered_count = 0
            filtered_size = 0
            for path, size, checksum, disk_label, duplicate_of in files:
                if filter_system and file_filter.should_exclude(path):
                    filtered_count += 1
                    filtered_size += int(size)
                    continue
                if checksum and checksum in unique_files:
                    duplicate_count += 1
                    duplicate_size += int(size)
                elif checksum:
                    unique_files[checksum] = (path, int(size), disk_label)
            unique_count = len(unique_files)
            unique_size = sum((f[1] for f in unique_files.values()))
            plan = {
                'sources': sources,
                'target': target or network_target,
                'network': network_target is not None,
                'total_files': total_files,
                'total_size': total_size,
                'unique_files': unique_count,
                'unique_size': unique_size,
                'duplicate_files': duplicate_count,
                'duplicate_size': duplicate_size,
                'filtered_files': filtered_count if filter_system else 0,
                'filtered_size': filtered_size if filter_system else 0,
                'space_saved': duplicate_size + (filtered_size if filter_system else 0),
                'operations': [],
            }
            for checksum, (path, size, disk_label) in unique_files.items():
                plan['operations'].append({
                    'source_disk': disk_label,
                    'source_path': path,
                    'target_disk': target or network_target,
                    'target_path': path,
                    'size': size,
                    'checksum': checksum,
                })
            with open(output_file, 'w') as f:
                json.dump(plan, f, indent=2)
            logger.info(f'Merge plan saved to {output_file}')
            print('\n=== MERGE PLAN SUMMARY ===')
            print(f"Sources: {', '.join(sources)}")
            print(f'Target: {target or network_target}')
            print(f'Total files: {total_files:,} ({self.format_size(total_size)})')
            if filter_system:
                print(f'Filtered (system/build): {filtered_count:,} ({self.format_size(filtered_size)})')
            print(f'Unique files: {unique_count:,} ({self.format_size(unique_size)})')
            print(f'Duplicates: {duplicate_count:,} ({self.format_size(duplicate_size)})')
            print(f"Total space saved: {self.format_size(plan['space_saved'])}")
            print(f'Space needed on target: {self.format_size(unique_size)}')
        finally:
            cursor.close()
            conn.close()

    def generate_report(self, format='text', show_duplicates=False, preview_merge=None):
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            if preview_merge:
                with open(preview_merge, 'r') as f:
                    plan = json.load(f)
                print('\n=== MERGE PLAN PREVIEW ===')
                print(f"Sources: {', '.join(plan['sources'])}")
                print(f"Target: {plan['target']}")
                print(f"Total files: {plan['total_files']:,} ({self.format_size(plan['total_size'])})")
                print(f"Unique files: {plan['unique_files']:,} ({self.format_size(plan['unique_size'])})")
                print(f"Duplicates: {plan['duplicate_files']:,} ({self.format_size(plan['duplicate_size'])})")
                print(f"Space saved: {self.format_size(plan['space_saved'])}")
                print(f"Space needed on target: {self.format_size(plan['unique_size'])}")
                return
            cursor.execute("""
                SELECT status, COUNT(*), SUM(size) FROM files GROUP BY status
            """)
            print('\n=== FILE MIGRATION REPORT ===')
            for row in cursor.fetchall():
                status, count, size = row
                print(f'{status:15}: {count:6} files, {self.format_size(int(size or 0))}')
            cursor.execute("""
                SELECT disk_label, COUNT(*), SUM(size) FROM files GROUP BY disk_label
            """)
            print('\n=== DISK USAGE ===')
            for row in cursor.fetchall():
                disk, count, size = row
                print(f'{disk:20}: {count:6} files, {self.format_size(int(size or 0))}')
            cursor.execute("""
                SELECT COUNT(*), SUM(size) FROM files WHERE checksum IS NOT NULL
            """)
            hashed_count, hashed_size = cursor.fetchone()
            cursor.execute("""
                SELECT COUNT(*), SUM(size) FROM files WHERE duplicate_of IS NOT NULL
            """)
            dup_count, dup_size = cursor.fetchone()
            print('\n=== DEDUPLICATION STATS ===')
            print(f'Files with checksums: {hashed_count or 0:6}')
            print(f'Duplicate files: {dup_count or 0:6} ({self.format_size(int(dup_size or 0))})')
            if show_duplicates and dup_count:
                print('\n=== DUPLICATE FILES ===')
                cursor.execute("""
                    SELECT path, size, duplicate_of FROM files
                    WHERE duplicate_of IS NOT NULL
                    ORDER BY size DESC
                    LIMIT 20
                """)
                for path, size, dup_of in cursor.fetchall():
                    print(f' {path} ({self.format_size(int(size))}) → {dup_of}')
            cursor.execute("""
                SELECT operation_type, executed, verified, COUNT(*) FROM operations GROUP BY operation_type, executed, verified
            """)
            print('\n=== OPERATIONS REPORT ===')
            for row in cursor.fetchall():
                op_type, executed, verified, count = row
                status = 'EXECUTED' if executed else 'PENDING'
                if verified:
                    status += '+VERIFIED'
                print(f'{op_type:10} {status:15}: {count} operations')
        finally:
            cursor.close()
            conn.close()

    def profile_content(self, disk: Optional[str] = None, update_db: bool = False, limit: Optional[int] = None):
        from content.profiler import ContentProfiler
        profiler = ContentProfiler()
        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            query = 'SELECT path, size, disk_label FROM files WHERE 1=1'
            params = []
            if disk:
                query += ' AND disk_label = %s'
                params.append(disk)
            if limit:
                query += f' LIMIT {limit}'
            cursor.execute(query, params)
            files = cursor.fetchall()
            total = len(files)
            logger.info(f'Profiling {total:,} files...')
            kind_stats = {}
            processable = 0
            batch = []
            for idx, (path, size, disk_label) in enumerate(files, 1):
                mount_point = disk_mount_map.get(disk_label, disk_label)
                full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
                if not full_path.exists():
                    continue
                profile = profiler.profile_file(full_path)
                if 'error' not in profile:
                    kind = profile['kind']
                    if kind not in kind_stats:
                        kind_stats[kind] = {'count': 0, 'processable': 0}
                    kind_stats[kind]['count'] += 1
                    if profile['processable']:
                        kind_stats[kind]['processable'] += 1
                        processable += 1
                    if update_db:
                        profile_json = json.dumps(profile)
                        batch.append((kind, profile_json, path))
                        if len(batch) >= 500:
                            cursor.executemany(
                                "UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s",
                                [(pj, p) for k, pj, p in batch])
                            conn.commit()
                            batch.clear()
                if idx % 100 == 0:
                    print(f'\rProfiled: {idx:,}/{total:,}', end='', flush=True)
            if update_db and batch:
                cursor.executemany(
                    "UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s",
                    [(pj, p) for k, pj, p in batch])
                conn.commit()
            print()
            print('\n=== CONTENT PROFILE SUMMARY ===')
            print(f'Total files: {total:,}')
            print(f'Processable: {processable:,}\n')
            print(f"{'Kind':<15} {'Total':<10} {'Processable':<12} {'Extractor'}")
            print('-' * 60)
            for kind in sorted(kind_stats.keys()):
                stats = kind_stats[kind]
                extractor = profiler._suggest_extractor(kind, '')
                print(f"{kind:<15} {stats['count']:<10,} {stats['processable']:<12,} {extractor or 'none'}")
        finally:
            cursor.close()
            conn.close()

    def extract_content(self, kind: Optional[str] = None, limit: int = 10):
        from content.profiler import ContentProfiler
        from content.extractors import ContentExtractor
        profiler = ContentProfiler()
        extractor = ContentExtractor()
        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            query = "SELECT path, size, disk_label, metadata FROM files WHERE metadata->'profile'->>'processable' = 'true'"
            params = []
            if kind:
                query += " AND metadata->'profile'->>'kind' = %s"
                params.append(kind)
            query += f' LIMIT {limit}'
            cursor.execute(query, params)
            files = cursor.fetchall()
            print('\n=== EXTRACTING CONTENT ===')
            print(f'Processing {len(files)} files\n')
            for path, size, disk_label, metadata in files:
                mount_point = disk_mount_map.get(disk_label, disk_label)
                full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
                if not full_path.exists():
                    continue
                profile = metadata.get('profile', {}) if metadata else {}
                extractor_type = profile.get('extractor')
                if not extractor_type:
                    continue
                print(f'Extracting: {path}')
                print(f" Type: {profile.get('kind')} | Extractor: {extractor_type}")
                result = extractor.extract(full_path, extractor_type)
                if 'text' in result:
                    preview = result['text'][:200]
                    print(f' Preview: {preview}...')
                elif 'pipeline' in result:
                    print(f" Pipeline: {' → '.join(result['pipeline'])}")
                    print(f" Status: {result.get('status', 'pending')}")
                print()
        finally:
            cursor.close()
            conn.close()

    def parse_files(self, kind: Optional[str] = None, limit: int = 100, update_db: bool = False):
        from parsers.text_parser import TextParser
        from parsers.code_parser import CodeParser
        from parsers.pdf_parser import PDFParser
        parsers = {'text': TextParser(), 'code': CodeParser(), 'pdf': PDFParser()}
        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            query = 'SELECT path, size, disk_label FROM files WHERE 1=1'
            if kind:
                suffix_map = {
                    'text': ['.txt', '.md', '.log', '.json', '.yaml', '.yml'],
                    'code': ['.py', '.js', '.java', '.go', '.rs', '.ts', '.cpp', '.h'],
                    'pdf': ['.pdf'],
                }
                if kind in suffix_map:
                    conditions = ' OR '.join([f"path LIKE '%{ext}'" for ext in suffix_map[kind]])
                    query += f' AND ({conditions})'
            query += f' LIMIT {limit}'
            cursor.execute(query)
            files = cursor.fetchall()
            print(f'\n=== PARSING FILES ===\nProcessing {len(files)} files\n')
            parsed_count = 0
            for path, size, disk_label in files:
                mount_point = disk_mount_map.get(disk_label, disk_label)
                full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
                if not full_path.exists() or int(size) > 10 * 1024 * 1024:
                    continue
                file_kind = 'pdf' if path.endswith('.pdf') else 'code' if any(path.endswith(e) for e in ['.py', '.js', '.java']) else 'text'
                parser = parsers.get(file_kind)
                if not parser:
                    continue
                result = parser.parse(full_path)
                if 'error' not in result:
                    text = result.get('text', '')
                    quality = result.get('quality', 'unknown')
                    print(f'{path[:60]} | {file_kind} | {len(text):,} chars')
                    if update_db and text:
                        cursor.execute(
                            'UPDATE files SET extracted_text = %s, text_quality = %s WHERE path = %s',
                            (text[:50000], quality, path))
                        parsed_count += 1
                        if parsed_count % 10 == 0:
                            conn.commit()
            if update_db:
                conn.commit()
            print(f'\nParsed {parsed_count} files')
        finally:
            cursor.close()
            conn.close()

    def enrich_files(self, limit: int = 10, use_llm: bool = False, use_local: bool = True, batch_size: int = 100):
        from enrichment.enricher import ContentEnricher
        from enrichment.llm_client import LLMClient
        llm_client = LLMClient(use_local=use_local) if use_llm else None
        enricher = ContentEnricher(llm_client=llm_client)
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL AND (enrichment IS NULL OR enrichment = '{{}}'::jsonb) LIMIT {limit}")
            files = cursor.fetchall()
            print('\n=== ENRICHING CONTENT ===')
            print(f'Processing {len(files)} files')
            if use_llm:
                print(f"Using LLM: {'Local OLLAMA' if use_local else 'Network LM_STUDIO'}\n")
            else:
                print('Using rule-based enrichment only\n')
            enriched_count = 0
            batch = []
            for idx, (path, text) in enumerate(files, 1):
                if not text:
                    continue
                enrichment = enricher.enrich(text[:5000], use_llm=use_llm)
                print(f'{idx}/{len(files)} {path[:60]}')
                print(f" Quality: {enrichment.get('quality')} | Words: {enrichment.get('word_count'):,}")
                if enrichment.get('security', {}).get('has_pii'):
                    print(f" PII: {list(enrichment.get('security', {}).get('pii_details', {}).keys())}")
                if enrichment.get('tech_stack'):
                    print(f" Tech: {', '.join(enrichment['tech_stack'][:5])}")
                if enrichment.get('topics'):
                    print(f" Topics: {', '.join(enrichment['topics'][:5])}")
                if use_llm and enrichment.get('llm_summary'):
                    print(f" LLM Summary: {enrichment['llm_summary'][:100]}...")
                if use_llm and enrichment.get('llm_intent'):
                    print(f" Intent: {enrichment['llm_intent'][:100]}...")
                print()
                batch.append((json.dumps(enrichment), path))
                enriched_count += 1
                if len(batch) >= batch_size:
                    cursor.executemany('UPDATE files SET enrichment = %s::jsonb WHERE path = %s', batch)
                    conn.commit()
                    batch.clear()
                    print(f' Committed batch ({enriched_count} files so far)')
            if batch:
                cursor.executemany('UPDATE files SET enrichment = %s::jsonb WHERE path = %s', batch)
                conn.commit()
            print(f'\nEnriched {enriched_count} files')
        finally:
            cursor.close()
            conn.close()

    def classify_files(self, disk: Optional[str] = None, update_db: bool = False, resume: bool = True):
        from classification.classifier import FileClassifier
        classifier = FileClassifier()
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            task_name = f"classify_{disk or 'all'}"
            skip_count = 0
            if resume and update_db:
                cursor.execute('SELECT last_processed_path, processed_count FROM processing_checkpoints WHERE task_name = %s', (task_name,))
                checkpoint = cursor.fetchone()
                if checkpoint:
                    last_path, skip_count = checkpoint
                    logger.info(f'Resuming from checkpoint: {skip_count:,} files already processed')
            if disk:
                cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s ORDER BY path', (disk,))
            else:
                cursor.execute('SELECT path, size, disk_label FROM files ORDER BY path')
            files = cursor.fetchall()
            total = len(files)
            logger.info(f'Classifying {total:,} files...')
            categories = {}
            build_artifacts = 0
            batch = []
            processed = 0
            for idx, (path, size, disk_label) in enumerate(files, 1):
                if idx <= skip_count:
                    continue
                labels, category, is_build = classifier.classify_path(path, int(size))
                if is_build:
                    build_artifacts += 1
                if category not in categories:
                    categories[category] = {'count': 0, 'size': 0}
                categories[category]['count'] += 1
                categories[category]['size'] += int(size)
                if update_db:
                    labels_str = ','.join(labels)
                    batch.append((category, labels_str, path))
                    if len(batch) >= 1000:
                        cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch])
                        cursor.execute("""
                            INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at)
                            VALUES (%s, %s, %s, CURRENT_TIMESTAMP)
                            ON CONFLICT (task_name) DO UPDATE SET
                                last_processed_path = EXCLUDED.last_processed_path,
                                processed_count = EXCLUDED.processed_count,
                                updated_at = CURRENT_TIMESTAMP
                        """, (task_name, path, idx))
                        conn.commit()
                        batch.clear()
                processed += 1
                if idx % 1000 == 0:
                    print(f'\rClassified: {idx:,}/{total:,} ({100 * idx / total:.1f}%)', end='', flush=True)
            if update_db and batch:
                cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch])
                cursor.execute("""
                    INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at)
                    VALUES (%s, %s, %s, CURRENT_TIMESTAMP)
                    ON CONFLICT (task_name) DO UPDATE SET
                        last_processed_path = EXCLUDED.last_processed_path,
                        processed_count = EXCLUDED.processed_count,
                        updated_at = CURRENT_TIMESTAMP
                """, (task_name, files[-1][0] if files else '', total))
                conn.commit()
            print()
            print('\n=== CLASSIFICATION SUMMARY ===')
            print(f'Total files: {total:,}')
            print(f'Build artifacts: {build_artifacts:,}')
            print('\nCategories:')
            for category in sorted(categories.keys()):
                info = categories[category]
                print(f" {category:30}: {info['count']:8,} files, {self.format_size(info['size'])}")
        finally:
            cursor.close()
            conn.close()

    def search_content(self, query: str, limit: int = 20, search_type: str = 'text'):
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            if search_type == 'text':
                cursor.execute("""
                    SELECT path, disk_label, size, category,
                           ts_rank(to_tsvector('english', COALESCE(extracted_text, '')), plainto_tsquery('english', %s)) as rank,
                           LEFT(extracted_text, 200) as snippet
                    FROM files
                    WHERE extracted_text IS NOT NULL
                      AND to_tsvector('english', extracted_text) @@ plainto_tsquery('english', %s)
                    ORDER BY rank DESC
                    LIMIT %s
                """, (query, query, limit))
            elif search_type == 'enrichment':
                cursor.execute("""
                    SELECT path, disk_label, size, category, enrichment
                    FROM files
                    WHERE enrichment IS NOT NULL AND enrichment::text ILIKE %s
                    LIMIT %s
                """, (f'%{query}%', limit))
            elif search_type == 'path':
                cursor.execute("""
                    SELECT path, disk_label, size, category
                    FROM files
                    WHERE path ILIKE %s
                    LIMIT %s
                """, (f'%{query}%', limit))
            else:
                logger.error(f'Unknown search type: {search_type}')
                return
            results = cursor.fetchall()
            if not results:
                print(f'No results found for: {query}')
                return
            print(f'\n=== SEARCH RESULTS: {len(results)} matches for "{query}" ===\n')
            for idx, row in enumerate(results, 1):
                if search_type == 'text':
                    path, disk, size, category, rank, snippet = row
                    print(f'{idx}. {path}')
                    print(f' Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')
                    print(f' Rank: {rank:.4f}')
                    if snippet:
                        print(f' Snippet: {snippet[:150]}...')
                elif search_type == 'enrichment':
                    path, disk, size, category, enrichment = row
                    print(f'{idx}. {path}')
                    print(f' Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')
                    if enrichment:
                        # json is already imported at module level
                        enrich_data = json.loads(enrichment) if isinstance(enrichment, str) else enrichment
                        if 'topics' in enrich_data:
                            print(f' Topics: {", ".join(enrich_data["topics"][:5])}')
                        if 'tech_stack' in enrich_data:
                            print(f' Tech: {", ".join(enrich_data["tech_stack"][:5])}')
                else:
                    path, disk, size, category = row
                    print(f'{idx}. {path}')
                    print(f' Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')
                print()
        finally:
            cursor.close()
            conn.close()

    def analyze_folders(self, disk: Optional[str] = None, min_files: int = 3):
        from analysis.folder_analyzer import FolderAnalyzer
        analyzer = FolderAnalyzer()
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            query = """
                SELECT DISTINCT SUBSTRING(path FROM 1 FOR POSITION('/' IN path || '/') - 1) as folder, disk_label
                FROM files
                WHERE 1=1
            """
            params = []
            if disk:
                query += ' AND disk_label = %s'
                params.append(disk)
            cursor.execute(query, params)
            potential_folders = cursor.fetchall()
            logger.info(f'Found {len(potential_folders)} potential folders to analyze')
            processed = 0
            for folder_name, disk_label in potential_folders:
                cursor.execute("""
                    SELECT path, size FROM files
                    WHERE disk_label = %s AND path LIKE %s
                """, (disk_label, f'{folder_name}%'))
                files = cursor.fetchall()
                if len(files) < min_files:
                    continue
                files_list = [{'path': f[0], 'size': int(f[1])} for f in files]
                folder_path = Path(folder_name)
                analysis = analyzer.analyze_folder(folder_path, files_list)
                readme_text = None
                for file_dict in files_list:
                    if 'readme' in file_dict['path'].lower():
                        readme_text = f"Found README at {file_dict['path']}"
                        break
                summary = analyzer.generate_summary(analysis, readme_text)
                cursor.execute("""
                    INSERT INTO folders (path, disk_label, file_count, total_size, project_type, intent, summary,
                                         has_readme, has_git, has_manifest, manifest_types, dominant_file_types, structure)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (path) DO UPDATE SET
                        file_count = EXCLUDED.file_count,
                        total_size = EXCLUDED.total_size,
                        project_type = EXCLUDED.project_type,
                        intent = EXCLUDED.intent,
                        summary = EXCLUDED.summary,
                        has_readme = EXCLUDED.has_readme,
                        has_git = EXCLUDED.has_git,
                        has_manifest = EXCLUDED.has_manifest,
                        manifest_types = EXCLUDED.manifest_types,
                        dominant_file_types = EXCLUDED.dominant_file_types,
                        structure = EXCLUDED.structure,
                        updated_at = CURRENT_TIMESTAMP
                """, (
                    str(folder_path),
                    disk_label,
                    len(files_list),
                    sum(f['size'] for f in files_list),
                    analysis.get('project_type'),
                    analysis.get('intent'),
                    summary,
                    analysis.get('has_readme'),
                    analysis.get('has_git'),
                    analysis.get('has_manifest'),
                    analysis.get('manifest_types'),
                    json.dumps(analysis.get('dominant_file_types', {})),
                    json.dumps(analysis.get('structure', {})),
                ))
                processed += 1
                if processed % 100 == 0:
                    conn.commit()
                    print(f'\rAnalyzed: {processed} folders', end='', flush=True)
            conn.commit()
            print()
            logger.info(f'Completed folder analysis: {processed} folders')
            cursor.execute("""
                SELECT project_type, COUNT(*), SUM(file_count), SUM(total_size)
                FROM folders GROUP BY project_type
            """)
            print('\n=== FOLDER ANALYSIS SUMMARY ===')
            for row in cursor.fetchall():
                proj_type, count, files, size = row
                print(f'{proj_type:20}: {count:6,} folders, {files:8,} files, {self.format_size(int(size or 0))}')
        finally:
            cursor.close()
            conn.close()

    def inventory_file_types(self, disk: Optional[str] = None, limit: int = 50):
        from analysis.inventory import FileTypeInventory
        inventory = FileTypeInventory(self.db_config)
        results = inventory.analyze(disk=disk, limit=limit)
        print('\n=== FILE TYPE INVENTORY ===\n')
        print(f'{"Extension":<15} {"Count":>10} {"Total Size":>12} {"Parsed":>8} {"Status":>8} {"Parser":>15}')
        print('=' * 95)
        for ext_info in results['extensions']:
            ext = ext_info['extension']
            count = ext_info['count']
            size = ext_info['total_size']
            parsed = ext_info['parsed']
            ptype = ext_info['parser_type']
            status = '✓' if ext_info['is_parseable'] else '✗'
            print(f'{ext:<15} {count:>10,} {inventory.format_size(size):>12} {parsed:>8,} {status:>8} {ptype:>15}')
        print('=' * 95)
        summary = results['summary']
        print(f'Total files: {summary["total_files"]:,}')
        print(f'Parseable: {summary["parseable_files"]:,} ({100 * summary["parseable_files"] / summary["total_files"]:.1f}%)')
        print(f'Parsed: {summary["parsed_files"]:,} ({summary["coverage"]:.1f}% coverage)')
        print('\n=== PARSER STATUS ===\n')
        for ptype, info in results['parser_status'].items():
            status = '✓ Implemented' if info['implemented'] else '✗ Not yet'
            print(f'{ptype:<15} {status:<20} {", ".join(info["extensions"][:10])}')
        if results['unparsed_by_type']:
            print('\n=== UNPARSED FILES BY TYPE ===\n')
            for ptype, info in sorted(results['unparsed_by_type'].items(), key=lambda x: x[1]['count'], reverse=True):
                print(f'{ptype:<15} {info["count"]:>10,} files unparsed')
                exts = sorted(info["extensions"])[:10]
                print(f' Extensions: {", ".join(exts)}')

    def review_migration(self, category: Optional[str] = None, show_build: bool = False):
        from classification.classifier import FileClassifier
        classifier = FileClassifier()
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            query = 'SELECT path, size, category FROM files WHERE 1=1'
            params = []
            if category:
                query += ' AND category = %s'
                params.append(category)
            if not show_build:
                query += " AND category NOT LIKE 'artifacts%'"
            query += ' LIMIT 100'
            cursor.execute(query, params)
            results = cursor.fetchall()
            print(f'\n=== MIGRATION REVIEW ({len(results)} files) ===\n')
            for path, size, cat in results:
                print(f'{path[:70]:<70} {cat:>20}')
        finally:
            cursor.close()
            conn.close()

    def format_size(self, size: int) -> str:
        for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
            if size < 1024:
                return f'{size:.1f}{unit}'
            size /= 1024
        return f'{size:.1f}PB'

def main():
    parser = argparse.ArgumentParser(description='Disk Reorganizer with Content Understanding')
    subparsers = parser.add_subparsers(dest='command', required=True)

    inventory_parser = subparsers.add_parser('inventory', help='Analyze file types and parser coverage')
    inventory_parser.add_argument('--disk', help='Analyze specific disk')
    inventory_parser.add_argument('--limit', type=int, default=50, help='Limit results')

    index_parser = subparsers.add_parser('index', help='Index files on a disk')
    index_parser.add_argument('disk_root', help='Root path of disk')
    index_parser.add_argument('disk_name', help='Logical name for disk')

    parse_parser = subparsers.add_parser('parse', help='Parse files to extract text')
    parse_parser.add_argument('--kind', help='Parse specific kind (text, code, pdf)')
    parse_parser.add_argument('--limit', type=int, default=100, help='Limit parse batch')
    parse_parser.add_argument('--update', action='store_true', help='Save extracted text to database')

    enrich_parser = subparsers.add_parser('enrich', help='Enrich content with LLM analysis')
    enrich_parser.add_argument('--limit', type=int, default=10, help='Limit enrichment batch')
    enrich_parser.add_argument('--use-llm', action='store_true', help='Use LLM for summarization')
    enrich_parser.add_argument('--network', action='store_true', help='Use network LM_STUDIO')

    search_parser = subparsers.add_parser('search', help='Search indexed content')
    search_parser.add_argument('query', help='Search query')
    search_parser.add_argument('--type', choices=['text', 'enrichment', 'path'], default='enrichment')
    search_parser.add_argument('--limit', type=int, default=20, help='Max results')

    classify_parser = subparsers.add_parser('classify', help='Classify files')
    classify_parser.add_argument('--disk', help='Classify specific disk')
    classify_parser.add_argument('--update', action='store_true', help='Update database')
    classify_parser.add_argument('--no-resume', action='store_true', help='Start from scratch')

    folders_parser = subparsers.add_parser('analyze-folders', help='Analyze folder structure')
    folders_parser.add_argument('--disk', help='Analyze specific disk')
    folders_parser.add_argument('--min-files', type=int, default=3)

    args = parser.parse_args()
    tool = DiskReorganizer()
    if args.command == 'inventory':
        tool.inventory_file_types(disk=args.disk, limit=args.limit)
    elif args.command == 'index':
        tool.index_disk(args.disk_root, args.disk_name)
    elif args.command == 'parse':
        tool.parse_files(kind=args.kind, limit=args.limit, update_db=args.update)
    elif args.command == 'enrich':
        tool.enrich_files(limit=args.limit, use_llm=args.use_llm, use_local=not args.network)
    elif args.command == 'search':
        tool.search_content(query=args.query, limit=args.limit, search_type=args.type)
    elif args.command == 'classify':
        tool.classify_files(disk=args.disk, update_db=args.update, resume=not args.no_resume)
    elif args.command == 'analyze-folders':
        tool.analyze_folders(disk=args.disk, min_files=args.min_files)


if __name__ == '__main__':
    main()
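
# Example invocations (illustrative; assumes this module is saved as
# disk_reorganizer.py, and the disk labels/mount points match the defaults
# used above):
#
#   python disk_reorganizer.py index /media/mike/DISK1 DISK1
#   python disk_reorganizer.py inventory --disk DISK1
#   python disk_reorganizer.py parse --kind text --limit 500 --update
#   python disk_reorganizer.py enrich --limit 20 --use-llm
#   python disk_reorganizer.py search "backup" --type path
#   python disk_reorganizer.py classify --disk DISK1 --update
#   python disk_reorganizer.py analyze-folders --disk DISK1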