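"""Disk reorganizer: index files across disks into PostgreSQL, plan and execute
migrations and merges, deduplicate by checksum, profile/extract/enrich/classify
content, search indexed text, and analyze folders."""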

import os
import sys
from dataclasses import dataclass
import psycopg2
import shutil
import hashlib
import argparse
import json
from pathlib import Path
from typing import List, Dict, Optional
from datetime import datetime
import logging
import time

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('disk_reorganizer.log'),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger(__name__)


@dataclass
class FileRecord:
    path: str
    size: int
    modified_time: float
    disk_label: str
    checksum: Optional[str] = None
    status: str = 'indexed'


class DiskReorganizer:

    def __init__(self, db_config: Dict=None):
        if db_config is None:
            db_config = {
                'host': os.getenv('DB_HOST', '192.168.1.159'),
                'port': int(os.getenv('DB_PORT', 5432)),
                'database': os.getenv('DB_NAME', 'disk_reorganizer_db'),
                'user': os.getenv('DB_USER', 'disk_reorg_user'),
                'password': os.getenv('DB_PASSWORD', 'heel-goed-wachtwoord'),
            }
        self.db_config = db_config
        self.init_database()
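
    # Database settings are read from the DB_HOST, DB_PORT, DB_NAME, DB_USER and
    # DB_PASSWORD environment variables; the literals above are only fallbacks.
    # Example (hypothetical invocation; the CLI entry point is outside this excerpt):
    #     DB_HOST=localhost DB_PASSWORD=... python disk_reorganizer.py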

    def get_connection(self):
        return psycopg2.connect(**self.db_config)

    def init_database(self):
        try:
            conn = self.get_connection()
            cursor = conn.cursor()
            cursor.execute("""
                SELECT table_name FROM information_schema.tables
                WHERE table_schema = 'public' AND table_name IN ('files', 'operations')
            """)
            tables = cursor.fetchall()
            if len(tables) < 2:
                logger.error('Database tables not found! Please run setup_database.sh first.')
                raise Exception('Database not properly initialized. Run setup_database.sh')
            cursor.close()
            conn.close()
            logger.info('Database connection verified successfully')
        except psycopg2.Error as e:
            logger.error(f'Database connection failed: {e}')
            raise

    def index_disk(self, disk_root: str, disk_name: str):
        logger.info(f'Indexing disk: {disk_name} at {disk_root}')
        disk_path = Path(disk_root)
        if not disk_path.exists():
            logger.error(f'Disk path {disk_root} does not exist!')
            return
        files_count = 0
        total_size = 0
        start_time = time.time()
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            for root, dirs, files in os.walk(disk_path):
                dirs[:] = [d for d in dirs if not d.startswith(('$', 'System Volume Information', 'Recovery'))]
                for file in files:
                    try:
                        file_path = Path(root) / file
                        if not file_path.is_file():
                            continue
                        stat = file_path.stat()
                        size = stat.st_size
                        mtime = datetime.fromtimestamp(stat.st_mtime)
                        rel_path = str(file_path.relative_to(disk_path))
                        cursor.execute("""
                            INSERT INTO files (path, size, modified_time, disk_label, checksum, status)
                            VALUES (%s, %s, %s, %s, %s, %s)
                            ON CONFLICT (path) DO UPDATE SET
                                size = EXCLUDED.size,
                                modified_time = EXCLUDED.modified_time,
                                disk_label = EXCLUDED.disk_label,
                                status = EXCLUDED.status
                        """, (rel_path, size, mtime, disk_name, None, 'indexed'))
                        files_count += 1
                        total_size += size
                        if files_count % 100 == 0:
                            elapsed = time.time() - start_time
                            rate = files_count / elapsed if elapsed > 0 else 0
                            display_path = str(file_path)
                            if len(display_path) > 60:
                                display_path = '...' + display_path[-57:]
                            print(f'\rIndexing: {files_count:,} files | {self.format_size(total_size)} | {rate:.0f} files/s | {display_path}', end='', flush=True)
                        if files_count % 1000 == 0:
                            conn.commit()
                    except Exception as e:
                        conn.rollback()
                        logger.warning(f'\nSkipping {file_path}: {e}')
                        continue
            conn.commit()
            print()
            logger.info(f'Completed indexing {disk_name}: {files_count} files, {self.format_size(total_size)}')
        finally:
            cursor.close()
            conn.close()
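
    # Note: rows are upserted on the relative path alone; ON CONFLICT (path) requires a
    # unique index or constraint on files.path, and it means the same relative path
    # indexed from two different disks overwrites the earlier row's disk_label.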

    def calculate_disk_usage(self) -> Dict[str, Dict]:
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute("""
                SELECT disk_label, SUM(size) as total_size, COUNT(*) as file_count
                FROM files
                GROUP BY disk_label
            """)
            usage = {}
            for row in cursor.fetchall():
                disk = row[0]
                size = int(row[1] or 0)
                count = int(row[2])
                usage[disk] = {'size': size, 'count': count, 'formatted_size': self.format_size(size)}
            return usage
        finally:
            cursor.close()
            conn.close()

    def plan_migration(self, target_disk: str, destination_disks: List[str]) -> Dict:
        logger.info(f'Planning migration to free up {target_disk}')
        usage = self.calculate_disk_usage()
        if target_disk not in usage:
            logger.error(f'Target disk {target_disk} not found in index!')
            return {}
        conn = self.get_connection()
        cursor = conn.cursor()
        cursor.execute('SELECT path, size, modified_time FROM files WHERE disk_label = %s ORDER BY size DESC', (target_disk,))
        files_to_move = cursor.fetchall()
        cursor.close()
        conn.close()
        target_disk_usage = usage[target_disk]['size']
        logger.info(f'Need to move {len(files_to_move)} files, {self.format_size(target_disk_usage)}')
        # Destination capacity is not checked yet: every destination is treated as
        # having unlimited space, and files are assigned round-robin below.
        dest_availability = []
        for disk in destination_disks:
            dest_availability.append({'disk': disk, 'available': float('inf'), 'planned_usage': 0})
        plan = {
            'target_disk': target_disk,
            'total_size': target_disk_usage,
            'file_count': len(files_to_move),
            'operations': [],
            'destination_disks': destination_disks,
        }
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            for file_info in files_to_move:
                rel_path, size, mtime = file_info
                dest_disk = destination_disks[len(plan['operations']) % len(destination_disks)]
                op = {
                    'source_disk': target_disk,
                    'source_path': rel_path,
                    'dest_disk': dest_disk,
                    'target_path': rel_path,
                    'size': int(size),
                }
                plan['operations'].append(op)
                cursor.execute(
                    'INSERT INTO operations (source_path, target_path, operation_type, status) VALUES (%s, %s, %s, %s)',
                    (f'{target_disk}:{rel_path}', f'{dest_disk}:{rel_path}', 'move', 'pending'),
                )
            conn.commit()
        finally:
            cursor.close()
            conn.close()
        plan_file = f"migration_plan_{target_disk}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(plan_file, 'w') as f:
            json.dump(plan, f, indent=2)
        logger.info(f"Plan created with {len(plan['operations'])} operations")
        logger.info(f'Plan saved to {plan_file}')
        return plan

    def verify_operation(self, source: Path, dest: Path) -> bool:
        if not dest.exists():
            return False
        try:
            source_stat = source.stat()
            dest_stat = dest.stat()
            if source_stat.st_size != dest_stat.st_size:
                return False
            return True
        except Exception as e:
            logger.error(f'Verification error: {e}')
            return False
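
    # verify_operation() only compares file sizes. A stricter (and slower) check could
    # also compare content hashes via the helper below, e.g. (sketch, not wired in):
    #     if self.file_checksum(source) != self.file_checksum(dest):
    #         return False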

    @staticmethod
    def file_checksum(path: Path) -> str:
        hash_md5 = hashlib.md5()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(4096), b''):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def execute_migration(self, plan_file: str, dry_run: bool=True):
        logger.info(f"{('DRY RUN' if dry_run else 'EXECUTING')} migration from {plan_file}")
        with open(plan_file, 'r') as f:
            plan = json.load(f)
        operations = plan['operations']
        logger.info(f'Processing {len(operations)} operations...')
        success_count = 0
        error_count = 0
        start_time = time.time()
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            for i, op in enumerate(operations, 1):
                source_disk = op['source_disk']
                source_path = op['source_path']
                dest_disk = op['dest_disk']
                target_path = op['target_path']
                source_full = Path(source_disk) / source_path
                dest_full = Path(dest_disk) / target_path
                elapsed = time.time() - start_time
                rate = i / elapsed if elapsed > 0 else 0
                eta = (len(operations) - i) / rate if rate > 0 else 0
                display_path = str(source_path)
                if len(display_path) > 50:
                    display_path = '...' + display_path[-47:]
                print(f'\r[{i}/{len(operations)}] {success_count} OK, {error_count} ERR | {rate:.1f} files/s | ETA: {int(eta)}s | {display_path}', end='', flush=True)
                if dry_run:
                    if source_full.exists():
                        success_count += 1
                    else:
                        logger.warning(f'\n Source does not exist: {source_full}')
                        error_count += 1
                    continue
                try:
                    dest_full.parent.mkdir(parents=True, exist_ok=True)
                    if source_full.exists():
                        shutil.copy2(source_full, dest_full)
                        if self.verify_operation(source_full, dest_full):
                            cursor.execute("UPDATE files SET disk_label = %s, status = 'moved' WHERE path = %s AND disk_label = %s", (dest_disk, source_path, source_disk))
                            cursor.execute('UPDATE operations SET executed = 1, executed_at = CURRENT_TIMESTAMP WHERE source_path = %s', (f'{source_disk}:{source_path}',))
                            success_count += 1
                        else:
                            raise Exception('Verification failed')
                    else:
                        logger.warning(f'\n Source missing: {source_full}')
                        error_count += 1
                except Exception as e:
                    logger.error(f'\n Error processing {source_path}: {e}')
                    cursor.execute('UPDATE operations SET error = %s WHERE source_path = %s', (str(e), f'{source_disk}:{source_path}'))
                    error_count += 1
                if i % 10 == 0:
                    conn.commit()
            conn.commit()
            print()
        finally:
            cursor.close()
            conn.close()
        logger.info(f'Migration complete: {success_count} success, {error_count} errors')
        if not dry_run and error_count == 0:
            logger.info(f"✓ Disk {plan['target_disk']} is ready for Linux installation!")
            logger.info(f" Remember to safely delete original files from {plan['target_disk']}")
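
    # Typical end-to-end flow, as a sketch (plan file names carry a timestamp, see plan_migration):
    #     tool = DiskReorganizer()
    #     tool.index_disk('/media/mike/SMT', 'SMT')
    #     plan = tool.plan_migration('SMT', ['DISK1', 'LLM'])
    #     tool.execute_migration('migration_plan_SMT_<timestamp>.json', dry_run=True)
    #     tool.execute_migration('migration_plan_SMT_<timestamp>.json', dry_run=False)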

    def run_deduplication(self, disk: Optional[str]=None, use_chunks: bool=True):
        logger.info(f"Starting deduplication{(' for disk ' + disk if disk else '')}")
        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
        conn = self.get_connection()
        cursor = conn.cursor()

        def hash_file_local(file_path: Path) -> str:
            hasher = hashlib.sha256()
            with open(file_path, 'rb') as f:
                while (chunk := f.read(65536)):
                    hasher.update(chunk)
            return hasher.hexdigest()

        try:
            if disk:
                cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s AND checksum IS NULL ORDER BY size DESC', (disk,))
            else:
                cursor.execute('SELECT path, size, disk_label FROM files WHERE checksum IS NULL ORDER BY size DESC')
            files_to_process = cursor.fetchall()
            total = len(files_to_process)
            logger.info(f'Found {total} files to hash')
            processed = 0
            skipped = 0
            start_time = time.time()
            batch = []
            print('Phase 1: Computing checksums...')
            for idx, (path_str, size, disk_label) in enumerate(files_to_process, 1):
                try:
                    mount_point = disk_mount_map.get(disk_label, disk_label)
                    full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)
                    if not full_path.exists():
                        skipped += 1
                        if idx % 100 == 0:
                            elapsed = time.time() - start_time
                            rate = (processed + skipped) / elapsed if elapsed > 0 else 0
                            remaining = (total - idx) / rate if rate > 0 else 0
                            pct = 100 * idx / total
                            print(f'\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining / 60)}m{int(remaining % 60):02d}s | Skip: {skipped:,}', end='', flush=True)
                        continue
                    checksum = hash_file_local(full_path)
                    batch.append((checksum, path_str))
                    processed += 1
                    if len(batch) >= 1000:
                        try:
                            cursor.executemany('UPDATE files SET checksum = %s WHERE path = %s', batch)
                            conn.commit()
                            batch.clear()
                        except Exception as e:
                            conn.rollback()
                            batch.clear()
                            print(f'\nBatch update failed: {e}')
                    if idx % 100 == 0:
                        elapsed = time.time() - start_time
                        rate = (processed + skipped) / elapsed if elapsed > 0 else 0
                        remaining = (total - idx) / rate if rate > 0 else 0
                        pct = 100 * idx / total
                        print(f'\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining / 60)}m{int(remaining % 60):02d}s | Skip: {skipped:,}', end='', flush=True)
                except Exception as e:
                    skipped += 1
                    if idx <= 5:
                        print(f'\nDebug: {path_str} - {e}')
            if batch:
                try:
                    cursor.executemany('UPDATE files SET checksum = %s WHERE path = %s', batch)
                    conn.commit()
                except Exception as e:
                    conn.rollback()
                    print(f'\nFinal batch failed: {e}')
            print()
            elapsed = time.time() - start_time
            logger.info(f'Phase 1 done: {processed:,} files in {int(elapsed / 60)}m{int(elapsed % 60):02d}s ({skipped:,} skipped)')
            print('Phase 2: Finding duplicates...')
            cursor.execute("""
                UPDATE files f1 SET duplicate_of = (
                    SELECT MIN(path) FROM files f2
                    WHERE f2.checksum = f1.checksum AND f2.path < f1.path
                )
                WHERE checksum IS NOT NULL
            """)
            conn.commit()
            cursor.execute('SELECT COUNT(*) FROM files WHERE duplicate_of IS NOT NULL')
            dup_count = cursor.fetchone()[0]
            logger.info(f'Phase 2 done: Found {dup_count:,} duplicates')
        finally:
            cursor.close()
            conn.close()
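
    # Performance note (assumption, not from the original schema): Phase 2 runs a
    # correlated subquery over files.checksum for every row, so on large tables a
    # plain index along the lines of
    #     CREATE INDEX IF NOT EXISTS idx_files_checksum ON files (checksum);
    # is likely needed to keep it from degenerating into repeated sequential scans.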

    def plan_merge(self, sources: List[str], target: str, output_file: str, filter_system: bool=False, network_target: str=None):
        logger.info(f"Planning merge: {', '.join(sources)} → {target or network_target}")
        if filter_system:
            sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            from filters import GitignoreFilter
            file_filter = GitignoreFilter()
            logger.info('System/build file filtering enabled')
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            placeholders = ','.join(['%s'] * len(sources))
            cursor.execute(f"""
                SELECT path, size, checksum, disk_label, duplicate_of
                FROM files
                WHERE disk_label IN ({placeholders})
                ORDER BY size DESC
            """, tuple(sources))
            files = cursor.fetchall()
            total_files = len(files)
            total_size = sum((int(f[1]) for f in files))
            unique_files = {}
            duplicate_count = 0
            duplicate_size = 0
            filtered_count = 0
            filtered_size = 0
            for path, size, checksum, disk_label, duplicate_of in files:
                if filter_system and file_filter.should_exclude(path):
                    filtered_count += 1
                    filtered_size += int(size)
                    continue
                if checksum and checksum in unique_files:
                    duplicate_count += 1
                    duplicate_size += int(size)
                elif checksum:
                    unique_files[checksum] = (path, int(size), disk_label)
            unique_count = len(unique_files)
            unique_size = sum((f[1] for f in unique_files.values()))
            plan = {
                'sources': sources,
                'target': target or network_target,
                'network': network_target is not None,
                'total_files': total_files,
                'total_size': total_size,
                'unique_files': unique_count,
                'unique_size': unique_size,
                'duplicate_files': duplicate_count,
                'duplicate_size': duplicate_size,
                'filtered_files': filtered_count if filter_system else 0,
                'filtered_size': filtered_size if filter_system else 0,
                'space_saved': duplicate_size + (filtered_size if filter_system else 0),
                'operations': [],
            }
            for checksum, (path, size, disk_label) in unique_files.items():
                plan['operations'].append({
                    'source_disk': disk_label,
                    'source_path': path,
                    'target_disk': target or network_target,
                    'target_path': path,
                    'size': size,
                    'checksum': checksum,
                })
            with open(output_file, 'w') as f:
                json.dump(plan, f, indent=2)
            logger.info(f'Merge plan saved to {output_file}')
            print('\n=== MERGE PLAN SUMMARY ===')
            print(f"Sources: {', '.join(sources)}")
            print(f'Target: {target or network_target}')
            print(f'Total files: {total_files:,} ({self.format_size(total_size)})')
            if filter_system:
                print(f'Filtered (system/build): {filtered_count:,} ({self.format_size(filtered_size)})')
            print(f'Unique files: {unique_count:,} ({self.format_size(unique_size)})')
            print(f'Duplicates: {duplicate_count:,} ({self.format_size(duplicate_size)})')
            print(f"Total space saved: {self.format_size(plan['space_saved'])}")
            print(f'Space needed on target: {self.format_size(unique_size)}')
        finally:
            cursor.close()
            conn.close()

    def generate_report(self, format='text', show_duplicates=False, preview_merge=None):
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            if preview_merge:
                with open(preview_merge, 'r') as f:
                    plan = json.load(f)
                print('\n=== MERGE PLAN PREVIEW ===')
                print(f"Sources: {', '.join(plan['sources'])}")
                print(f"Target: {plan['target']}")
                print(f"Total files: {plan['total_files']:,} ({self.format_size(plan['total_size'])})")
                print(f"Unique files: {plan['unique_files']:,} ({self.format_size(plan['unique_size'])})")
                print(f"Duplicates: {plan['duplicate_files']:,} ({self.format_size(plan['duplicate_size'])})")
                print(f"Space saved: {self.format_size(plan['space_saved'])}")
                print(f"Space needed on target: {self.format_size(plan['unique_size'])}")
                return
            cursor.execute("""
                SELECT status, COUNT(*), SUM(size) FROM files GROUP BY status
            """)
            print('\n=== FILE MIGRATION REPORT ===')
            for row in cursor.fetchall():
                status, count, size = row
                print(f'{status:15}: {count:6} files, {self.format_size(int(size or 0))}')
            cursor.execute("""
                SELECT disk_label, COUNT(*), SUM(size) FROM files GROUP BY disk_label
            """)
            print('\n=== DISK USAGE ===')
            for row in cursor.fetchall():
                disk, count, size = row
                print(f'{disk:20}: {count:6} files, {self.format_size(int(size or 0))}')
            cursor.execute("""
                SELECT COUNT(*), SUM(size) FROM files WHERE checksum IS NOT NULL
            """)
            hashed_count, hashed_size = cursor.fetchone()
            cursor.execute("""
                SELECT COUNT(*), SUM(size) FROM files WHERE duplicate_of IS NOT NULL
            """)
            dup_count, dup_size = cursor.fetchone()
            print('\n=== DEDUPLICATION STATS ===')
            print(f'Files with checksums: {hashed_count or 0:6}')
            print(f'Duplicate files: {dup_count or 0:6} ({self.format_size(int(dup_size or 0))})')
            if show_duplicates and dup_count:
                print('\n=== DUPLICATE FILES ===')
                cursor.execute("""
                    SELECT path, size, duplicate_of FROM files
                    WHERE duplicate_of IS NOT NULL
                    ORDER BY size DESC
                    LIMIT 20
                """)
                for path, size, dup_of in cursor.fetchall():
                    print(f' {path} ({self.format_size(int(size))}) → {dup_of}')
            cursor.execute("""
                SELECT operation_type, executed, verified, COUNT(*) FROM operations GROUP BY operation_type, executed, verified
            """)
            print('\n=== OPERATIONS REPORT ===')
            for row in cursor.fetchall():
                op_type, executed, verified, count = row
                status = 'EXECUTED' if executed else 'PENDING'
                if verified:
                    status += '+VERIFIED'
                print(f'{op_type:10} {status:15}: {count} operations')
        finally:
            cursor.close()
            conn.close()

    def profile_content(self, disk: Optional[str]=None, update_db: bool=False, limit: Optional[int]=None):
        from content.profiler import ContentProfiler
        profiler = ContentProfiler()
        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            query = 'SELECT path, size, disk_label FROM files WHERE 1=1'
            params = []
            if disk:
                query += ' AND disk_label = %s'
                params.append(disk)
            if limit:
                query += f' LIMIT {limit}'
            cursor.execute(query, params)
            files = cursor.fetchall()
            total = len(files)
            logger.info(f'Profiling {total:,} files...')
            kind_stats = {}
            processable = 0
            batch = []
            for idx, (path, size, disk_label) in enumerate(files, 1):
                mount_point = disk_mount_map.get(disk_label, disk_label)
                full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
                if not full_path.exists():
                    continue
                profile = profiler.profile_file(full_path)
                if 'error' not in profile:
                    kind = profile['kind']
                    if kind not in kind_stats:
                        kind_stats[kind] = {'count': 0, 'processable': 0}
                    kind_stats[kind]['count'] += 1
                    if profile['processable']:
                        kind_stats[kind]['processable'] += 1
                        processable += 1
                    if update_db:
                        profile_json = json.dumps(profile)
                        batch.append((kind, profile_json, path))
                        if len(batch) >= 500:
                            cursor.executemany("UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s", [(pj, p) for k, pj, p in batch])
                            conn.commit()
                            batch.clear()
                if idx % 100 == 0:
                    print(f'\rProfiled: {idx:,}/{total:,}', end='', flush=True)
            if update_db and batch:
                cursor.executemany("UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s", [(pj, p) for k, pj, p in batch])
                conn.commit()
            print()
            print('\n=== CONTENT PROFILE SUMMARY ===')
            print(f'Total files: {total:,}')
            print(f'Processable: {processable:,}\n')
            print(f"{'Kind':<15} {'Total':<10} {'Processable':<12} {'Extractor'}")
            print('-' * 60)
            for kind in sorted(kind_stats.keys()):
                stats = kind_stats[kind]
                extractor = profiler._suggest_extractor(kind, '')
                print(f"{kind:<15} {stats['count']:<10,} {stats['processable']:<12,} {extractor or 'none'}")
        finally:
            cursor.close()
            conn.close()

    def extract_content(self, kind: Optional[str]=None, limit: int=10):
        from content.profiler import ContentProfiler
        from content.extractors import ContentExtractor
        profiler = ContentProfiler()
        extractor = ContentExtractor()
        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            query = "SELECT path, size, disk_label, metadata FROM files WHERE metadata->'profile'->>'processable' = 'true'"
            params = []
            if kind:
                query += " AND metadata->'profile'->>'kind' = %s"
                params.append(kind)
            query += f' LIMIT {limit}'
            cursor.execute(query, params)
            files = cursor.fetchall()
            print('\n=== EXTRACTING CONTENT ===')
            print(f'Processing {len(files)} files\n')
            for path, size, disk_label, metadata in files:
                mount_point = disk_mount_map.get(disk_label, disk_label)
                full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)
                if not full_path.exists():
                    continue
                profile = metadata.get('profile', {}) if metadata else {}
                extractor_type = profile.get('extractor')
                if not extractor_type:
                    continue
                print(f'Extracting: {path}')
                print(f" Type: {profile.get('kind')} | Extractor: {extractor_type}")
                result = extractor.extract(full_path, extractor_type)
                if 'text' in result:
                    preview = result['text'][:200]
                    print(f' Preview: {preview}...')
                elif 'pipeline' in result:
                    print(f" Pipeline: {' → '.join(result['pipeline'])}")
                    print(f" Status: {result.get('status', 'pending')}")
                print()
        finally:
            cursor.close()
            conn.close()

    def parse_files(self, kind: Optional[str] = None, limit: int = 100, update_db: bool = False):
        from parsers.text_parser import TextParser
        from parsers.code_parser import CodeParser
        from parsers.pdf_parser import PDFParser

        parsers = {'text': TextParser(), 'code': CodeParser(), 'pdf': PDFParser()}
        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            query = "SELECT path, size, disk_label FROM files WHERE 1=1"
            if kind:
                suffix_map = {
                    'text': ['.txt', '.md', '.log', '.json', '.yaml', '.yml'],
                    'code': ['.py', '.js', '.java', '.go', '.rs', '.ts', '.cpp', '.h'],
                    'pdf': ['.pdf'],
                }
                if kind in suffix_map:
                    # Extensions come from the hardcoded map above, so interpolating
                    # them into the LIKE patterns does not take user input.
                    conditions = ' OR '.join([f"path LIKE '%{ext}'" for ext in suffix_map[kind]])
                    query += f" AND ({conditions})"
            query += f" LIMIT {limit}"

            cursor.execute(query)
            files = cursor.fetchall()

            print(f"\n=== PARSING FILES ===\nProcessing {len(files)} files\n")

            parsed_count = 0
            for path, size, disk_label in files:
                mount_point = disk_mount_map.get(disk_label, disk_label)
                full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)

                # Skip missing files and anything over 10 MB.
                if not full_path.exists() or int(size) > 10 * 1024 * 1024:
                    continue

                file_kind = 'pdf' if path.endswith('.pdf') else 'code' if any(path.endswith(e) for e in ['.py', '.js', '.java']) else 'text'
                parser = parsers.get(file_kind)
                if not parser:
                    continue

                result = parser.parse(full_path)
                if 'error' not in result:
                    text = result.get('text', '')
                    quality = result.get('quality', 'unknown')
                    print(f"{path[:60]} | {file_kind} | {len(text):,} chars")

                    if update_db and text:
                        cursor.execute("UPDATE files SET extracted_text = %s, text_quality = %s WHERE path = %s", (text[:50000], quality, path))
                        parsed_count += 1
                        if parsed_count % 10 == 0:
                            conn.commit()

            if update_db:
                conn.commit()
            print(f"\nParsed {parsed_count} files")

        finally:
            cursor.close()
            conn.close()

    def enrich_files(self, limit: int = 10, use_llm: bool = False, use_local: bool = True, batch_size: int = 100):
        from enrichment.enricher import ContentEnricher
        from enrichment.llm_client import LLMClient

        llm_client = LLMClient(use_local=use_local) if use_llm else None
        enricher = ContentEnricher(llm_client=llm_client)

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL AND (enrichment IS NULL OR enrichment = '{{}}'::jsonb) LIMIT {limit}")
            files = cursor.fetchall()

            print("\n=== ENRICHING CONTENT ===")
            print(f"Processing {len(files)} files")
            if use_llm:
                print(f"Using LLM: {'Local OLLAMA' if use_local else 'Network LM_STUDIO'}\n")
            else:
                print("Using rule-based enrichment only\n")

            enriched_count = 0
            batch = []
            for idx, (path, text) in enumerate(files, 1):
                if not text:
                    continue

                enrichment = enricher.enrich(text[:5000], use_llm=use_llm)

                print(f"{idx}/{len(files)} {path[:60]}")
                print(f" Quality: {enrichment.get('quality')} | Words: {enrichment.get('word_count'):,}")
                if enrichment.get('security', {}).get('has_pii'):
                    print(f" PII: {list(enrichment.get('security', {}).get('pii_details', {}).keys())}")
                if enrichment.get('tech_stack'):
                    print(f" Tech: {', '.join(enrichment['tech_stack'][:5])}")
                if enrichment.get('topics'):
                    print(f" Topics: {', '.join(enrichment['topics'][:5])}")
                if use_llm and enrichment.get('llm_summary'):
                    print(f" LLM Summary: {enrichment['llm_summary'][:100]}...")
                if use_llm and enrichment.get('llm_intent'):
                    print(f" Intent: {enrichment['llm_intent'][:100]}...")
                print()

                batch.append((json.dumps(enrichment), path))
                enriched_count += 1

                if len(batch) >= batch_size:
                    cursor.executemany("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", batch)
                    conn.commit()
                    batch.clear()
                    print(f" Committed batch ({enriched_count} files so far)")

            if batch:
                cursor.executemany("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", batch)
                conn.commit()

            print(f"\nEnriched {enriched_count} files")

        finally:
            cursor.close()
            conn.close()

    def classify_files(self, disk: Optional[str]=None, update_db: bool=False, resume: bool=True):
        from classification.classifier import FileClassifier
        classifier = FileClassifier()
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            task_name = f"classify_{disk or 'all'}"
            skip_count = 0

            if resume and update_db:
                cursor.execute('SELECT last_processed_path, processed_count FROM processing_checkpoints WHERE task_name = %s', (task_name,))
                checkpoint = cursor.fetchone()
                if checkpoint:
                    last_path, skip_count = checkpoint
                    logger.info(f'Resuming from checkpoint: {skip_count:,} files already processed')

            if disk:
                cursor.execute('SELECT path, size, disk_label FROM files WHERE disk_label = %s ORDER BY path', (disk,))
            else:
                cursor.execute('SELECT path, size, disk_label FROM files ORDER BY path')
            files = cursor.fetchall()
            total = len(files)
            logger.info(f'Classifying {total:,} files...')

            categories = {}
            build_artifacts = 0
            batch = []
            processed = 0

            for idx, (path, size, disk_label) in enumerate(files, 1):
                if idx <= skip_count:
                    continue

                labels, category, is_build = classifier.classify_path(path, int(size))
                if is_build:
                    build_artifacts += 1
                if category not in categories:
                    categories[category] = {'count': 0, 'size': 0}
                categories[category]['count'] += 1
                categories[category]['size'] += int(size)

                if update_db:
                    # Note: labels are joined here but only category is persisted below.
                    labels_str = ','.join(labels)
                    batch.append((category, labels_str, path))

                    if len(batch) >= 1000:
                        cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch])
                        cursor.execute("""
                            INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at)
                            VALUES (%s, %s, %s, CURRENT_TIMESTAMP)
                            ON CONFLICT (task_name) DO UPDATE SET
                                last_processed_path = EXCLUDED.last_processed_path,
                                processed_count = EXCLUDED.processed_count,
                                updated_at = CURRENT_TIMESTAMP
                        """, (task_name, path, idx))
                        conn.commit()
                        batch.clear()

                processed += 1
                if idx % 1000 == 0:
                    print(f'\rClassified: {idx:,}/{total:,} ({100*idx/total:.1f}%)', end='', flush=True)

            if update_db and batch:
                cursor.executemany('UPDATE files SET category = %s WHERE path = %s', [(cat, p) for cat, lbl, p in batch])
                cursor.execute("""
                    INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at)
                    VALUES (%s, %s, %s, CURRENT_TIMESTAMP)
                    ON CONFLICT (task_name) DO UPDATE SET
                        last_processed_path = EXCLUDED.last_processed_path,
                        processed_count = EXCLUDED.processed_count,
                        updated_at = CURRENT_TIMESTAMP
                """, (task_name, files[-1][0] if files else '', total))
                conn.commit()

            print()
            print('\n=== CLASSIFICATION SUMMARY ===')
            print(f'Total files: {total:,}')
            print(f'Build artifacts: {build_artifacts:,}')
            print('\nCategories:')
            for category in sorted(categories.keys()):
                info = categories[category]
                print(f"  {category:30}: {info['count']:8,} files, {self.format_size(info['size'])}")
        finally:
            cursor.close()
            conn.close()

    def search_content(self, query: str, limit: int=20, search_type: str='text'):
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            if search_type == 'text':
                cursor.execute("""
                    SELECT path, disk_label, size, category,
                           ts_rank(to_tsvector('english', COALESCE(extracted_text, '')), plainto_tsquery('english', %s)) as rank,
                           LEFT(extracted_text, 200) as snippet
                    FROM files
                    WHERE extracted_text IS NOT NULL
                      AND to_tsvector('english', extracted_text) @@ plainto_tsquery('english', %s)
                    ORDER BY rank DESC
                    LIMIT %s
                """, (query, query, limit))
            elif search_type == 'enrichment':
                cursor.execute("""
                    SELECT path, disk_label, size, category, enrichment
                    FROM files
                    WHERE enrichment IS NOT NULL
                      AND enrichment::text ILIKE %s
                    LIMIT %s
                """, (f'%{query}%', limit))
            elif search_type == 'path':
                cursor.execute("""
                    SELECT path, disk_label, size, category
                    FROM files
                    WHERE path ILIKE %s
                    LIMIT %s
                """, (f'%{query}%', limit))
            else:
                logger.error(f'Unknown search type: {search_type}')
                return

            results = cursor.fetchall()
            if not results:
                print(f'No results found for: {query}')
                return

            print(f'\n=== SEARCH RESULTS: {len(results)} matches for "{query}" ===\n')
            for idx, row in enumerate(results, 1):
                if search_type == 'text':
                    path, disk, size, category, rank, snippet = row
                    print(f'{idx}. {path}')
                    print(f' Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')
                    print(f' Rank: {rank:.4f}')
                    if snippet:
                        print(f' Snippet: {snippet[:150]}...')
                elif search_type == 'enrichment':
                    path, disk, size, category, enrichment = row
                    print(f'{idx}. {path}')
                    print(f' Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')
                    if enrichment:
                        enrich_data = json.loads(enrichment) if isinstance(enrichment, str) else enrichment
                        if 'topics' in enrich_data:
                            print(f' Topics: {", ".join(enrich_data["topics"][:5])}')
                        if 'tech_stack' in enrich_data:
                            print(f' Tech: {", ".join(enrich_data["tech_stack"][:5])}')
                else:
                    path, disk, size, category = row
                    print(f'{idx}. {path}')
                    print(f' Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')
                print()

        finally:
            cursor.close()
            conn.close()
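
    # Performance note (assumption, not from the original schema): the 'text' search
    # recomputes to_tsvector('english', extracted_text) for every candidate row. A GIN
    # expression index along the lines of
    #     CREATE INDEX IF NOT EXISTS idx_files_fts
    #         ON files USING gin (to_tsvector('english', extracted_text));
    # would let PostgreSQL serve these queries without a full sequential scan.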

    def analyze_folders(self, disk: Optional[str]=None, min_files: int=3):
        from analysis.folder_analyzer import FolderAnalyzer
        analyzer = FolderAnalyzer()
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            query = """
                SELECT DISTINCT SUBSTRING(path FROM 1 FOR POSITION('/' IN path || '/') - 1) as folder, disk_label
                FROM files
                WHERE 1=1
            """
            params = []
            if disk:
                query += ' AND disk_label = %s'
                params.append(disk)

            cursor.execute(query, params)
            potential_folders = cursor.fetchall()

            logger.info(f'Found {len(potential_folders)} potential folders to analyze')

            processed = 0
            for folder_name, disk_label in potential_folders:
                cursor.execute("""
                    SELECT path, size FROM files
                    WHERE disk_label = %s AND path LIKE %s
                """, (disk_label, f'{folder_name}%'))

                files = cursor.fetchall()
                if len(files) < min_files:
                    continue

                files_list = [{'path': f[0], 'size': int(f[1])} for f in files]
                folder_path = Path(folder_name)

                analysis = analyzer.analyze_folder(folder_path, files_list)

                readme_text = None
                for file_dict in files_list:
                    if 'readme' in file_dict['path'].lower():
                        readme_text = f"Found README at {file_dict['path']}"
                        break

                summary = analyzer.generate_summary(analysis, readme_text)

                cursor.execute("""
                    INSERT INTO folders (path, disk_label, file_count, total_size, project_type, intent, summary,
                                         has_readme, has_git, has_manifest, manifest_types, dominant_file_types, structure)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (path) DO UPDATE SET
                        file_count = EXCLUDED.file_count,
                        total_size = EXCLUDED.total_size,
                        project_type = EXCLUDED.project_type,
                        intent = EXCLUDED.intent,
                        summary = EXCLUDED.summary,
                        has_readme = EXCLUDED.has_readme,
                        has_git = EXCLUDED.has_git,
                        has_manifest = EXCLUDED.has_manifest,
                        manifest_types = EXCLUDED.manifest_types,
                        dominant_file_types = EXCLUDED.dominant_file_types,
                        structure = EXCLUDED.structure,
                        updated_at = CURRENT_TIMESTAMP
                """, (
                    str(folder_path), disk_label, len(files_list), sum(f['size'] for f in files_list),
                    analysis.get('project_type'), analysis.get('intent'), summary,
                    analysis.get('has_readme'), analysis.get('has_git'), analysis.get('has_manifest'),
                    analysis.get('manifest_types'), json.dumps(analysis.get('dominant_file_types', {})),
                    json.dumps(analysis.get('structure', {}))
                ))

                processed += 1
                if processed % 100 == 0:
                    conn.commit()
                    print(f'\rAnalyzed: {processed} folders', end='', flush=True)

            conn.commit()
            print()
            logger.info(f'Completed folder analysis: {processed} folders')

            cursor.execute("""
                SELECT project_type, COUNT(*), SUM(file_count), SUM(total_size)
                FROM folders
                GROUP BY project_type
            """)
            print('\n=== FOLDER ANALYSIS SUMMARY ===')
            for row in cursor.fetchall():
                proj_type, count, files, size = row
                # project_type can be NULL; fall back to a label so the format spec does not fail
                print(f'{(proj_type or "unknown"):20}: {count:6,} folders, {files:8,} files, {self.format_size(int(size or 0))}')
        finally:
            cursor.close()
            conn.close()