#!/usr/bin/env python3
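"""Disk Reorganizer - index, deduplicate, and migrate files across disks.

All state lives in a PostgreSQL database (tables `files` and `operations`,
created by setup_database.sh). Example invocations (disk names and paths are
illustrative):

    python disk_reorganizer.py index /media/mike/DISK1 DISK1
    python disk_reorganizer.py dedupe --disk DISK1
    python disk_reorganizer.py plan DISK1 SMT LLM
    python disk_reorganizer.py execute migration_plan_DISK1_20240101_120000.json --dry-run
    python disk_reorganizer.py report --show-duplicates
"""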
import argparse
import hashlib
import json
import logging
import os
import shutil
import sys
import time
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional

import psycopg2

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('disk_reorganizer.log'), logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)

@dataclass
class FileRecord:
    """Represents a file in the index"""
    path: str
    size: int
    modified_time: float
    disk_label: str
    checksum: Optional[str] = None
    status: str = 'indexed'  # indexed, planned, moved, verified

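# Illustrative only: FileRecord is not instantiated in this module; it documents
# the shape of one row of the `files` table, e.g.
#   FileRecord(path='docs/report.pdf', size=1048576,
#              modified_time=1700000000.0, disk_label='DISK1')
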
class DiskReorganizer:
    def __init__(self, db_config: Dict = None):
        """
        Initialize DiskReorganizer with PostgreSQL connection
        :param db_config: Database configuration dict with host, port, database, user, password
        """
        if db_config is None:
            db_config = {
                'host': os.getenv('DB_HOST', '192.168.1.159'),
                'port': int(os.getenv('DB_PORT', 5432)),
                'database': os.getenv('DB_NAME', 'disk_reorganizer_db'),
                'user': os.getenv('DB_USER', 'disk_reorg_user'),
                'password': os.getenv('DB_PASSWORD', 'heel-goed-wachtwoord')
            }
        self.db_config = db_config
        self.init_database()

    def get_connection(self):
        """Get PostgreSQL database connection"""
        return psycopg2.connect(**self.db_config)

    def init_database(self):
        """Verify PostgreSQL database connection and tables exist"""
        try:
            conn = self.get_connection()
            cursor = conn.cursor()

            # Test connection and verify tables exist
            cursor.execute("""
                SELECT table_name FROM information_schema.tables
                WHERE table_schema = 'public' AND table_name IN ('files', 'operations')
            """)
            tables = cursor.fetchall()

            if len(tables) < 2:
                logger.error("Database tables not found! Please run setup_database.sh first.")
                raise Exception("Database not properly initialized. Run setup_database.sh")

            cursor.close()
            conn.close()
            logger.info("Database connection verified successfully")
        except psycopg2.Error as e:
            logger.error(f"Database connection failed: {e}")
            raise

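    # The authoritative schema is created by setup_database.sh. For reference,
    # a minimal sketch of the tables this class assumes (column names inferred
    # from the queries in this module; types are illustrative):
    #
    #   CREATE TABLE files (
    #       path TEXT PRIMARY KEY,       -- relative path; ON CONFLICT target
    #       size BIGINT,
    #       modified_time TIMESTAMP,
    #       disk_label TEXT,
    #       checksum TEXT,
    #       status TEXT,
    #       duplicate_of TEXT,
    #       category TEXT,
    #       metadata JSONB
    #   );
    #   CREATE TABLE operations (
    #       source_path TEXT,
    #       target_path TEXT,
    #       operation_type TEXT,
    #       status TEXT,
    #       executed INTEGER DEFAULT 0,
    #       executed_at TIMESTAMP,
    #       verified BOOLEAN,
    #       error TEXT
    #   );
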
    def index_disk(self, disk_root: str, disk_name: str):
        """
        Index all files on a disk/partition with dynamic progress display
        :param disk_root: Root path of disk (e.g., 'D:\\')
        :param disk_name: Logical name for the disk
        """
        logger.info(f"Indexing disk: {disk_name} at {disk_root}")
        disk_path = Path(disk_root)

        if not disk_path.exists():
            logger.error(f"Disk path {disk_root} does not exist!")
            return

        files_count = 0
        total_size = 0
        start_time = time.time()

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            # Walk through all files
            for root, dirs, files in os.walk(disk_path):
                # Skip system directories
                dirs[:] = [d for d in dirs if not d.startswith(('$', 'System Volume Information', 'Recovery'))]

                for file in files:
                    try:
                        file_path = Path(root) / file
                        if not file_path.is_file():
                            continue

                        stat = file_path.stat()
                        size = stat.st_size
                        mtime = datetime.fromtimestamp(stat.st_mtime)

                        # Calculate relative path for portability
                        rel_path = str(file_path.relative_to(disk_path))

                        # PostgreSQL INSERT ... ON CONFLICT for upsert
                        cursor.execute("""
                            INSERT INTO files (path, size, modified_time, disk_label, checksum, status)
                            VALUES (%s, %s, %s, %s, %s, %s)
                            ON CONFLICT (path) DO UPDATE SET
                                size = EXCLUDED.size,
                                modified_time = EXCLUDED.modified_time,
                                disk_label = EXCLUDED.disk_label,
                                status = EXCLUDED.status
                        """, (rel_path, size, mtime, disk_name, None, 'indexed'))

                        files_count += 1
                        total_size += size

                        # Dynamic progress display - update every 100 files
                        if files_count % 100 == 0:
                            elapsed = time.time() - start_time
                            rate = files_count / elapsed if elapsed > 0 else 0
                            # Truncate path for display
                            display_path = str(file_path)
                            if len(display_path) > 60:
                                display_path = '...' + display_path[-57:]

                            # Use \r to overwrite the line
                            print(f"\rIndexing: {files_count:,} files | {self.format_size(total_size)} | {rate:.0f} files/s | {display_path}", end='', flush=True)

                        # Commit every 1000 files for performance
                        if files_count % 1000 == 0:
                            conn.commit()

                    except Exception as e:
                        conn.rollback()
                        logger.warning(f"\nSkipping {file_path}: {e}")
                        continue

            conn.commit()
            print()  # New line after progress display
            logger.info(f"Completed indexing {disk_name}: {files_count} files, {self.format_size(total_size)}")

        finally:
            cursor.close()
            conn.close()

    def calculate_disk_usage(self) -> Dict[str, Dict]:
        """Calculate current usage per disk"""
        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            cursor.execute("""
                SELECT disk_label, SUM(size) as total_size, COUNT(*) as file_count
                FROM files
                GROUP BY disk_label
            """)

            usage = {}
            for row in cursor.fetchall():
                disk = row[0]
                size = int(row[1] or 0)
                count = int(row[2])
                usage[disk] = {
                    'size': size,
                    'count': count,
                    'formatted_size': self.format_size(size)
                }

            return usage
        finally:
            cursor.close()
            conn.close()

    def plan_migration(self, target_disk: str, destination_disks: List[str]) -> Dict:
        """
        Create a migration plan to free up target_disk
        :param target_disk: Disk to free up (e.g., 'D:')
        :param destination_disks: List of disks to move files to
        :return: Migration plan dictionary
        """
        logger.info(f"Planning migration to free up {target_disk}")

        usage = self.calculate_disk_usage()

        if target_disk not in usage:
            logger.error(f"Target disk {target_disk} not found in index!")
            return {}

        # Get files on target disk
        conn = self.get_connection()
        cursor = conn.cursor()

        cursor.execute(
            "SELECT path, size, modified_time FROM files WHERE disk_label = %s ORDER BY size DESC",
            (target_disk,)
        )
        files_to_move = cursor.fetchall()
        cursor.close()
        conn.close()

        target_disk_usage = usage[target_disk]['size']
        logger.info(f"Need to move {len(files_to_move)} files, {self.format_size(target_disk_usage)}")

        # Calculate available space on destination disks
        dest_availability = []
        for disk in destination_disks:
            if disk not in usage:
                # Assume empty disk
                available = float('inf')
            else:
                # In real scenario, query actual disk free space
                available = float('inf')  # Placeholder

            dest_availability.append({
                'disk': disk,
                'available': available,
                'planned_usage': 0
            })

        # Generate move plan
        plan = {
            'target_disk': target_disk,
            'total_size': target_disk_usage,
            'file_count': len(files_to_move),
            'operations': [],
            'destination_disks': destination_disks
        }

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            for file_info in files_to_move:
                rel_path, size, mtime = file_info

                # Find best destination (simple round-robin for balance)
                dest_disk = destination_disks[len(plan['operations']) % len(destination_disks)]

                # Record operation
                op = {
                    'source_disk': target_disk,
                    'source_path': rel_path,
                    'dest_disk': dest_disk,
                    'target_path': rel_path,  # Keep same relative path
                    'size': int(size)
                }
                plan['operations'].append(op)

                # Store in database
                cursor.execute(
                    "INSERT INTO operations (source_path, target_path, operation_type, status) VALUES (%s, %s, %s, %s)",
                    (f"{target_disk}:{rel_path}", f"{dest_disk}:{rel_path}", 'move', 'pending')
                )

            conn.commit()
        finally:
            cursor.close()
            conn.close()

        # Save plan to JSON
        plan_file = f"migration_plan_{target_disk}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(plan_file, 'w') as f:
            json.dump(plan, f, indent=2)

        logger.info(f"Plan created with {len(plan['operations'])} operations")
        logger.info(f"Plan saved to {plan_file}")

        return plan

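    # For reference, the plan JSON written above (and read back by
    # execute_migration) has this shape (values illustrative):
    #
    #   {
    #     "target_disk": "DISK1",
    #     "total_size": 123456789,
    #     "file_count": 2,
    #     "destination_disks": ["SMT", "LLM"],
    #     "operations": [
    #       {"source_disk": "DISK1", "source_path": "docs/a.pdf",
    #        "dest_disk": "SMT", "target_path": "docs/a.pdf", "size": 1048576}
    #     ]
    #   }
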
    def verify_operation(self, source: Path, dest: Path) -> bool:
        """Verify file was copied correctly (size + optional checksum)"""
        if not dest.exists():
            return False

        try:
            source_stat = source.stat()
            dest_stat = dest.stat()

            if source_stat.st_size != dest_stat.st_size:
                return False

            # Optional: checksum verification for critical files
            # if source_stat.st_size < 100*1024*1024:  # Only for files < 100MB
            #     return self.file_checksum(source) == self.file_checksum(dest)

            return True
        except Exception as e:
            logger.error(f"Verification error: {e}")
            return False

    @staticmethod
    def file_checksum(path: Path) -> str:
        """Calculate MD5 checksum of file"""
        hash_md5 = hashlib.md5()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def execute_migration(self, plan_file: str, dry_run: bool = True):
        """
        Execute migration plan
        :param plan_file: Path to plan JSON file
        :param dry_run: If True, only simulate operations
        """
        logger.info(f"{'DRY RUN' if dry_run else 'EXECUTING'} migration from {plan_file}")

        with open(plan_file, 'r') as f:
            plan = json.load(f)

        operations = plan['operations']
        logger.info(f"Processing {len(operations)} operations...")

        success_count = 0
        error_count = 0
        start_time = time.time()

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            for i, op in enumerate(operations, 1):
                source_disk = op['source_disk']
                source_path = op['source_path']
                dest_disk = op['dest_disk']
                target_path = op['target_path']

                source_full = Path(source_disk) / source_path
                dest_full = Path(dest_disk) / target_path

                # Dynamic progress display
                elapsed = time.time() - start_time
                rate = i / elapsed if elapsed > 0 else 0
                eta = (len(operations) - i) / rate if rate > 0 else 0
                display_path = str(source_path)
                if len(display_path) > 50:
                    display_path = '...' + display_path[-47:]

                print(f"\r[{i}/{len(operations)}] {success_count} OK, {error_count} ERR | {rate:.1f} files/s | ETA: {int(eta)}s | {display_path}", end='', flush=True)

                if dry_run:
                    # Simulate
                    if source_full.exists():
                        success_count += 1
                    else:
                        logger.warning(f"\n Source does not exist: {source_full}")
                        error_count += 1
                    continue

                try:
                    # Create destination directory
                    dest_full.parent.mkdir(parents=True, exist_ok=True)

                    # Move file (copy + verify + delete)
                    if source_full.exists():
                        # Copy with metadata
                        shutil.copy2(source_full, dest_full)

                        # Verify
                        if self.verify_operation(source_full, dest_full):
                            # Update database
                            cursor.execute(
                                "UPDATE files SET disk_label = %s, status = 'moved' WHERE path = %s AND disk_label = %s",
                                (dest_disk, source_path, source_disk)
                            )

                            # Safe delete (could be made optional)
                            # source_full.unlink()

                            # Log operation as executed
                            cursor.execute(
                                "UPDATE operations SET executed = 1, executed_at = CURRENT_TIMESTAMP WHERE source_path = %s",
                                (f"{source_disk}:{source_path}",)
                            )

                            success_count += 1
                        else:
                            raise Exception("Verification failed")
                    else:
                        logger.warning(f"\n Source missing: {source_full}")
                        error_count += 1

                except Exception as e:
                    logger.error(f"\n Error processing {source_path}: {e}")
                    cursor.execute(
                        "UPDATE operations SET error = %s WHERE source_path = %s",
                        (str(e), f"{source_disk}:{source_path}")
                    )
                    error_count += 1

                # Commit every 10 operations
                if i % 10 == 0:
                    conn.commit()

            conn.commit()
            print()  # New line after progress display

        finally:
            cursor.close()
            conn.close()

        logger.info(f"Migration complete: {success_count} success, {error_count} errors")

        if not dry_run and error_count == 0:
            logger.info(f"✓ Disk {plan['target_disk']} is ready for Linux installation!")
            logger.info(f" Remember to safely delete original files from {plan['target_disk']}")

    def run_deduplication(self, disk: Optional[str] = None, use_chunks: bool = True):
        """Compute missing checksums (Phase 1) and mark duplicate files (Phase 2).

        :param disk: Optionally restrict hashing to a single disk label
        :param use_chunks: Reserved for chunk-level deduplication (currently unused)
        """
        logger.info(f"Starting deduplication{' for disk ' + disk if disk else ''}")

        disk_mount_map = {
            'SMT': '/media/mike/SMT',
            'DISK1': '/media/mike/DISK1',
            'LLM': '/media/mike/LLM'
        }

        conn = self.get_connection()
        cursor = conn.cursor()

        def hash_file_local(file_path: Path) -> str:
            hasher = hashlib.sha256()
            with open(file_path, 'rb') as f:
                while chunk := f.read(65536):
                    hasher.update(chunk)
            return hasher.hexdigest()

        try:
            if disk:
                cursor.execute("SELECT path, size, disk_label FROM files WHERE disk_label = %s AND checksum IS NULL ORDER BY size DESC", (disk,))
            else:
                cursor.execute("SELECT path, size, disk_label FROM files WHERE checksum IS NULL ORDER BY size DESC")

            files_to_process = cursor.fetchall()
            total = len(files_to_process)
            logger.info(f"Found {total} files to hash")

            processed = 0
            skipped = 0
            start_time = time.time()
            batch = []

            print("Phase 1: Computing checksums...")

            for idx, (path_str, size, disk_label) in enumerate(files_to_process, 1):
                try:
                    mount_point = disk_mount_map.get(disk_label, disk_label)
                    full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)

                    if not full_path.exists():
                        skipped += 1
                        if idx % 100 == 0:
                            elapsed = time.time() - start_time
                            rate = (processed + skipped) / elapsed if elapsed > 0 else 0
                            remaining = (total - idx) / rate if rate > 0 else 0
                            pct = 100 * idx / total
                            print(f"\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining/60)}m{int(remaining%60):02d}s | Skip: {skipped:,}", end='', flush=True)
                        continue

                    checksum = hash_file_local(full_path)
                    batch.append((checksum, path_str))

                    processed += 1
                    if len(batch) >= 1000:
                        try:
                            cursor.executemany("UPDATE files SET checksum = %s WHERE path = %s", batch)
                            conn.commit()
                            batch.clear()
                        except Exception as e:
                            conn.rollback()
                            batch.clear()
                            print(f"\nBatch update failed: {e}")

                    if idx % 100 == 0:
                        elapsed = time.time() - start_time
                        rate = (processed + skipped) / elapsed if elapsed > 0 else 0
                        remaining = (total - idx) / rate if rate > 0 else 0
                        pct = 100 * idx / total
                        print(f"\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining/60)}m{int(remaining%60):02d}s | Skip: {skipped:,}", end='', flush=True)

                except Exception as e:
                    skipped += 1
                    if idx <= 5:
                        print(f"\nDebug: {full_path} - {e}")

            if batch:
                try:
                    cursor.executemany("UPDATE files SET checksum = %s WHERE path = %s", batch)
                    conn.commit()
                except Exception as e:
                    conn.rollback()
                    print(f"\nFinal batch failed: {e}")

            print()
            elapsed = time.time() - start_time
            logger.info(f"Phase 1 done: {processed:,} files in {int(elapsed/60)}m{int(elapsed%60):02d}s ({skipped:,} skipped)")

            print("Phase 2: Finding duplicates...")
            # Each file whose checksum matches a lexicographically smaller path
            # is marked as a duplicate of the smallest such path.
            cursor.execute("""
                UPDATE files f1 SET duplicate_of = (
                    SELECT MIN(path) FROM files f2
                    WHERE f2.checksum = f1.checksum AND f2.path < f1.path
                )
                WHERE checksum IS NOT NULL
            """)
            conn.commit()

            cursor.execute("SELECT COUNT(*) FROM files WHERE duplicate_of IS NOT NULL")
            dup_count = cursor.fetchone()[0]
            logger.info(f"Phase 2 done: Found {dup_count:,} duplicates")

        finally:
            cursor.close()
            conn.close()

    def plan_merge(self, sources: List[str], target: str, output_file: str,
                   filter_system: bool = False, network_target: str = None):
        """Plan merge of multiple source disks to target with deduplication"""
        logger.info(f"Planning merge: {', '.join(sources)} → {target or network_target}")

        if filter_system:
            sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            from filters import GitignoreFilter
            file_filter = GitignoreFilter()
            logger.info("System/build file filtering enabled")

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            placeholders = ','.join(['%s'] * len(sources))
            cursor.execute(f"""
                SELECT path, size, checksum, disk_label, duplicate_of
                FROM files
                WHERE disk_label IN ({placeholders})
                ORDER BY size DESC
            """, tuple(sources))

            files = cursor.fetchall()
            total_files = len(files)
            total_size = sum(int(f[1]) for f in files)

            unique_files = {}
            duplicate_count = 0
            duplicate_size = 0
            filtered_count = 0
            filtered_size = 0

            for path, size, checksum, disk_label, duplicate_of in files:
                if filter_system and file_filter.should_exclude(path):
                    filtered_count += 1
                    filtered_size += int(size)
                    continue

                if checksum and checksum in unique_files:
                    duplicate_count += 1
                    duplicate_size += int(size)
                else:
                    # Files without a checksum are skipped entirely (run dedupe first)
                    if checksum:
                        unique_files[checksum] = (path, int(size), disk_label)

            unique_count = len(unique_files)
            unique_size = sum(f[1] for f in unique_files.values())

            plan = {
                'sources': sources,
                'target': target or network_target,
                'network': network_target is not None,
                'total_files': total_files,
                'total_size': total_size,
                'unique_files': unique_count,
                'unique_size': unique_size,
                'duplicate_files': duplicate_count,
                'duplicate_size': duplicate_size,
                'filtered_files': filtered_count if filter_system else 0,
                'filtered_size': filtered_size if filter_system else 0,
                'space_saved': duplicate_size + (filtered_size if filter_system else 0),
                'operations': []
            }

            for checksum, (path, size, disk_label) in unique_files.items():
                plan['operations'].append({
                    'source_disk': disk_label,
                    'source_path': path,
                    'target_disk': target or network_target,
                    'target_path': path,
                    'size': size,
                    'checksum': checksum
                })

            with open(output_file, 'w') as f:
                json.dump(plan, f, indent=2)

            logger.info(f"Merge plan saved to {output_file}")
            print("\n=== MERGE PLAN SUMMARY ===")
            print(f"Sources: {', '.join(sources)}")
            print(f"Target: {target or network_target}")
            print(f"Total files: {total_files:,} ({self.format_size(total_size)})")
            if filter_system:
                print(f"Filtered (system/build): {filtered_count:,} ({self.format_size(filtered_size)})")
            print(f"Unique files: {unique_count:,} ({self.format_size(unique_size)})")
            print(f"Duplicates: {duplicate_count:,} ({self.format_size(duplicate_size)})")
            print(f"Total space saved: {self.format_size(plan['space_saved'])}")
            print(f"Space needed on target: {self.format_size(unique_size)}")

        finally:
            cursor.close()
            conn.close()

    def generate_report(self, format='text', show_duplicates=False, preview_merge=None):
        """Generate status report (the format argument is accepted but currently unused)."""
        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            if preview_merge:
                # Load and display merge plan
                with open(preview_merge, 'r') as f:
                    plan = json.load(f)

                print("\n=== MERGE PLAN PREVIEW ===")
                print(f"Sources: {', '.join(plan['sources'])}")
                print(f"Target: {plan['target']}")
                print(f"Total files: {plan['total_files']:,} ({self.format_size(plan['total_size'])})")
                print(f"Unique files: {plan['unique_files']:,} ({self.format_size(plan['unique_size'])})")
                print(f"Duplicates: {plan['duplicate_files']:,} ({self.format_size(plan['duplicate_size'])})")
                print(f"Space saved: {self.format_size(plan['space_saved'])}")
                print(f"Space needed on target: {self.format_size(plan['unique_size'])}")
                return

            cursor.execute("""
                SELECT status, COUNT(*), SUM(size) FROM files GROUP BY status
            """)

            print("\n=== FILE MIGRATION REPORT ===")
            for row in cursor.fetchall():
                status, count, size = row
                print(f"{status:15}: {count:6} files, {self.format_size(int(size or 0))}")

            # Disk usage summary
            cursor.execute("""
                SELECT disk_label, COUNT(*), SUM(size) FROM files GROUP BY disk_label
            """)

            print("\n=== DISK USAGE ===")
            for row in cursor.fetchall():
                disk, count, size = row
                print(f"{disk:20}: {count:6} files, {self.format_size(int(size or 0))}")

            # Deduplication stats
            cursor.execute("""
                SELECT COUNT(*), SUM(size) FROM files WHERE checksum IS NOT NULL
            """)
            hashed_count, hashed_size = cursor.fetchone()

            cursor.execute("""
                SELECT COUNT(*), SUM(size) FROM files WHERE duplicate_of IS NOT NULL
            """)
            dup_count, dup_size = cursor.fetchone()

            print("\n=== DEDUPLICATION STATS ===")
            print(f"Files with checksums: {hashed_count or 0:6}")
            print(f"Duplicate files: {dup_count or 0:6} ({self.format_size(int(dup_size or 0))})")

            if show_duplicates and dup_count:
                print("\n=== DUPLICATE FILES ===")
                cursor.execute("""
                    SELECT path, size, duplicate_of FROM files
                    WHERE duplicate_of IS NOT NULL
                    ORDER BY size DESC
                    LIMIT 20
                """)
                for path, size, dup_of in cursor.fetchall():
                    print(f"  {path} ({self.format_size(int(size))}) → {dup_of}")

            cursor.execute("""
                SELECT operation_type, executed, verified, COUNT(*) FROM operations GROUP BY operation_type, executed, verified
            """)

            print("\n=== OPERATIONS REPORT ===")
            for row in cursor.fetchall():
                op_type, executed, verified, count = row
                status = "EXECUTED" if executed else "PENDING"
                if verified:
                    status += "+VERIFIED"
                print(f"{op_type:10} {status:15}: {count} operations")

        finally:
            cursor.close()
            conn.close()

    def profile_content(self, disk: Optional[str] = None, update_db: bool = False, limit: Optional[int] = None):
        """Profile file contents (kind, processability) and optionally store profiles in metadata."""
        from content.profiler import ContentProfiler

        profiler = ContentProfiler()
        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            query = "SELECT path, size, disk_label FROM files WHERE 1=1"
            params = []
            if disk:
                query += " AND disk_label = %s"
                params.append(disk)
            if limit:
                query += f" LIMIT {limit}"

            cursor.execute(query, params)
            files = cursor.fetchall()
            total = len(files)
            logger.info(f"Profiling {total:,} files...")

            kind_stats = {}
            processable = 0
            batch = []

            for idx, (path, size, disk_label) in enumerate(files, 1):
                mount_point = disk_mount_map.get(disk_label, disk_label)
                full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)

                if not full_path.exists():
                    continue

                profile = profiler.profile_file(full_path)

                if 'error' not in profile:
                    kind = profile['kind']
                    if kind not in kind_stats:
                        kind_stats[kind] = {'count': 0, 'processable': 0}
                    kind_stats[kind]['count'] += 1
                    if profile['processable']:
                        kind_stats[kind]['processable'] += 1
                        processable += 1

                    if update_db:
                        profile_json = json.dumps(profile)
                        batch.append((kind, profile_json, path))

                        if len(batch) >= 500:
                            cursor.executemany(
                                "UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s",
                                [(pj, p) for k, pj, p in batch]
                            )
                            conn.commit()
                            batch.clear()

                if idx % 100 == 0:
                    print(f"\rProfiled: {idx:,}/{total:,}", end='', flush=True)

            if update_db and batch:
                cursor.executemany(
                    "UPDATE files SET metadata = jsonb_set(COALESCE(metadata, '{}'::jsonb), '{profile}', %s::jsonb) WHERE path = %s",
                    [(pj, p) for k, pj, p in batch]
                )
                conn.commit()

            print()
            print("\n=== CONTENT PROFILE SUMMARY ===")
            print(f"Total files: {total:,}")
            print(f"Processable: {processable:,}\n")
            print(f"{'Kind':<15} {'Total':<10} {'Processable':<12} {'Extractor'}")
            print("-" * 60)
            for kind in sorted(kind_stats.keys()):
                stats = kind_stats[kind]
                extractor = profiler._suggest_extractor(kind, '')
                print(f"{kind:<15} {stats['count']:<10,} {stats['processable']:<12,} {extractor or 'none'}")

        finally:
            cursor.close()
            conn.close()

    def extract_content(self, kind: Optional[str] = None, limit: int = 10):
        """Extract content from files previously marked processable by the profiler."""
        from content.profiler import ContentProfiler
        from content.extractors import ContentExtractor

        profiler = ContentProfiler()
        extractor = ContentExtractor()
        disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            query = "SELECT path, size, disk_label, metadata FROM files WHERE metadata->'profile'->>'processable' = 'true'"
            params = []
            if kind:
                query += " AND metadata->'profile'->>'kind' = %s"
                params.append(kind)
            query += f" LIMIT {limit}"

            cursor.execute(query, params)
            files = cursor.fetchall()

            print("\n=== EXTRACTING CONTENT ===")
            print(f"Processing {len(files)} files\n")

            for path, size, disk_label, metadata in files:
                mount_point = disk_mount_map.get(disk_label, disk_label)
                full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)

                if not full_path.exists():
                    continue

                profile = metadata.get('profile', {}) if metadata else {}
                extractor_type = profile.get('extractor')

                if not extractor_type:
                    continue

                print(f"Extracting: {path}")
                print(f" Type: {profile.get('kind')} | Extractor: {extractor_type}")

                result = extractor.extract(full_path, extractor_type)

                if 'text' in result:
                    preview = result['text'][:200]
                    print(f" Preview: {preview}...")
                elif 'pipeline' in result:
                    print(f" Pipeline: {' → '.join(result['pipeline'])}")
                    print(f" Status: {result.get('status', 'pending')}")

                print()

        finally:
            cursor.close()
            conn.close()

    def classify_files(self, disk: Optional[str] = None, update_db: bool = False):
        """Classify indexed files into categories and optionally persist the category."""
        from classification.classifier import FileClassifier

        classifier = FileClassifier()
        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            if disk:
                cursor.execute("SELECT path, size, disk_label FROM files WHERE disk_label = %s", (disk,))
            else:
                cursor.execute("SELECT path, size, disk_label FROM files")

            files = cursor.fetchall()
            total = len(files)
            logger.info(f"Classifying {total:,} files...")

            categories = {}
            build_artifacts = 0
            batch = []

            for idx, (path, size, disk_label) in enumerate(files, 1):
                labels, category, is_build = classifier.classify_path(path, int(size))

                if is_build:
                    build_artifacts += 1

                if category not in categories:
                    categories[category] = {'count': 0, 'size': 0}
                categories[category]['count'] += 1
                categories[category]['size'] += int(size)

                if update_db:
                    labels_str = ','.join(labels)
                    # Note: only the category is persisted below; labels_str is not written to the database
                    batch.append((category, labels_str, path))

                    if len(batch) >= 1000:
                        cursor.executemany("UPDATE files SET category = %s WHERE path = %s", [(cat, p) for cat, lbl, p in batch])
                        conn.commit()
                        batch.clear()

                if idx % 1000 == 0:
                    print(f"\rClassified: {idx:,}/{total:,}", end='', flush=True)

            if update_db and batch:
                cursor.executemany("UPDATE files SET category = %s WHERE path = %s", [(cat, p) for cat, lbl, p in batch])
                conn.commit()

            print()
            print("\n=== CLASSIFICATION SUMMARY ===")
            print(f"Total files: {total:,}")
            print(f"Build artifacts: {build_artifacts:,}")
            print("\nCategories:")
            for category in sorted(categories.keys()):
                info = categories[category]
                print(f"  {category:30}: {info['count']:8,} files, {self.format_size(info['size'])}")

        finally:
            cursor.close()
            conn.close()

    def review_migration(self, category: Optional[str] = None, show_build: bool = False):
        """Preview the proposed target structure for up to 100 indexed files."""
        from classification.classifier import FileClassifier

        classifier = FileClassifier()
        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            query = "SELECT path, size, category FROM files WHERE 1=1"
            params = []

            if category:
                query += " AND category = %s"
                params.append(category)

            if not show_build:
                # Literal % signs are doubled because the query is passed to
                # cursor.execute() together with a parameter list
                query += " AND (metadata->>'labels' IS NULL OR metadata->>'labels' NOT LIKE '%%build-artifact%%')"

            query += " ORDER BY category, size DESC LIMIT 100"

            cursor.execute(query, params)
            files = cursor.fetchall()

            if not files:
                print("No files found matching criteria")
                return

            print("\n=== MIGRATION PREVIEW ===")
            print(f"Showing {len(files)} files\n")

            current_category = None
            for path, size, cat in files:
                if cat != current_category:
                    current_category = cat
                    print(f"\n{cat}:")

                labels, suggested_cat, is_build = classifier.classify_path(path, int(size))
                target = classifier.suggest_target_path(path, suggested_cat, labels)
                print(f"  {path}")
                print(f"    → {target} ({self.format_size(int(size))})")

        finally:
            cursor.close()
            conn.close()

    @staticmethod
    def format_size(size: int) -> str:
        """Format a byte count as a human-readable string, e.g. 1536 -> '1.5KB'."""
        for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
            if size < 1024:
                return f"{size:.1f}{unit}"
            size /= 1024
        return f"{size:.1f}PB"

def main():
    parser = argparse.ArgumentParser(description='Disk Reorganizer - Free up a disk for Linux dual-boot')
    subparsers = parser.add_subparsers(dest='command', required=True)

    # Index command
    index_parser = subparsers.add_parser('index', help='Index files on a disk')
    index_parser.add_argument('disk_root', help='Root path of disk (e.g., D:\\\\)')
    index_parser.add_argument('disk_name', help='Logical name for the disk')

    # Plan command
    plan_parser = subparsers.add_parser('plan', help='Create migration plan')
    plan_parser.add_argument('target_disk', help='Disk to free up')
    plan_parser.add_argument('dest_disks', nargs='+', help='Destination disks')

    # Execute command
    exec_parser = subparsers.add_parser('execute', help='Execute migration plan')
    exec_parser.add_argument('plan_file', help='Path to plan JSON file')
    exec_parser.add_argument('--dry-run', action='store_true', help='Simulate without actual file operations')

    # Dedupe command
    dedupe_parser = subparsers.add_parser('dedupe', help='Deduplicate files and compute checksums')
    dedupe_parser.add_argument('--disk', help='Optional: Only dedupe specific disk')
    dedupe_parser.add_argument('--no-chunks', action='store_true', help='Disable chunk-level deduplication')

    # Merge command
    merge_parser = subparsers.add_parser('merge', help='Plan multi-disk merge with deduplication')
    merge_parser.add_argument('--sources', nargs='+', required=True, help='Source disks to merge')
    merge_parser.add_argument('--target', required=True, help='Target disk')
    merge_parser.add_argument('--output', default='merge_plan.json', help='Output plan file')
    merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
    merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')

    # Profile command
    profile_parser = subparsers.add_parser('profile', help='Create content profiles (inventory + triage)')
    profile_parser.add_argument('--disk', help='Profile specific disk')
    profile_parser.add_argument('--update', action='store_true', help='Update database with profiles')
    profile_parser.add_argument('--limit', type=int, help='Limit number of files')

    # Extract command
    extract_parser = subparsers.add_parser('extract', help='Extract content from files')
    extract_parser.add_argument('--kind', help='Extract specific kind (pdf, image, audio, video)')
    extract_parser.add_argument('--limit', type=int, default=10, help='Limit extraction batch')

    # Classify command
    classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization')
    classify_parser.add_argument('--disk', help='Classify specific disk')
    classify_parser.add_argument('--update', action='store_true', help='Update database with classifications')

    # Review command
    review_parser = subparsers.add_parser('review', help='Review proposed migration structure')
    review_parser.add_argument('--category', help='Review specific category')
    review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts')

    # Report command
    report_parser = subparsers.add_parser('report', help='Show current status')
    report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
    report_parser.add_argument('--show-duplicates', action='store_true', help='Show duplicate files')
    report_parser.add_argument('--preview-merge', help='Preview merge plan from file')

    args = parser.parse_args()
    tool = DiskReorganizer()

    if args.command == 'index':
        tool.index_disk(args.disk_root, args.disk_name)

    elif args.command == 'dedupe':
        tool.run_deduplication(disk=args.disk, use_chunks=not args.no_chunks)

    elif args.command == 'merge':
        tool.plan_merge(sources=args.sources, target=args.target, output_file=args.output,
                        filter_system=args.filter_system, network_target=args.network)

    elif args.command == 'plan':
        plan = tool.plan_migration(args.target_disk, args.dest_disks)
        if plan:
            print(f"\nPlan generated: {plan['file_count']} files, {tool.format_size(plan['total_size'])}")
            print(f"Destination disks: {', '.join(plan['destination_disks'])}")

    elif args.command == 'execute':
        tool.execute_migration(args.plan_file, dry_run=args.dry_run)

    elif args.command == 'profile':
        tool.profile_content(disk=args.disk, update_db=args.update, limit=args.limit)

    elif args.command == 'extract':
        tool.extract_content(kind=args.kind, limit=args.limit)

    elif args.command == 'classify':
        tool.classify_files(disk=args.disk, update_db=args.update)

    elif args.command == 'review':
        tool.review_migration(category=args.category, show_build=args.show_build)

    elif args.command == 'report':
        tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)


if __name__ == '__main__':
    main()