app/filters/gitignore.py (new file, 30 lines)
@@ -0,0 +1,30 @@
from pathlib import Path
from typing import Optional, Set
import fnmatch

# Directory trees and file types that are safe to skip when merging disks:
# package caches, build output, VCS metadata, and editor/OS artifacts.
DEFAULT_PATTERNS = {
    'node_modules/**', '__pycache__/**', '.git/**', 'build/**', 'dist/**',
    '.cache/**', 'target/**', 'vendor/**', '.venv/**', 'venv/**',
    '*.pyc', '*.pyo', '*.so', '*.dll', '*.dylib', '*.o', '*.a',
    '.DS_Store', 'Thumbs.db', '.pytest_cache/**', '.tox/**',
    '*.egg-info/**', '.mypy_cache/**', '.coverage', 'htmlcov/**',
    '.gradle/**', 'bin/**', 'obj/**', '.vs/**', '.idea/**'
}


class GitignoreFilter:
    def __init__(self, patterns: Optional[Set[str]] = None):
        self.patterns = patterns or DEFAULT_PATTERNS

    def should_exclude(self, path: str) -> bool:
        path_obj = Path(path)
        for pattern in self.patterns:
            if '**' in pattern:
                # Directory pattern: match the '**'-stripped stem against each
                # path component (fnmatch, so globs like '*.egg-info' work too).
                clean_pattern = pattern.replace('/**', '').replace('**/', '')
                if any(fnmatch.fnmatch(part, clean_pattern) for part in path_obj.parts):
                    return True
            elif fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(path_obj.name, pattern):
                return True
        return False

    def filter_files(self, files: list) -> list:
        return [f for f in files if not self.should_exclude(f)]
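For context, a minimal usage sketch (the file paths and the exact import path are illustrative; plan_merge below imports it as `from filters import GitignoreFilter`):

    from filters.gitignore import GitignoreFilter  # assumed import path, given app/filters/gitignore.py

    f = GitignoreFilter()
    candidates = [
        'projects/api/src/server.py',            # kept
        'projects/api/node_modules/x/index.js',  # excluded: node_modules/**
        'projects/api/src/util.pyc',             # excluded: *.pyc
    ]
    print(f.filter_files(candidates))  # -> ['projects/api/src/server.py']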
app/main.py (257 lines changed)
@@ -1,33 +1,23 @@
#!/usr/bin/env python3
"""
Disk Reorganizer - Safely restructure files across disks to free up one entire disk.
Three modes: index, plan, execute
"""

import os
import sys
import psycopg2
import shutil
import hashlib
import argparse
import json
from pathlib import Path
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional, Tuple
from datetime import datetime
import logging
import time

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('disk_reorganizer.log'), logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)

(This hunk also drops the `from psycopg2 import sql` and `from psycopg2.extras import RealDictCursor` imports, widens the `dataclasses` and `typing` imports to pull in `asdict` and `Tuple`, and collapses the multi-line `handlers` list to one line.)
@@ -430,12 +420,186 @@ class DiskReorganizer:
            logger.info(f"✓ Disk {plan['target_disk']} is ready for Linux installation!")
            logger.info(f"  Remember to safely delete original files from {plan['target_disk']}")

    def run_deduplication(self, disk: Optional[str] = None, use_chunks: bool = True):
        """Compute SHA-256 checksums for unhashed files and record duplicates."""
        logger.info(f"Starting deduplication{' for disk ' + disk if disk else ''}")

        disk_mount_map = {
            'SMT': '/media/mike/SMT',
            'DISK1': '/media/mike/DISK1',
            'LLM': '/media/mike/LLM'
        }

        conn = self.get_connection()
        cursor = conn.cursor()

        def hash_file_local(file_path: Path) -> str:
            # Stream the file in 64 KiB chunks so large files don't exhaust memory.
            hasher = hashlib.sha256()
            with open(file_path, 'rb') as f:
                while chunk := f.read(65536):
                    hasher.update(chunk)
            return hasher.hexdigest()

        try:
            if disk:
                cursor.execute("SELECT path, size, disk_label FROM files WHERE disk_label = %s AND checksum IS NULL ORDER BY size DESC", (disk,))
            else:
                cursor.execute("SELECT path, size, disk_label FROM files WHERE checksum IS NULL ORDER BY size DESC")

            files_to_process = cursor.fetchall()
            total = len(files_to_process)
            logger.info(f"Found {total} files to hash")

            processed = 0
            skipped = 0
            for path_str, size, disk_label in files_to_process:
                try:
                    mount_point = disk_mount_map.get(disk_label, disk_label)
                    full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)

                    if not full_path.exists():
                        skipped += 1
                        continue

                    checksum = hash_file_local(full_path)

                    # The first file seen with this checksum stays canonical;
                    # later paths point back to it via duplicate_of.
                    cursor.execute("SELECT path FROM files WHERE checksum = %s AND path != %s LIMIT 1", (checksum, path_str))
                    dup_row = cursor.fetchone()
                    duplicate_of = dup_row[0] if dup_row else None

                    cursor.execute("UPDATE files SET checksum = %s, duplicate_of = %s WHERE path = %s", (checksum, duplicate_of, path_str))

                    processed += 1
                    if processed % 100 == 0:
                        conn.commit()
                        print(f"\rProcessed: {processed}/{total} ({skipped} skipped)", end='', flush=True)

                except Exception as e:
                    logger.debug(f"Skipping {path_str}: {e}")  # don't swallow errors silently
                    skipped += 1
                    conn.rollback()

            conn.commit()
            print()
            logger.info(f"Deduplication complete: {processed}/{total} files processed, {skipped} skipped")

        finally:
            cursor.close()
            conn.close()
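Once checksums are populated, the biggest duplicate groups can be pulled straight from the database. A minimal sketch, assuming the same `files` schema used above (the DSN is hypothetical; the real settings live in `get_connection()`, and this query is not part of the commit itself):

    import psycopg2

    conn = psycopg2.connect('dbname=reorganizer')  # hypothetical DSN
    cur = conn.cursor()
    # Same checksum on more than one path = one duplicate group; largest first.
    cur.execute("""
        SELECT checksum, COUNT(*) AS copies, SUM(size) AS bytes
        FROM files
        WHERE checksum IS NOT NULL
        GROUP BY checksum
        HAVING COUNT(*) > 1
        ORDER BY SUM(size) DESC
        LIMIT 10
    """)
    for checksum, copies, total in cur.fetchall():
        print(f'{checksum[:12]}…  ×{copies}  {total} bytes')
    cur.close()
    conn.close()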
    def plan_merge(self, sources: List[str], target: str, output_file: str,
                   filter_system: bool = False, network_target: Optional[str] = None):
        """Plan merge of multiple source disks to target with deduplication"""
        logger.info(f"Planning merge: {', '.join(sources)} → {target or network_target}")

        if filter_system:
            sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            from filters import GitignoreFilter
            file_filter = GitignoreFilter()
            logger.info("System/build file filtering enabled")

        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            placeholders = ','.join(['%s'] * len(sources))
            cursor.execute(f"""
                SELECT path, size, checksum, disk_label, duplicate_of
                FROM files
                WHERE disk_label IN ({placeholders})
                ORDER BY size DESC
            """, tuple(sources))

            files = cursor.fetchall()
            total_files = len(files)
            total_size = sum(int(f[1]) for f in files)

            unique_files = {}
            duplicate_count = 0
            duplicate_size = 0
            filtered_count = 0
            filtered_size = 0

            for path, size, checksum, disk_label, duplicate_of in files:
                if filter_system and file_filter.should_exclude(path):
                    filtered_count += 1
                    filtered_size += int(size)
                    continue

                if checksum and checksum in unique_files:
                    duplicate_count += 1
                    duplicate_size += int(size)
                else:
                    # Note: files without a checksum are neither counted as
                    # duplicates nor added to the plan; run dedupe first so
                    # every file is hashed before planning a merge.
                    if checksum:
                        unique_files[checksum] = (path, int(size), disk_label)

            unique_count = len(unique_files)
            unique_size = sum(f[1] for f in unique_files.values())

            plan = {
                'sources': sources,
                'target': target or network_target,
                'network': network_target is not None,
                'total_files': total_files,
                'total_size': total_size,
                'unique_files': unique_count,
                'unique_size': unique_size,
                'duplicate_files': duplicate_count,
                'duplicate_size': duplicate_size,
                'filtered_files': filtered_count if filter_system else 0,
                'filtered_size': filtered_size if filter_system else 0,
                'space_saved': duplicate_size + (filtered_size if filter_system else 0),
                'operations': []
            }

            for checksum, (path, size, disk_label) in unique_files.items():
                plan['operations'].append({
                    'source_disk': disk_label,
                    'source_path': path,
                    'target_disk': target or network_target,
                    'target_path': path,
                    'size': size,
                    'checksum': checksum
                })

            with open(output_file, 'w') as f:
                json.dump(plan, f, indent=2)

            logger.info(f"Merge plan saved to {output_file}")
            print("\n=== MERGE PLAN SUMMARY ===")
            print(f"Sources: {', '.join(sources)}")
            print(f"Target: {target or network_target}")
            print(f"Total files: {total_files:,} ({self.format_size(total_size)})")
            if filter_system:
                print(f"Filtered (system/build): {filtered_count:,} ({self.format_size(filtered_size)})")
            print(f"Unique files: {unique_count:,} ({self.format_size(unique_size)})")
            print(f"Duplicates: {duplicate_count:,} ({self.format_size(duplicate_size)})")
            print(f"Total space saved: {self.format_size(plan['space_saved'])}")
            print(f"Space needed on target: {self.format_size(unique_size)}")

        finally:
            cursor.close()
            conn.close()
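A direct call equivalent to the CLI wiring added further down, using the disk labels from disk_mount_map (the output file name is illustrative):

    tool = DiskReorganizer()
    # Fold SMT and DISK1 into LLM, dropping build artifacts via GitignoreFilter.
    tool.plan_merge(
        sources=['SMT', 'DISK1'],
        target='LLM',
        output_file='merge_plan.json',
        filter_system=True,
    )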
    def generate_report(self, format='text', show_duplicates=False, preview_merge=None):
        """Generate status report"""
        conn = self.get_connection()
        cursor = conn.cursor()

        try:
            if preview_merge:
                # Load and display a saved merge plan instead of querying the DB
                with open(preview_merge, 'r') as f:
                    plan = json.load(f)

                print("\n=== MERGE PLAN PREVIEW ===")
                print(f"Sources: {', '.join(plan['sources'])}")
                print(f"Target: {plan['target']}")
                print(f"Total files: {plan['total_files']:,} ({self.format_size(plan['total_size'])})")
                print(f"Unique files: {plan['unique_files']:,} ({self.format_size(plan['unique_size'])})")
                print(f"Duplicates: {plan['duplicate_files']:,} ({self.format_size(plan['duplicate_size'])})")
                print(f"Space saved: {self.format_size(plan['space_saved'])}")
                print(f"Space needed on target: {self.format_size(plan['unique_size'])}")
                return

            cursor.execute("""
                SELECT status, COUNT(*), SUM(size) FROM files GROUP BY status
            """)
@@ -443,7 +607,43 @@ class DiskReorganizer:
            print("\n=== FILE MIGRATION REPORT ===")
            for row in cursor.fetchall():
                status, count, size = row
                print(f"{status:15}: {count:6} files, {self.format_size(int(size or 0))}")

            # Disk usage summary
            cursor.execute("""
                SELECT disk_label, COUNT(*), SUM(size) FROM files GROUP BY disk_label
            """)

            print("\n=== DISK USAGE ===")
            for row in cursor.fetchall():
                disk, count, size = row
                print(f"{disk:20}: {count:6} files, {self.format_size(int(size or 0))}")

            # Deduplication stats
            cursor.execute("""
                SELECT COUNT(*), SUM(size) FROM files WHERE checksum IS NOT NULL
            """)
            hashed_count, hashed_size = cursor.fetchone()

            cursor.execute("""
                SELECT COUNT(*), SUM(size) FROM files WHERE duplicate_of IS NOT NULL
            """)
            dup_count, dup_size = cursor.fetchone()

            print("\n=== DEDUPLICATION STATS ===")
            print(f"Files with checksums: {hashed_count or 0:6}")
            print(f"Duplicate files: {dup_count or 0:6} ({self.format_size(int(dup_size or 0))})")

            if show_duplicates and dup_count:
                print("\n=== DUPLICATE FILES ===")
                cursor.execute("""
                    SELECT path, size, duplicate_of FROM files
                    WHERE duplicate_of IS NOT NULL
                    ORDER BY size DESC
                    LIMIT 20
                """)
                for path, size, dup_of in cursor.fetchall():
                    print(f"  {path} ({self.format_size(int(size))}) → {dup_of}")

            cursor.execute("""
                SELECT operation_type, executed, verified, COUNT(*) FROM operations GROUP BY operation_type, executed, verified
            """)
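Two illustrative calls into the report paths above (wired to the CLI in main() below):

    tool = DiskReorganizer()
    tool.generate_report(show_duplicates=True)             # full status plus the 20 largest duplicates
    tool.generate_report(preview_merge='merge_plan.json')  # summarize a saved plan and return early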
@@ -489,8 +689,24 @@ def main():
    exec_parser.add_argument('plan_file', help='Path to plan JSON file')
    exec_parser.add_argument('--dry-run', action='store_true', help='Simulate without actual file operations')

    # Dedupe command
    dedupe_parser = subparsers.add_parser('dedupe', help='Deduplicate files and compute checksums')
    dedupe_parser.add_argument('--disk', help='Only dedupe the specified disk')
    dedupe_parser.add_argument('--no-chunks', action='store_true', help='Disable chunk-level deduplication')

    # Merge command
    merge_parser = subparsers.add_parser('merge', help='Plan multi-disk merge with deduplication')
    merge_parser.add_argument('--sources', nargs='+', required=True, help='Source disks to merge')
    merge_parser.add_argument('--target', required=True, help='Target disk')
    merge_parser.add_argument('--output', default='merge_plan.json', help='Output plan file')
    merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
    merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')

    # Report command
    report_parser = subparsers.add_parser('report', help='Show current status')
    report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
    report_parser.add_argument('--show-duplicates', action='store_true', help='Show duplicate files')
    report_parser.add_argument('--preview-merge', help='Preview merge plan from file')

    args = parser.parse_args()
    tool = DiskReorganizer()
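With these parsers in place, the new subcommands would be invoked roughly as follows (disk labels illustrative). Note that --target is required even when --network is given; plan_merge only falls back to network_target when target is empty:

    python app/main.py dedupe --disk SMT --no-chunks
    python app/main.py merge --sources SMT DISK1 --target LLM --filter-system
    python app/main.py report --show-duplicates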
@@ -498,6 +714,13 @@ def main():
    if args.command == 'index':
        tool.index_disk(args.disk_root, args.disk_name)

    elif args.command == 'dedupe':
        tool.run_deduplication(disk=args.disk, use_chunks=not args.no_chunks)

    elif args.command == 'merge':
        tool.plan_merge(sources=args.sources, target=args.target, output_file=args.output,
                        filter_system=args.filter_system, network_target=args.network)

    elif args.command == 'plan':
        plan = tool.plan_migration(args.target_disk, args.dest_disks)
        if plan:
@@ -508,7 +731,7 @@ def main():
        tool.execute_migration(args.plan_file, dry_run=args.dry_run)

    elif args.command == 'report':
        tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)


if __name__ == '__main__':
    main()
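Taken together with the pre-existing index/plan/execute modes, a full pass might look like this (mount points from disk_mount_map; the positional order of the index arguments is assumed from tool.index_disk(args.disk_root, args.disk_name), since that parser is not shown in this diff):

    python app/main.py index /media/mike/SMT SMT
    python app/main.py index /media/mike/DISK1 DISK1
    python app/main.py dedupe
    python app/main.py merge --sources SMT DISK1 --target LLM --filter-system --output merge_plan.json
    python app/main.py report --preview-merge merge_plan.json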