app/filters/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
+from .gitignore import GitignoreFilter, DEFAULT_PATTERNS
+
+__all__ = ['GitignoreFilter', 'DEFAULT_PATTERNS']
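The package only re-exports names from app/filters/gitignore.py, which is not part of this commit. A rough sketch of the interface the re-export implies follows; only the names GitignoreFilter and DEFAULT_PATTERNS are confirmed by the import above, while the pattern list, the should_ignore method, and the fnmatch-based matching are assumptions for illustration, not the committed implementation.

# Hypothetical sketch of app/filters/gitignore.py (interface assumed, see note above).
import fnmatch
from pathlib import PurePath

DEFAULT_PATTERNS = ['.git', '__pycache__', 'node_modules', '*.pyc', '*.tmp']  # assumed defaults


class GitignoreFilter:
    def __init__(self, patterns=None):
        self.patterns = list(patterns) if patterns is not None else list(DEFAULT_PATTERNS)

    def should_ignore(self, path: str) -> bool:
        # A path is ignored if the path itself or any of its components matches a pattern.
        parts = PurePath(path).parts
        return any(
            fnmatch.fnmatch(part, pat)
            for pat in self.patterns
            for part in (*parts, str(path))
        )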
app/main.py (143 lines changed)
@@ -451,35 +451,77 @@ class DiskReorganizer:
             processed = 0
             skipped = 0
-            for path_str, size, disk_label in files_to_process:
+            start_time = time.time()
+            batch = []
+
+            print(f"Phase 1: Computing checksums...")
+
+            for idx, (path_str, size, disk_label) in enumerate(files_to_process, 1):
                 try:
                     mount_point = disk_mount_map.get(disk_label, disk_label)
                     full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)
 
                     if not full_path.exists():
                         skipped += 1
+                        if idx % 100 == 0:
+                            elapsed = time.time() - start_time
+                            rate = (processed + skipped) / elapsed if elapsed > 0 else 0
+                            remaining = (total - idx) / rate if rate > 0 else 0
+                            pct = 100 * idx / total
+                            print(f"\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining/60)}m{int(remaining%60):02d}s | Skip: {skipped:,}", end='', flush=True)
                         continue
 
                     checksum = hash_file_local(full_path)
-                    cursor.execute("SELECT path FROM files WHERE checksum = %s AND path != %s LIMIT 1", (checksum, path_str))
-                    dup_row = cursor.fetchone()
-                    duplicate_of = dup_row[0] if dup_row else None
-
-                    cursor.execute("UPDATE files SET checksum = %s, duplicate_of = %s WHERE path = %s", (checksum, duplicate_of, path_str))
-
+                    batch.append((checksum, path_str))
+
                     processed += 1
-                    if processed % 100 == 0:
-                        conn.commit()
-                        print(f"\rProcessed: {processed}/{total} ({skipped} skipped)", end='', flush=True)
+                    if len(batch) >= 1000:
+                        try:
+                            cursor.executemany("UPDATE files SET checksum = %s WHERE path = %s", batch)
+                            conn.commit()
+                            batch.clear()
+                        except Exception as e:
+                            conn.rollback()
+                            batch.clear()
+                            print(f"\nBatch update failed: {e}")
+
+                    if idx % 100 == 0:
+                        elapsed = time.time() - start_time
+                        rate = (processed + skipped) / elapsed if elapsed > 0 else 0
+                        remaining = (total - idx) / rate if rate > 0 else 0
+                        pct = 100 * idx / total
+                        print(f"\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining/60)}m{int(remaining%60):02d}s | Skip: {skipped:,}", end='', flush=True)
 
                 except Exception as e:
                     skipped += 1
-                    conn.rollback()
+                    if idx <= 5:
+                        print(f"\nDebug: {full_path} - {e}")
 
+            if batch:
+                try:
+                    cursor.executemany("UPDATE files SET checksum = %s WHERE path = %s", batch)
+                    conn.commit()
+                except Exception as e:
+                    conn.rollback()
+                    print(f"\nFinal batch failed: {e}")
+
-            conn.commit()
             print()
-            logger.info(f"Deduplication complete: {processed}/{total} files processed, {skipped} skipped")
+            elapsed = time.time() - start_time
+            logger.info(f"Phase 1 done: {processed:,} files in {int(elapsed/60)}m{int(elapsed%60):02d}s ({skipped:,} skipped)")
+
+            print("Phase 2: Finding duplicates...")
+            cursor.execute("""
+                UPDATE files f1 SET duplicate_of = (
+                    SELECT MIN(path) FROM files f2
+                    WHERE f2.checksum = f1.checksum AND f2.path < f1.path
+                )
+                WHERE checksum IS NOT NULL
+            """)
+            conn.commit()
+
+            cursor.execute("SELECT COUNT(*) FROM files WHERE duplicate_of IS NOT NULL")
+            dup_count = cursor.fetchone()[0]
+            logger.info(f"Phase 2 done: Found {dup_count:,} duplicates")
+
         finally:
             cursor.close()
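The reworked loop defers duplicate detection: Phase 1 only computes checksums and flushes them with executemany in batches of 1,000, and Phase 2 resolves duplicate_of in a single set-based UPDATE. A minimal standalone sketch of that pattern is below, assuming a DB-API connection with psycopg2-style %s placeholders, the files(path, checksum, duplicate_of) table from sql/init.sql, and a placeholder hash_file helper; it is an illustration of the technique, not the committed code.

# Minimal sketch of the batched two-phase deduplication pattern (assumptions noted above).
import hashlib
from pathlib import Path

BATCH_SIZE = 1000

def hash_file(path: Path) -> str:
    # Placeholder for the commit's hash_file_local: stream the file through SHA-256.
    h = hashlib.sha256()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b''):
            h.update(chunk)
    return h.hexdigest()

def dedupe(conn, paths):
    # conn: e.g. psycopg2.connect(...); paths: iterable of file paths already indexed in `files`.
    cur = conn.cursor()
    batch = []
    # Phase 1: compute checksums and write them in batches instead of per row.
    for p in paths:
        batch.append((hash_file(Path(p)), p))
        if len(batch) >= BATCH_SIZE:
            cur.executemany("UPDATE files SET checksum = %s WHERE path = %s", batch)
            conn.commit()
            batch.clear()
    if batch:
        cur.executemany("UPDATE files SET checksum = %s WHERE path = %s", batch)
        conn.commit()
    # Phase 2: one set-based UPDATE marks every row whose checksum is shared with a
    # lexicographically smaller path as a duplicate of that path.
    cur.execute("""
        UPDATE files f1 SET duplicate_of = (
            SELECT MIN(path) FROM files f2
            WHERE f2.checksum = f1.checksum AND f2.path < f1.path
        )
        WHERE checksum IS NOT NULL
    """)
    conn.commit()
    cur.close()

The checksum and (checksum, path) indexes added in sql/init.sql below are what keep the correlated subquery in Phase 2 from scanning the whole table for every row.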
@@ -671,67 +713,20 @@ class DiskReorganizer:
         return f"{size:.1f}PB"
 
 
 def main():
-    parser = argparse.ArgumentParser(description='Disk Reorganizer - Free up a disk for Linux dual-boot')
-    subparsers = parser.add_subparsers(dest='command', required=True)
-
-    # Index command
-    index_parser = subparsers.add_parser('index', help='Index files on a disk')
-    index_parser.add_argument('disk_root', help='Root path of disk (e.g., D:\\\\)')
-    index_parser.add_argument('disk_name', help='Logical name for the disk')
-
-    # Plan command
-    plan_parser = subparsers.add_parser('plan', help='Create migration plan')
-    plan_parser.add_argument('target_disk', help='Disk to free up')
-    plan_parser.add_argument('dest_disks', nargs='+', help='Destination disks')
-
-    # Execute command
-    exec_parser = subparsers.add_parser('execute', help='Execute migration plan')
-    exec_parser.add_argument('plan_file', help='Path to plan JSON file')
-    exec_parser.add_argument('--dry-run', action='store_true', help='Simulate without actual file operations')
-
-    # Dedupe command
-    dedupe_parser = subparsers.add_parser('dedupe', help='Deduplicate files and compute checksums')
-    dedupe_parser.add_argument('--disk', help='Optional: Only dedupe specific disk')
-    dedupe_parser.add_argument('--no-chunks', action='store_true', help='Disable chunk-level deduplication')
-
-    # Merge command
-    merge_parser = subparsers.add_parser('merge', help='Plan multi-disk merge with deduplication')
-    merge_parser.add_argument('--sources', nargs='+', required=True, help='Source disks to merge')
-    merge_parser.add_argument('--target', required=True, help='Target disk')
-    merge_parser.add_argument('--output', default='merge_plan.json', help='Output plan file')
-    merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
-    merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')
-
-    # Report command
-    report_parser = subparsers.add_parser('report', help='Show current status')
-    report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
-    report_parser.add_argument('--show-duplicates', action='store_true', help='Show duplicate files')
-    report_parser.add_argument('--preview-merge', help='Preview merge plan from file')
-
-    args = parser.parse_args()
-    tool = DiskReorganizer()
-
-    if args.command == 'index':
-        tool.index_disk(args.disk_root, args.disk_name)
-
-    elif args.command == 'dedupe':
-        tool.run_deduplication(disk=args.disk, use_chunks=not args.no_chunks)
-
-    elif args.command == 'merge':
-        tool.plan_merge(sources=args.sources, target=args.target, output_file=args.output,
-                        filter_system=args.filter_system, network_target=args.network)
-
-    elif args.command == 'plan':
-        plan = tool.plan_migration(args.target_disk, args.dest_disks)
-        if plan:
-            print(f"\nPlan generated: {plan['file_count']} files, {tool.format_size(plan['total_size'])}")
-            print(f"Destination disks: {', '.join(plan['destination_disks'])}")
-
-    elif args.command == 'execute':
-        tool.execute_migration(args.plan_file, dry_run=args.dry_run)
-
-    elif args.command == 'report':
-        tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)
+    p=argparse.ArgumentParser();s=p.add_subparsers(dest='cmd',required=True)
+    i=s.add_parser('index');i.add_argument('root');i.add_argument('name')
+    pl=s.add_parser('plan');pl.add_argument('target');pl.add_argument('dests',nargs='+')
+    e=s.add_parser('execute');e.add_argument('file');e.add_argument('--dry',action='store_true')
+    d=s.add_parser('dedupe');d.add_argument('--disk');d.add_argument('--no-chunks',action='store_true')
+    m=s.add_parser('merge');m.add_argument('--sources',nargs='+',required=True);m.add_argument('--target',required=True);m.add_argument('--out',default='merge.json');m.add_argument('--filter',action='store_true');m.add_argument('--net')
+    r=s.add_parser('report');r.add_argument('--fmt',default='text');r.add_argument('--dups',action='store_true');r.add_argument('--preview')
+    a=p.parse_args();t=DiskReorganizer()
+    if a.cmd=='index':t.index_disk(a.root,a.name)
+    elif a.cmd=='dedupe':t.run_deduplication(a.disk,not a.no_chunks)
+    elif a.cmd=='merge':t.plan_merge(a.sources,a.target,a.out,a.filter,a.net)
+    elif a.cmd=='plan':plan=t.plan_migration(a.target,a.dests);print(f"\nPlan {plan['file_count']} files {t.format_size(plan['total_size'])}")
+    elif a.cmd=='execute':t.execute_migration(a.file,a.dry)
+    elif a.cmd=='report':t.generate_report(a.fmt,a.dups,a.preview)
 
 
 if __name__ == '__main__':
     main()
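With the condensed interface, typical invocations (assuming the script is run directly, e.g. as python app/main.py; the disk and file names here are placeholders) look like: python app/main.py index D:\ DATA_D, then python app/main.py dedupe --disk DATA_D, then python app/main.py merge --sources DATA_D DATA_E --target ARCHIVE --out merge.json --filter, and finally python app/main.py execute merge.json --dry. Relative to the removed parser, several options are renamed: --dry-run becomes --dry, --output becomes --out (default merge.json instead of merge_plan.json), --filter-system becomes --filter, --network becomes --net, --format becomes --fmt, --show-duplicates becomes --dups, and --preview-merge becomes --preview.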
sql/init.sql (12 lines changed)
@@ -98,11 +98,13 @@ CREATE TABLE IF NOT EXISTS migration_plans (
 );
 
 -- Indexes for performance
-CREATE INDEX IF NOT EXISTS idx_files_path ON files(path);
-CREATE INDEX IF NOT EXISTS idx_files_hash ON files(file_hash);
-CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label);
-CREATE INDEX IF NOT EXISTS idx_files_category ON files(category);
-CREATE INDEX IF NOT EXISTS idx_files_status ON files(status);
+CREATE INDEX IF NOT EXISTS idx_files_path ON files (path);
+CREATE INDEX IF NOT EXISTS idx_files_hash ON files (file_hash);
+CREATE INDEX IF NOT EXISTS idx_files_disk ON files (disk_label);
+CREATE INDEX IF NOT EXISTS idx_files_category ON files (category);
+CREATE INDEX IF NOT EXISTS idx_files_status ON files (status);
+create index on files (checksum);
+create index on files (checksum,path);
 
 CREATE INDEX IF NOT EXISTS idx_operations_status ON operations(status);
 CREATE INDEX IF NOT EXISTS idx_operations_created ON operations(created_at);
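The two new checksum indexes depart from the file's existing style: they are unnamed and not guarded with IF NOT EXISTS, so on PostgreSQL re-running init.sql would create additional auto-named copies rather than being a no-op. A named, idempotent form matching the surrounding statements (the index names here are suggestions, not part of the commit) would be:

CREATE INDEX IF NOT EXISTS idx_files_checksum ON files (checksum);
CREATE INDEX IF NOT EXISTS idx_files_checksum_path ON files (checksum, path);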