app/filters/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
+from .gitignore import GitignoreFilter, DEFAULT_PATTERNS
+
+__all__ = ['GitignoreFilter', 'DEFAULT_PATTERNS']
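The package only re-exports names from app/filters/gitignore.py, which is not part of this commit. A rough sketch of the interface the re-export implies follows; only the names GitignoreFilter and DEFAULT_PATTERNS are confirmed by the import above, while the pattern list, the should_ignore method, and the fnmatch-based matching are assumptions for illustration, not the committed implementation.

# Hypothetical sketch of app/filters/gitignore.py (interface assumed, see note above).
import fnmatch
from pathlib import PurePath

DEFAULT_PATTERNS = ['.git', '__pycache__', 'node_modules', '*.pyc', '*.tmp']  # assumed defaults


class GitignoreFilter:
    def __init__(self, patterns=None):
        self.patterns = list(patterns) if patterns is not None else list(DEFAULT_PATTERNS)

    def should_ignore(self, path: str) -> bool:
        # A path is ignored if the path itself or any of its components matches a pattern.
        parts = PurePath(path).parts
        return any(
            fnmatch.fnmatch(part, pat)
            for pat in self.patterns
            for part in (*parts, str(path))
        )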
app/main.py (143 lines changed)
@@ -451,35 +451,77 @@ class DiskReorganizer:
             processed = 0
             skipped = 0
-            for path_str, size, disk_label in files_to_process:
+            start_time = time.time()
+            batch = []
+
+            print(f"Phase 1: Computing checksums...")
+
+            for idx, (path_str, size, disk_label) in enumerate(files_to_process, 1):
                 try:
                     mount_point = disk_mount_map.get(disk_label, disk_label)
                     full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)
 
                     if not full_path.exists():
                         skipped += 1
+                        if idx % 100 == 0:
+                            elapsed = time.time() - start_time
+                            rate = (processed + skipped) / elapsed if elapsed > 0 else 0
+                            remaining = (total - idx) / rate if rate > 0 else 0
+                            pct = 100 * idx / total
+                            print(f"\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining/60)}m{int(remaining%60):02d}s | Skip: {skipped:,}", end='', flush=True)
                         continue
 
                     checksum = hash_file_local(full_path)
-                    cursor.execute("SELECT path FROM files WHERE checksum = %s AND path != %s LIMIT 1", (checksum, path_str))
-                    dup_row = cursor.fetchone()
-                    duplicate_of = dup_row[0] if dup_row else None
-
-                    cursor.execute("UPDATE files SET checksum = %s, duplicate_of = %s WHERE path = %s", (checksum, duplicate_of, path_str))
-
+                    batch.append((checksum, path_str))
+
                     processed += 1
-                    if processed % 100 == 0:
-                        conn.commit()
-                        print(f"\rProcessed: {processed}/{total} ({skipped} skipped)", end='', flush=True)
+                    if len(batch) >= 1000:
+                        try:
+                            cursor.executemany("UPDATE files SET checksum = %s WHERE path = %s", batch)
+                            conn.commit()
+                            batch.clear()
+                        except Exception as e:
+                            conn.rollback()
+                            batch.clear()
+                            print(f"\nBatch update failed: {e}")
+
+                    if idx % 100 == 0:
+                        elapsed = time.time() - start_time
+                        rate = (processed + skipped) / elapsed if elapsed > 0 else 0
+                        remaining = (total - idx) / rate if rate > 0 else 0
+                        pct = 100 * idx / total
+                        print(f"\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining/60)}m{int(remaining%60):02d}s | Skip: {skipped:,}", end='', flush=True)
 
                 except Exception as e:
                     skipped += 1
-                    conn.rollback()
+                    if idx <= 5:
+                        print(f"\nDebug: {full_path} - {e}")
 
+            if batch:
+                try:
+                    cursor.executemany("UPDATE files SET checksum = %s WHERE path = %s", batch)
+                    conn.commit()
+                except Exception as e:
+                    conn.rollback()
+                    print(f"\nFinal batch failed: {e}")
+
-            conn.commit()
             print()
-            logger.info(f"Deduplication complete: {processed}/{total} files processed, {skipped} skipped")
+            elapsed = time.time() - start_time
+            logger.info(f"Phase 1 done: {processed:,} files in {int(elapsed/60)}m{int(elapsed%60):02d}s ({skipped:,} skipped)")
+
+            print("Phase 2: Finding duplicates...")
+            cursor.execute("""
+                UPDATE files f1 SET duplicate_of = (
+                    SELECT MIN(path) FROM files f2
+                    WHERE f2.checksum = f1.checksum AND f2.path < f1.path
+                )
+                WHERE checksum IS NOT NULL
+            """)
+            conn.commit()
+
+            cursor.execute("SELECT COUNT(*) FROM files WHERE duplicate_of IS NOT NULL")
+            dup_count = cursor.fetchone()[0]
+            logger.info(f"Phase 2 done: Found {dup_count:,} duplicates")
+
         finally:
             cursor.close()
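The reworked loop defers duplicate detection: Phase 1 only computes checksums and flushes them with executemany in batches of 1,000, and Phase 2 resolves duplicate_of in a single set-based UPDATE. A minimal standalone sketch of that pattern is below, assuming a DB-API connection with psycopg2-style %s placeholders, the files(path, checksum, duplicate_of) table from sql/init.sql, and a placeholder hash_file helper; it is an illustration of the technique, not the committed code.

# Minimal sketch of the batched two-phase deduplication pattern (assumptions noted above).
import hashlib
from pathlib import Path

BATCH_SIZE = 1000

def hash_file(path: Path) -> str:
    # Placeholder for the commit's hash_file_local: stream the file through SHA-256.
    h = hashlib.sha256()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b''):
            h.update(chunk)
    return h.hexdigest()

def dedupe(conn, paths):
    # conn: e.g. psycopg2.connect(...); paths: iterable of file paths already indexed in `files`.
    cur = conn.cursor()
    batch = []
    # Phase 1: compute checksums and write them in batches instead of per row.
    for p in paths:
        batch.append((hash_file(Path(p)), p))
        if len(batch) >= BATCH_SIZE:
            cur.executemany("UPDATE files SET checksum = %s WHERE path = %s", batch)
            conn.commit()
            batch.clear()
    if batch:
        cur.executemany("UPDATE files SET checksum = %s WHERE path = %s", batch)
        conn.commit()
    # Phase 2: one set-based UPDATE marks every row whose checksum is shared with a
    # lexicographically smaller path as a duplicate of that path.
    cur.execute("""
        UPDATE files f1 SET duplicate_of = (
            SELECT MIN(path) FROM files f2
            WHERE f2.checksum = f1.checksum AND f2.path < f1.path
        )
        WHERE checksum IS NOT NULL
    """)
    conn.commit()
    cur.close()

The checksum and (checksum, path) indexes added in sql/init.sql below are what keep the correlated subquery in Phase 2 from scanning the whole table for every row.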
@@ -671,67 +713,20 @@ class DiskReorganizer:
         return f"{size:.1f}PB"
 
 
 def main():
-    parser = argparse.ArgumentParser(description='Disk Reorganizer - Free up a disk for Linux dual-boot')
-    subparsers = parser.add_subparsers(dest='command', required=True)
-
-    # Index command
-    index_parser = subparsers.add_parser('index', help='Index files on a disk')
-    index_parser.add_argument('disk_root', help='Root path of disk (e.g., D:\\\\)')
-    index_parser.add_argument('disk_name', help='Logical name for the disk')
-
-    # Plan command
-    plan_parser = subparsers.add_parser('plan', help='Create migration plan')
-    plan_parser.add_argument('target_disk', help='Disk to free up')
-    plan_parser.add_argument('dest_disks', nargs='+', help='Destination disks')
-
-    # Execute command
-    exec_parser = subparsers.add_parser('execute', help='Execute migration plan')
-    exec_parser.add_argument('plan_file', help='Path to plan JSON file')
-    exec_parser.add_argument('--dry-run', action='store_true', help='Simulate without actual file operations')
-
-    # Dedupe command
-    dedupe_parser = subparsers.add_parser('dedupe', help='Deduplicate files and compute checksums')
-    dedupe_parser.add_argument('--disk', help='Optional: Only dedupe specific disk')
-    dedupe_parser.add_argument('--no-chunks', action='store_true', help='Disable chunk-level deduplication')
-
-    # Merge command
-    merge_parser = subparsers.add_parser('merge', help='Plan multi-disk merge with deduplication')
-    merge_parser.add_argument('--sources', nargs='+', required=True, help='Source disks to merge')
-    merge_parser.add_argument('--target', required=True, help='Target disk')
-    merge_parser.add_argument('--output', default='merge_plan.json', help='Output plan file')
-    merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
-    merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')
-
-    # Report command
-    report_parser = subparsers.add_parser('report', help='Show current status')
-    report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
-    report_parser.add_argument('--show-duplicates', action='store_true', help='Show duplicate files')
-    report_parser.add_argument('--preview-merge', help='Preview merge plan from file')
-
-    args = parser.parse_args()
-    tool = DiskReorganizer()
-
-    if args.command == 'index':
-        tool.index_disk(args.disk_root, args.disk_name)
-
-    elif args.command == 'dedupe':
-        tool.run_deduplication(disk=args.disk, use_chunks=not args.no_chunks)
-
-    elif args.command == 'merge':
-        tool.plan_merge(sources=args.sources, target=args.target, output_file=args.output,
-                        filter_system=args.filter_system, network_target=args.network)
-
-    elif args.command == 'plan':
-        plan = tool.plan_migration(args.target_disk, args.dest_disks)
-        if plan:
-            print(f"\nPlan generated: {plan['file_count']} files, {tool.format_size(plan['total_size'])}")
-            print(f"Destination disks: {', '.join(plan['destination_disks'])}")
-
-    elif args.command == 'execute':
-        tool.execute_migration(args.plan_file, dry_run=args.dry_run)
-
-    elif args.command == 'report':
-        tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)
+    p=argparse.ArgumentParser();s=p.add_subparsers(dest='cmd',required=True)
+    i=s.add_parser('index');i.add_argument('root');i.add_argument('name')
+    pl=s.add_parser('plan');pl.add_argument('target');pl.add_argument('dests',nargs='+')
+    e=s.add_parser('execute');e.add_argument('file');e.add_argument('--dry',action='store_true')
+    d=s.add_parser('dedupe');d.add_argument('--disk');d.add_argument('--no-chunks',action='store_true')
+    m=s.add_parser('merge');m.add_argument('--sources',nargs='+',required=True);m.add_argument('--target',required=True);m.add_argument('--out',default='merge.json');m.add_argument('--filter',action='store_true');m.add_argument('--net')
+    r=s.add_parser('report');r.add_argument('--fmt',default='text');r.add_argument('--dups',action='store_true');r.add_argument('--preview')
+    a=p.parse_args();t=DiskReorganizer()
+    if a.cmd=='index':t.index_disk(a.root,a.name)
+    elif a.cmd=='dedupe':t.run_deduplication(a.disk,not a.no_chunks)
+    elif a.cmd=='merge':t.plan_merge(a.sources,a.target,a.out,a.filter,a.net)
+    elif a.cmd=='plan':plan=t.plan_migration(a.target,a.dests);print(f"\nPlan {plan['file_count']} files {t.format_size(plan['total_size'])}")
+    elif a.cmd=='execute':t.execute_migration(a.file,a.dry)
+    elif a.cmd=='report':t.generate_report(a.fmt,a.dups,a.preview)
 
 
 if __name__ == '__main__':
     main()
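With the condensed interface, typical invocations (assuming the script is run directly, e.g. as python app/main.py; the disk and file names here are placeholders) look like: python app/main.py index D:\ DATA_D, then python app/main.py dedupe --disk DATA_D, then python app/main.py merge --sources DATA_D DATA_E --target ARCHIVE --out merge.json --filter, and finally python app/main.py execute merge.json --dry. Relative to the removed parser, several options are renamed: --dry-run becomes --dry, --output becomes --out (default merge.json instead of merge_plan.json), --filter-system becomes --filter, --network becomes --net, --format becomes --fmt, --show-duplicates becomes --dups, and --preview-merge becomes --preview.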
sql/init.sql (12 lines changed)
@@ -98,11 +98,13 @@ CREATE TABLE IF NOT EXISTS migration_plans (
 );
 
 -- Indexes for performance
-CREATE INDEX IF NOT EXISTS idx_files_path ON files(path);
-CREATE INDEX IF NOT EXISTS idx_files_hash ON files(file_hash);
-CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label);
-CREATE INDEX IF NOT EXISTS idx_files_category ON files(category);
-CREATE INDEX IF NOT EXISTS idx_files_status ON files(status);
+CREATE INDEX IF NOT EXISTS idx_files_path ON files (path);
+CREATE INDEX IF NOT EXISTS idx_files_hash ON files (file_hash);
+CREATE INDEX IF NOT EXISTS idx_files_disk ON files (disk_label);
+CREATE INDEX IF NOT EXISTS idx_files_category ON files (category);
+CREATE INDEX IF NOT EXISTS idx_files_status ON files (status);
+create index on files (checksum);
+create index on files (checksum,path);
 
 CREATE INDEX IF NOT EXISTS idx_operations_status ON operations(status);
 CREATE INDEX IF NOT EXISTS idx_operations_created ON operations(created_at);
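The two new checksum indexes depart from the file's existing style: they are unnamed and not guarded with IF NOT EXISTS, so on PostgreSQL re-running init.sql would create additional auto-named copies rather than being a no-op. A named, idempotent form matching the surrounding statements (the index names here are suggestions, not part of the commit) would be:

CREATE INDEX IF NOT EXISTS idx_files_checksum ON files (checksum);
CREATE INDEX IF NOT EXISTS idx_files_checksum_path ON files (checksum, path);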