diff --git a/app/filters/__init__.py b/app/filters/__init__.py
new file mode 100644
index 0000000..edaa883
--- /dev/null
+++ b/app/filters/__init__.py
@@ -0,0 +1,3 @@
+from .gitignore import GitignoreFilter, DEFAULT_PATTERNS
+
+__all__ = ['GitignoreFilter', 'DEFAULT_PATTERNS']
diff --git a/app/main.py b/app/main.py
index 81be092..82d937f 100644
--- a/app/main.py
+++ b/app/main.py
@@ -451,35 +451,79 @@ class DiskReorganizer:
             processed = 0
             skipped = 0
 
-            for path_str, size, disk_label in files_to_process:
+            start_time = time.time()
+            batch = []
+
+            print("Phase 1: Computing checksums...")
+
+            for idx, (path_str, size, disk_label) in enumerate(files_to_process, 1):
                 try:
                     mount_point = disk_mount_map.get(disk_label, disk_label)
                     full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)
 
                     if not full_path.exists():
                         skipped += 1
+                        if idx % 100 == 0:
+                            elapsed = time.time() - start_time
+                            rate = (processed + skipped) / elapsed if elapsed > 0 else 0
+                            remaining = (total - idx) / rate if rate > 0 else 0
+                            pct = 100 * idx / total
+                            print(f"\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining/60)}m{int(remaining%60):02d}s | Skip: {skipped:,}", end='', flush=True)
                         continue
 
                     checksum = hash_file_local(full_path)
-
-                    cursor.execute("SELECT path FROM files WHERE checksum = %s AND path != %s LIMIT 1", (checksum, path_str))
-                    dup_row = cursor.fetchone()
-                    duplicate_of = dup_row[0] if dup_row else None
-
-                    cursor.execute("UPDATE files SET checksum = %s, duplicate_of = %s WHERE path = %s", (checksum, duplicate_of, path_str))
+                    batch.append((checksum, path_str))
                     processed += 1
 
-                    if processed % 100 == 0:
-                        conn.commit()
-                        print(f"\rProcessed: {processed}/{total} ({skipped} skipped)", end='', flush=True)
+                    if len(batch) >= 1000:
+                        try:
+                            cursor.executemany("UPDATE files SET checksum = %s WHERE path = %s", batch)
+                            conn.commit()
+                            batch.clear()
+                        except Exception as e:
+                            conn.rollback()
+                            batch.clear()
+                            print(f"\nBatch update failed: {e}")
+
+                    if idx % 100 == 0:
+                        elapsed = time.time() - start_time
+                        rate = (processed + skipped) / elapsed if elapsed > 0 else 0
+                        remaining = (total - idx) / rate if rate > 0 else 0
+                        pct = 100 * idx / total
+                        print(f"\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining/60)}m{int(remaining%60):02d}s | Skip: {skipped:,}", end='', flush=True)
 
                 except Exception as e:
                     skipped += 1
-                    conn.rollback()
+                    if idx <= 5:
+                        print(f"\nDebug: {path_str} - {e}")
+
+            if batch:
+                try:
+                    cursor.executemany("UPDATE files SET checksum = %s WHERE path = %s", batch)
+                    conn.commit()
+                except Exception as e:
+                    conn.rollback()
+                    print(f"\nFinal batch failed: {e}")
 
-            conn.commit()
             print()
-            logger.info(f"Deduplication complete: {processed}/{total} files processed, {skipped} skipped")
+            elapsed = time.time() - start_time
+            logger.info(f"Phase 1 done: {processed:,} files in {int(elapsed/60)}m{int(elapsed%60):02d}s ({skipped:,} skipped)")
+
+            print("Phase 2: Finding duplicates...")
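+            # Set-based pass: point each file at the lexicographically smallest
+            # path sharing its checksum; that canonical row itself stays NULL.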
+            cursor.execute("""
+                UPDATE files f1 SET duplicate_of = (
+                    SELECT MIN(path) FROM files f2
+                    WHERE f2.checksum = f1.checksum AND f2.path < f1.path
+                )
+                WHERE checksum IS NOT NULL
+            """)
+            conn.commit()
+
+            cursor.execute("SELECT COUNT(*) FROM files WHERE duplicate_of IS NOT NULL")
+            dup_count = cursor.fetchone()[0]
+            logger.info(f"Phase 2 done: Found {dup_count:,} duplicates")
 
         finally:
             cursor.close()
@@ -671,67 +715,54 @@ class DiskReorganizer:
         return f"{size:.1f}PB"
 
 def main():
-    parser = argparse.ArgumentParser(description='Disk Reorganizer - Free up a disk for Linux dual-boot')
-    subparsers = parser.add_subparsers(dest='command', required=True)
-
-    # Index command
-    index_parser = subparsers.add_parser('index', help='Index files on a disk')
-    index_parser.add_argument('disk_root', help='Root path of disk (e.g., D:\\\\)')
-    index_parser.add_argument('disk_name', help='Logical name for the disk')
-
-    # Plan command
-    plan_parser = subparsers.add_parser('plan', help='Create migration plan')
-    plan_parser.add_argument('target_disk', help='Disk to free up')
-    plan_parser.add_argument('dest_disks', nargs='+', help='Destination disks')
-
-    # Execute command
-    exec_parser = subparsers.add_parser('execute', help='Execute migration plan')
-    exec_parser.add_argument('plan_file', help='Path to plan JSON file')
-    exec_parser.add_argument('--dry-run', action='store_true', help='Simulate without actual file operations')
-
-    # Dedupe command
-    dedupe_parser = subparsers.add_parser('dedupe', help='Deduplicate files and compute checksums')
-    dedupe_parser.add_argument('--disk', help='Optional: Only dedupe specific disk')
-    dedupe_parser.add_argument('--no-chunks', action='store_true', help='Disable chunk-level deduplication')
-
-    # Merge command
-    merge_parser = subparsers.add_parser('merge', help='Plan multi-disk merge with deduplication')
-    merge_parser.add_argument('--sources', nargs='+', required=True, help='Source disks to merge')
-    merge_parser.add_argument('--target', required=True, help='Target disk')
-    merge_parser.add_argument('--output', default='merge_plan.json', help='Output plan file')
-    merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
-    merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')
-
-    # Report command
-    report_parser = subparsers.add_parser('report', help='Show current status')
-    report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
-    report_parser.add_argument('--show-duplicates', action='store_true', help='Show duplicate files')
-    report_parser.add_argument('--preview-merge', help='Preview merge plan from file')
-
-    args = parser.parse_args()
-    tool = DiskReorganizer()
-
-    if args.command == 'index':
-        tool.index_disk(args.disk_root, args.disk_name)
-
-    elif args.command == 'dedupe':
-        tool.run_deduplication(disk=args.disk, use_chunks=not args.no_chunks)
-
-    elif args.command == 'merge':
-        tool.plan_merge(sources=args.sources, target=args.target, output_file=args.output,
-                        filter_system=args.filter_system, network_target=args.network)
-
-    elif args.command == 'plan':
-        plan = tool.plan_migration(args.target_disk, args.dest_disks)
-        if plan:
-            print(f"\nPlan generated: {plan['file_count']} files, {tool.format_size(plan['total_size'])}")
-            print(f"Destination disks: {', '.join(plan['destination_disks'])}")
-
-    elif args.command == 'execute':
-        tool.execute_migration(args.plan_file, dry_run=args.dry_run)
-
-    elif args.command == 'report':
-        tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)
+    parser = argparse.ArgumentParser(description='Disk Reorganizer - Free up a disk for Linux dual-boot')
+    subparsers = parser.add_subparsers(dest='cmd', required=True)
+
+    index_parser = subparsers.add_parser('index')
+    index_parser.add_argument('root')
+    index_parser.add_argument('name')
+
+    plan_parser = subparsers.add_parser('plan')
+    plan_parser.add_argument('target')
+    plan_parser.add_argument('dests', nargs='+')
+
+    exec_parser = subparsers.add_parser('execute')
+    exec_parser.add_argument('file')
+    exec_parser.add_argument('--dry', action='store_true')
+
+    dedupe_parser = subparsers.add_parser('dedupe')
+    dedupe_parser.add_argument('--disk')
+    dedupe_parser.add_argument('--no-chunks', action='store_true')
+
+    merge_parser = subparsers.add_parser('merge')
+    merge_parser.add_argument('--sources', nargs='+', required=True)
+    merge_parser.add_argument('--target', required=True)
+    merge_parser.add_argument('--out', default='merge.json')
+    merge_parser.add_argument('--filter', action='store_true')
+    merge_parser.add_argument('--net')
+
+    report_parser = subparsers.add_parser('report')
+    report_parser.add_argument('--fmt', default='text')
+    report_parser.add_argument('--dups', action='store_true')
+    report_parser.add_argument('--preview')
+
+    args = parser.parse_args()
+    tool = DiskReorganizer()
+
+    if args.cmd == 'index':
+        tool.index_disk(args.root, args.name)
+    elif args.cmd == 'dedupe':
+        tool.run_deduplication(disk=args.disk, use_chunks=not args.no_chunks)
+    elif args.cmd == 'merge':
+        tool.plan_merge(sources=args.sources, target=args.target, output_file=args.out, filter_system=args.filter, network_target=args.net)
+    elif args.cmd == 'plan':
+        plan = tool.plan_migration(args.target, args.dests)
+        if plan:
+            print(f"\nPlan generated: {plan['file_count']} files, {tool.format_size(plan['total_size'])}")
+    elif args.cmd == 'execute':
+        tool.execute_migration(args.file, dry_run=args.dry)
+    elif args.cmd == 'report':
+        tool.generate_report(format=args.fmt, show_duplicates=args.dups, preview_merge=args.preview)
 
 if __name__ == '__main__':
     main()
\ No newline at end of file
diff --git a/sql/init.sql b/sql/init.sql
index 956fbc0..58266c7 100644
--- a/sql/init.sql
+++ b/sql/init.sql
@@ -98,11 +98,15 @@ CREATE TABLE IF NOT EXISTS migration_plans (
 );
 
 -- Indexes for performance
 CREATE INDEX IF NOT EXISTS idx_files_path ON files(path);
 CREATE INDEX IF NOT EXISTS idx_files_hash ON files(file_hash);
 CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label);
 CREATE INDEX IF NOT EXISTS idx_files_category ON files(category);
 CREATE INDEX IF NOT EXISTS idx_files_status ON files(status);
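+-- The composite (checksum, path) index serves the Phase 2 duplicate scan in
+-- app/main.py, which matches rows on checksum equality and compares paths.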
+CREATE INDEX IF NOT EXISTS idx_files_checksum ON files(checksum);
+CREATE INDEX IF NOT EXISTS idx_files_checksum_path ON files(checksum, path);
 
 CREATE INDEX IF NOT EXISTS idx_operations_status ON operations(status);
 CREATE INDEX IF NOT EXISTS idx_operations_created ON operations(created_at);
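As a quick sanity check after both phases run (a hypothetical query, not part of this diff; it assumes only the files.duplicate_of column the Phase 2 UPDATE writes to), the duplicate groups can be inspected with:

    -- Largest duplicate groups first; canonical rows keep duplicate_of = NULL.
    SELECT duplicate_of AS canonical_path, COUNT(*) AS copies
    FROM files
    WHERE duplicate_of IS NOT NULL
    GROUP BY duplicate_of
    ORDER BY copies DESC
    LIMIT 20;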