mike
2025-12-13 01:12:59 +01:00
parent 942e87d439
commit 2ec65e059c
3 changed files with 79 additions and 79 deletions

app/filters/__init__.py

@@ -0,0 +1,3 @@
from .gitignore import GitignoreFilter, DEFAULT_PATTERNS
__all__ = ['GitignoreFilter', 'DEFAULT_PATTERNS']
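The filters package only re-exports GitignoreFilter and DEFAULT_PATTERNS here; the app/filters/gitignore.py module itself is not shown in this commit view. A minimal sketch of what that module could look like, purely as an assumption (fnmatch-based name matching and a handful of common ignore patterns); the real module may differ:

# Assumed contents of app/filters/gitignore.py; only the two exported names
# above are confirmed by __init__.py, everything else here is illustrative.
import fnmatch
from pathlib import Path

DEFAULT_PATTERNS = ['.git/', '__pycache__/', '*.pyc', 'node_modules/', '*.tmp']

class GitignoreFilter:
    def __init__(self, patterns=None):
        self.patterns = list(patterns or DEFAULT_PATTERNS)

    def matches(self, path):
        """Return True if any ignore pattern applies to this path."""
        parts = Path(path).parts
        name = Path(path).name
        for pattern in self.patterns:
            if pattern.endswith('/'):
                # directory pattern: ignore anything under a matching component
                if pattern.rstrip('/') in parts:
                    return True
            elif fnmatch.fnmatch(name, pattern):
                return True
        return False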


@@ -451,35 +451,77 @@ class DiskReorganizer:
processed = 0
skipped = 0
for path_str, size, disk_label in files_to_process:
start_time = time.time()
batch = []
print(f"Phase 1: Computing checksums...")
for idx, (path_str, size, disk_label) in enumerate(files_to_process, 1):
try:
mount_point = disk_mount_map.get(disk_label, disk_label)
full_path = Path(mount_point) / path_str if not Path(path_str).is_absolute() else Path(path_str)
if not full_path.exists():
skipped += 1
if idx % 100 == 0:
elapsed = time.time() - start_time
rate = (processed + skipped) / elapsed if elapsed > 0 else 0
remaining = (total - idx) / rate if rate > 0 else 0
pct = 100 * idx / total
print(f"\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining/60)}m{int(remaining%60):02d}s | Skip: {skipped:,}", end='', flush=True)
continue
checksum = hash_file_local(full_path)
cursor.execute("SELECT path FROM files WHERE checksum = %s AND path != %s LIMIT 1", (checksum, path_str))
dup_row = cursor.fetchone()
duplicate_of = dup_row[0] if dup_row else None
cursor.execute("UPDATE files SET checksum = %s, duplicate_of = %s WHERE path = %s", (checksum, duplicate_of, path_str))
batch.append((checksum, path_str))
processed += 1
if processed % 100 == 0:
if len(batch) >= 1000:
try:
cursor.executemany("UPDATE files SET checksum = %s WHERE path = %s", batch)
conn.commit()
print(f"\rProcessed: {processed}/{total} ({skipped} skipped)", end='', flush=True)
batch.clear()
except Exception as e:
conn.rollback()
batch.clear()
print(f"\nBatch update failed: {e}")
if idx % 100 == 0:
elapsed = time.time() - start_time
rate = (processed + skipped) / elapsed if elapsed > 0 else 0
remaining = (total - idx) / rate if rate > 0 else 0
pct = 100 * idx / total
print(f"\r[{pct:5.1f}%] {processed:,}/{total:,} | {rate:.0f}/s | ETA: {int(remaining/60)}m{int(remaining%60):02d}s | Skip: {skipped:,}", end='', flush=True)
except Exception as e:
skipped += 1
conn.rollback()
if idx <= 5:
print(f"\nDebug: {full_path} - {e}")
if batch:
try:
cursor.executemany("UPDATE files SET checksum = %s WHERE path = %s", batch)
conn.commit()
except Exception as e:
conn.rollback()
print(f"\nFinal batch failed: {e}")
print()
logger.info(f"Deduplication complete: {processed}/{total} files processed, {skipped} skipped")
elapsed = time.time() - start_time
logger.info(f"Phase 1 done: {processed:,} files in {int(elapsed/60)}m{int(elapsed%60):02d}s ({skipped:,} skipped)")
print("Phase 2: Finding duplicates...")
cursor.execute("""
UPDATE files f1 SET duplicate_of = (
SELECT MIN(path) FROM files f2
WHERE f2.checksum = f1.checksum AND f2.path < f1.path
)
WHERE checksum IS NOT NULL
""")
conn.commit()
cursor.execute("SELECT COUNT(*) FROM files WHERE duplicate_of IS NOT NULL")
dup_count = cursor.fetchone()[0]
logger.info(f"Phase 2 done: Found {dup_count:,} duplicates")
finally:
cursor.close()
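Phase 1 above streams checksums back to the database in executemany batches of 1,000 rows, and Phase 2 marks duplicates with a single correlated-subquery UPDATE: within each checksum group, the row with the smallest path keeps duplicate_of = NULL and every other row points at it. A self-contained illustration of that same rule in plain Python, not part of the tool, purely to make the SQL's effect concrete:

# Illustration only: the "smallest path wins" rule from the Phase 2 SQL,
# applied to an in-memory list of (path, checksum) pairs.
from collections import defaultdict

def mark_duplicates(rows):
    """rows: iterable of (path, checksum); returns {path: duplicate_of_or_None}."""
    groups = defaultdict(list)
    for path, checksum in rows:
        if checksum is not None:
            groups[checksum].append(path)
    result = {}
    for checksum, paths in groups.items():
        canonical = min(paths)  # MIN(path) in the SQL
        for path in paths:
            result[path] = None if path == canonical else canonical
    return result

rows = [('a/x.bin', 'c1'), ('b/x.bin', 'c1'), ('c/y.bin', 'c2')]
assert mark_duplicates(rows) == {'a/x.bin': None, 'b/x.bin': 'a/x.bin', 'c/y.bin': None}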
@@ -671,67 +713,20 @@ class DiskReorganizer:
return f"{size:.1f}PB"
def main():
parser = argparse.ArgumentParser(description='Disk Reorganizer - Free up a disk for Linux dual-boot')
subparsers = parser.add_subparsers(dest='command', required=True)
# Index command
index_parser = subparsers.add_parser('index', help='Index files on a disk')
index_parser.add_argument('disk_root', help='Root path of disk (e.g., D:\\\\)')
index_parser.add_argument('disk_name', help='Logical name for the disk')
# Plan command
plan_parser = subparsers.add_parser('plan', help='Create migration plan')
plan_parser.add_argument('target_disk', help='Disk to free up')
plan_parser.add_argument('dest_disks', nargs='+', help='Destination disks')
# Execute command
exec_parser = subparsers.add_parser('execute', help='Execute migration plan')
exec_parser.add_argument('plan_file', help='Path to plan JSON file')
exec_parser.add_argument('--dry-run', action='store_true', help='Simulate without actual file operations')
# Dedupe command
dedupe_parser = subparsers.add_parser('dedupe', help='Deduplicate files and compute checksums')
dedupe_parser.add_argument('--disk', help='Optional: Only dedupe specific disk')
dedupe_parser.add_argument('--no-chunks', action='store_true', help='Disable chunk-level deduplication')
# Merge command
merge_parser = subparsers.add_parser('merge', help='Plan multi-disk merge with deduplication')
merge_parser.add_argument('--sources', nargs='+', required=True, help='Source disks to merge')
merge_parser.add_argument('--target', required=True, help='Target disk')
merge_parser.add_argument('--output', default='merge_plan.json', help='Output plan file')
merge_parser.add_argument('--filter-system', action='store_true', help='Filter system/build files')
merge_parser.add_argument('--network', help='Network target (e.g., user@host:/path)')
# Report command
report_parser = subparsers.add_parser('report', help='Show current status')
report_parser.add_argument('--format', choices=['text', 'json'], default='text', help='Report format')
report_parser.add_argument('--show-duplicates', action='store_true', help='Show duplicate files')
report_parser.add_argument('--preview-merge', help='Preview merge plan from file')
args = parser.parse_args()
tool = DiskReorganizer()
if args.command == 'index':
tool.index_disk(args.disk_root, args.disk_name)
elif args.command == 'dedupe':
tool.run_deduplication(disk=args.disk, use_chunks=not args.no_chunks)
elif args.command == 'merge':
tool.plan_merge(sources=args.sources, target=args.target, output_file=args.output,
filter_system=args.filter_system, network_target=args.network)
elif args.command == 'plan':
plan = tool.plan_migration(args.target_disk, args.dest_disks)
if plan:
print(f"\nPlan generated: {plan['file_count']} files, {tool.format_size(plan['total_size'])}")
print(f"Destination disks: {', '.join(plan['destination_disks'])}")
elif args.command == 'execute':
tool.execute_migration(args.plan_file, dry_run=args.dry_run)
elif args.command == 'report':
tool.generate_report(format=args.format, show_duplicates=args.show_duplicates, preview_merge=args.preview_merge)
p=argparse.ArgumentParser();s=p.add_subparsers(dest='cmd',required=True)
i=s.add_parser('index');i.add_argument('root');i.add_argument('name')
pl=s.add_parser('plan');pl.add_argument('target');pl.add_argument('dests',nargs='+')
e=s.add_parser('execute');e.add_argument('file');e.add_argument('--dry',action='store_true')
d=s.add_parser('dedupe');d.add_argument('--disk');d.add_argument('--no-chunks',action='store_true')
m=s.add_parser('merge');m.add_argument('--sources',nargs='+',required=True);m.add_argument('--target',required=True);m.add_argument('--out',default='merge.json');m.add_argument('--filter',action='store_true');m.add_argument('--net')
r=s.add_parser('report');r.add_argument('--fmt',default='text');r.add_argument('--dups',action='store_true');r.add_argument('--preview')
a=p.parse_args();t=DiskReorganizer()
if a.cmd=='index':t.index_disk(a.root,a.name)
elif a.cmd=='dedupe':t.run_deduplication(a.disk,not a.no_chunks)
elif a.cmd=='merge':t.plan_merge(a.sources,a.target,a.out,a.filter,a.net)
elif a.cmd=='plan':plan=t.plan_migration(a.target,a.dests);print(f"\nPlan {plan['file_count']} files {t.format_size(plan['total_size'])}")
elif a.cmd=='execute':t.execute_migration(a.file,a.dry)
elif a.cmd=='report':t.generate_report(a.fmt,a.dups,a.preview)
if __name__ == '__main__':
main()
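The compacted main() dispatches to the same DiskReorganizer methods as the verbose parser it replaces. Driving those methods directly, without the CLI, looks roughly like this; the disk root and label are placeholders, not values from this repository:

# Run from the module that defines DiskReorganizer; 'D:\\' and 'DISK_D'
# are example values only.
tool = DiskReorganizer()
tool.index_disk('D:\\', 'DISK_D')                       # index
tool.run_deduplication(disk='DISK_D', use_chunks=True)  # dedupe
tool.generate_report(format='text', show_duplicates=True, preview_merge=None)  # report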


@@ -103,6 +103,8 @@ CREATE INDEX IF NOT EXISTS idx_files_hash ON files(file_hash);
CREATE INDEX IF NOT EXISTS idx_files_disk ON files (disk_label);
CREATE INDEX IF NOT EXISTS idx_files_category ON files (category);
CREATE INDEX IF NOT EXISTS idx_files_status ON files (status);
CREATE INDEX IF NOT EXISTS idx_files_checksum ON files (checksum);
CREATE INDEX IF NOT EXISTS idx_files_checksum_path ON files (checksum, path);
CREATE INDEX IF NOT EXISTS idx_operations_status ON operations(status);
CREATE INDEX IF NOT EXISTS idx_operations_created ON operations(created_at);
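The two new indexes cover the columns hit by the Phase 2 duplicate-marking UPDATE (checksum equality plus a path comparison in the correlated subquery) and by follow-up duplicate reports. A sketch of such a report, using the same cursor and %s placeholder style as the rest of the tool; conn stands for an already-open database connection, which this commit does not show how to obtain:

def print_duplicate_groups(conn, limit=20):
    # conn: an open connection (e.g. psycopg2); obtaining it is out of scope here.
    cur = conn.cursor()
    cur.execute("""
        SELECT checksum, COUNT(*) AS copies
        FROM files
        WHERE checksum IS NOT NULL
        GROUP BY checksum
        HAVING COUNT(*) > 1
        ORDER BY copies DESC
        LIMIT %s
    """, (limit,))
    for checksum, copies in cur.fetchall():
        print(f"{checksum[:12]}...  x{copies}")
    cur.close()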