chunking: add structured

mike
2025-12-13 14:32:43 +01:00
parent 1583df8f57
commit b3d06660a4
7 changed files with 507 additions and 1 deletion

@@ -894,3 +894,126 @@ class DiskReorganizer:
        cursor.close()
        conn.close()
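
    # Print a per-extension inventory: counts, total size, and parse coverage,
    # plus per-parser implementation status and a breakdown of unparsed files.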
    def inventory_file_types(self, disk: Optional[str]=None, limit: int=50):
        from analysis.inventory import FileTypeInventory
        inventory = FileTypeInventory(self.db_config)
        results = inventory.analyze(disk=disk, limit=limit)
        print('\n=== FILE TYPE INVENTORY ===\n')
        print(f'{"Extension":<15} {"Count":>10} {"Total Size":>12} {"Parsed":>8} {"Status":>8} {"Parser":>15}')
        print('=' * 95)
        for ext_info in results['extensions']:
            ext = ext_info['extension']
            count = ext_info['count']
            size = ext_info['total_size']
            parsed = ext_info['parsed']
            ptype = ext_info['parser_type']
            status = '✓' if ext_info['is_parseable'] else '✗'
            print(f'{ext:<15} {count:>10,} {inventory.format_size(size):>12} {parsed:>8,} {status:>8} {ptype:>15}')
        print('=' * 95)
        summary = results['summary']
        print(f'Total files: {summary["total_files"]:,}')
        print(f'Parseable: {summary["parseable_files"]:,} ({100 * summary["parseable_files"] / summary["total_files"]:.1f}%)')
        print(f'Parsed: {summary["parsed_files"]:,} ({summary["coverage"]:.1f}% coverage)')
        print('\n=== PARSER STATUS ===\n')
        for ptype, info in results['parser_status'].items():
            status = '✓ Implemented' if info['implemented'] else '✗ Not yet'
            print(f'{ptype:<15} {status:<20} {", ".join(info["extensions"][:10])}')
        if results['unparsed_by_type']:
            print('\n=== UNPARSED FILES BY TYPE ===\n')
            for ptype, info in sorted(results['unparsed_by_type'].items(), key=lambda x: x[1]['count'], reverse=True):
                print(f'{ptype:<15} {info["count"]:>10,} files unparsed')
                exts = sorted(info['extensions'])[:10]
                print(f' Extensions: {", ".join(exts)}')
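
    # Spot-check up to 100 classified rows before migrating; build artifacts
    # ('artifacts%' categories) are hidden unless show_build is set.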
    def review_migration(self, category: Optional[str]=None, show_build: bool=False):
        from classification.classifier import FileClassifier
        classifier = FileClassifier()
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            query = 'SELECT path, size, category FROM files WHERE 1=1'
            params = []
            if category:
                query += ' AND category = %s'
                params.append(category)
            if not show_build:
                query += " AND category NOT LIKE 'artifacts%'"
            query += ' LIMIT 100'
            cursor.execute(query, params)
            results = cursor.fetchall()
            print(f'\n=== MIGRATION REVIEW ({len(results)} files) ===\n')
            for path, size, cat in results:
                print(f'{path[:70]:<70} {cat:>20}')
        finally:
            cursor.close()
            conn.close()
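
    # Render a raw byte count as a human-readable size (B through PB).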
    def format_size(self, size: int) -> str:
        for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
            if size < 1024:
                return f'{size:.1f}{unit}'
            size /= 1024
        return f'{size:.1f}PB'
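
# CLI entry point: one argparse subcommand per DiskReorganizer operation.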
def main():
    parser = argparse.ArgumentParser(description='Disk Reorganizer with Content Understanding')
    subparsers = parser.add_subparsers(dest='command', required=True)
    inventory_parser = subparsers.add_parser('inventory', help='Analyze file types and parser coverage')
    inventory_parser.add_argument('--disk', help='Analyze specific disk')
    inventory_parser.add_argument('--limit', type=int, default=50, help='Limit results')
    index_parser = subparsers.add_parser('index', help='Index files on a disk')
    index_parser.add_argument('disk_root', help='Root path of disk')
    index_parser.add_argument('disk_name', help='Logical name for disk')
    parse_parser = subparsers.add_parser('parse', help='Parse files to extract text')
    parse_parser.add_argument('--kind', help='Parse specific kind (text, code, pdf)')
    parse_parser.add_argument('--limit', type=int, default=100, help='Limit parse batch')
    parse_parser.add_argument('--update', action='store_true', help='Save extracted text to database')
    enrich_parser = subparsers.add_parser('enrich', help='Enrich content with LLM analysis')
    enrich_parser.add_argument('--limit', type=int, default=10, help='Limit enrichment batch')
    enrich_parser.add_argument('--use-llm', action='store_true', help='Use LLM for summarization')
    enrich_parser.add_argument('--network', action='store_true', help='Use network LM_STUDIO')
    search_parser = subparsers.add_parser('search', help='Search indexed content')
    search_parser.add_argument('query', help='Search query')
    search_parser.add_argument('--type', choices=['text', 'enrichment', 'path'], default='enrichment')
    search_parser.add_argument('--limit', type=int, default=20, help='Max results')
    classify_parser = subparsers.add_parser('classify', help='Classify files')
    classify_parser.add_argument('--disk', help='Classify specific disk')
    classify_parser.add_argument('--update', action='store_true', help='Update database')
    classify_parser.add_argument('--no-resume', action='store_true', help='Start from scratch')
    folders_parser = subparsers.add_parser('analyze-folders', help='Analyze folder structure')
    folders_parser.add_argument('--disk', help='Analyze specific disk')
    folders_parser.add_argument('--min-files', type=int, default=3)
    args = parser.parse_args()
    tool = DiskReorganizer()
    if args.command == 'inventory':
        tool.inventory_file_types(disk=args.disk, limit=args.limit)
    elif args.command == 'index':
        tool.index_disk(args.disk_root, args.disk_name)
    elif args.command == 'parse':
        tool.parse_files(kind=args.kind, limit=args.limit, update_db=args.update)
    elif args.command == 'enrich':
        tool.enrich_files(limit=args.limit, use_llm=args.use_llm, use_local=not args.network)
    elif args.command == 'search':
        tool.search_content(query=args.query, limit=args.limit, search_type=args.type)
    elif args.command == 'classify':
        tool.classify_files(disk=args.disk, update_db=args.update, resume=not args.no_resume)
    elif args.command == 'analyze-folders':
        tool.analyze_folders(disk=args.disk, min_files=args.min_files)


if __name__ == '__main__':
    main()
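
Example invocations of the new subcommands (the script name reorganizer.py is hypothetical, as the filename is not shown in this hunk, and the disk name is illustrative); every flag used here is defined in the argparse setup above:

    python reorganizer.py inventory --disk disk1 --limit 25
    python reorganizer.py parse --kind text --limit 500 --update
    python reorganizer.py search "invoice" --type enrichment --limit 20
    python reorganizer.py classify --disk disk1 --update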