chunking: add structured
This commit is contained in:
123
app/main.py
123
app/main.py
@@ -894,3 +894,126 @@ class DiskReorganizer:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
|
||||
def inventory_file_types(self, disk: Optional[str] = None, limit: int = 50):
    """Print a file-type inventory report: per-extension counts, parser
    coverage, parser implementation status, and the unparsed backlog.

    Args:
        disk: restrict the analysis to one logical disk; None means all disks.
        limit: maximum number of extension rows to include in the report.
    """
    from analysis.inventory import FileTypeInventory

    inventory = FileTypeInventory(self.db_config)
    results = inventory.analyze(disk=disk, limit=limit)

    print('\n=== FILE TYPE INVENTORY ===\n')
    print(f'{"Extension":<15} {"Count":>10} {"Total Size":>12} {"Parsed":>8} {"Status":>8} {"Parser":>15}')
    print('=' * 95)

    for ext_info in results['extensions']:
        ext = ext_info['extension']
        count = ext_info['count']
        size = ext_info['total_size']
        parsed = ext_info['parsed']
        ptype = ext_info['parser_type']
        status = '✓' if ext_info['is_parseable'] else '✗'
        print(f'{ext:<15} {count:>10,} {inventory.format_size(size):>12} {parsed:>8,} {status:>8} {ptype:>15}')

    print('=' * 95)
    summary = results['summary']
    total = summary['total_files']
    print(f'Total files: {total:,}')
    # Guard against ZeroDivisionError when the database holds no files yet.
    parseable_pct = (100 * summary['parseable_files'] / total) if total else 0.0
    print(f'Parseable: {summary["parseable_files"]:,} ({parseable_pct:.1f}%)')
    print(f'Parsed: {summary["parsed_files"]:,} ({summary["coverage"]:.1f}% coverage)')

    print('\n=== PARSER STATUS ===\n')
    for ptype, info in results['parser_status'].items():
        status = '✓ Implemented' if info['implemented'] else '✗ Not yet'
        # Show at most 10 extensions per parser to keep lines readable.
        print(f'{ptype:<15} {status:<20} {", ".join(info["extensions"][:10])}')

    if results['unparsed_by_type']:
        print('\n=== UNPARSED FILES BY TYPE ===\n')
        # Largest backlog first.
        for ptype, info in sorted(results['unparsed_by_type'].items(), key=lambda x: x[1]['count'], reverse=True):
            print(f'{ptype:<15} {info["count"]:>10,} files unparsed')
            exts = sorted(info["extensions"])[:10]
            print(f'  Extensions: {", ".join(exts)}')
|
||||
|
||||
def review_migration(self, category: Optional[str] = None, show_build: bool = False):
    """Print up to 100 classified files (path + category) for manual review.

    Args:
        category: restrict the listing to one category; None shows all.
        show_build: include 'artifacts%' (build output) categories when True;
            they are hidden by default because they dominate the listing.
    """
    # NOTE: the original imported and instantiated FileClassifier here but
    # never used it; the dead dependency has been removed.
    conn = self.get_connection()
    cursor = conn.cursor()
    try:
        query = 'SELECT path, size, category FROM files WHERE 1=1'
        params = []
        if category:
            # Parameterized to avoid SQL injection via the CLI argument.
            query += ' AND category = %s'
            params.append(category)
        if not show_build:
            query += " AND category NOT LIKE 'artifacts%'"
        query += ' LIMIT 100'
        cursor.execute(query, params)
        results = cursor.fetchall()
        print(f'\n=== MIGRATION REVIEW ({len(results)} files) ===\n')
        for path, _size, cat in results:  # size is selected but not displayed
            print(f'{path[:70]:<70} {cat:>20}')
    finally:
        cursor.close()
        conn.close()
|
||||
|
||||
def format_size(self, size: int) -> str:
    """Render a byte count as a human-readable string, e.g. '1.5KB'.

    Divides by 1024 per step through B..TB; anything larger is reported
    in petabytes.
    """
    value = float(size)
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        if value < 1024:
            return f'{value:.1f}{unit}'
        value /= 1024
    return f'{value:.1f}PB'
|
||||
|
||||
def main():
    """CLI entry point: build the argument parser, then dispatch the chosen
    subcommand to the matching DiskReorganizer method."""
    parser = argparse.ArgumentParser(description='Disk Reorganizer with Content Understanding')
    subparsers = parser.add_subparsers(dest='command', required=True)

    sub = subparsers.add_parser('inventory', help='Analyze file types and parser coverage')
    sub.add_argument('--disk', help='Analyze specific disk')
    sub.add_argument('--limit', type=int, default=50, help='Limit results')

    sub = subparsers.add_parser('index', help='Index files on a disk')
    sub.add_argument('disk_root', help='Root path of disk')
    sub.add_argument('disk_name', help='Logical name for disk')

    sub = subparsers.add_parser('parse', help='Parse files to extract text')
    sub.add_argument('--kind', help='Parse specific kind (text, code, pdf)')
    sub.add_argument('--limit', type=int, default=100, help='Limit parse batch')
    sub.add_argument('--update', action='store_true', help='Save extracted text to database')

    sub = subparsers.add_parser('enrich', help='Enrich content with LLM analysis')
    sub.add_argument('--limit', type=int, default=10, help='Limit enrichment batch')
    sub.add_argument('--use-llm', action='store_true', help='Use LLM for summarization')
    sub.add_argument('--network', action='store_true', help='Use network LM_STUDIO')

    sub = subparsers.add_parser('search', help='Search indexed content')
    sub.add_argument('query', help='Search query')
    sub.add_argument('--type', choices=['text', 'enrichment', 'path'], default='enrichment')
    sub.add_argument('--limit', type=int, default=20, help='Max results')

    sub = subparsers.add_parser('classify', help='Classify files')
    sub.add_argument('--disk', help='Classify specific disk')
    sub.add_argument('--update', action='store_true', help='Update database')
    sub.add_argument('--no-resume', action='store_true', help='Start from scratch')

    sub = subparsers.add_parser('analyze-folders', help='Analyze folder structure')
    sub.add_argument('--disk', help='Analyze specific disk')
    sub.add_argument('--min-files', type=int, default=3)

    args = parser.parse_args()
    tool = DiskReorganizer()

    # Table-driven dispatch; `required=True` above guarantees args.command
    # is one of the registered subcommand names.
    actions = {
        'inventory': lambda: tool.inventory_file_types(disk=args.disk, limit=args.limit),
        'index': lambda: tool.index_disk(args.disk_root, args.disk_name),
        'parse': lambda: tool.parse_files(kind=args.kind, limit=args.limit, update_db=args.update),
        'enrich': lambda: tool.enrich_files(limit=args.limit, use_llm=args.use_llm, use_local=not args.network),
        'search': lambda: tool.search_content(query=args.query, limit=args.limit, search_type=args.type),
        'classify': lambda: tool.classify_files(disk=args.disk, update_db=args.update, resume=not args.no_resume),
        'analyze-folders': lambda: tool.analyze_folders(disk=args.disk, min_files=args.min_files),
    }
    actions[args.command]()
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||||
|
||||
Reference in New Issue
Block a user