clean up code

Author: mike
Date: 2025-12-13 12:24:43 +01:00
parent 7ce8c8c73d
commit 78042ff2a2
4 changed files with 326 additions and 73 deletions


@@ -535,14 +535,18 @@ class DiskReorganizer:
         try:
             query = "SELECT path, size, disk_label FROM files WHERE 1=1"
             params = []
             if kind:
-                suffix_map = {'text': "('.txt', '.md', '.log', '.json')", 'code': "('.py', '.js', '.java', '.go')", 'pdf': "('.pdf',)"}
+                suffix_map = {
+                    'text': ['.txt', '.md', '.log', '.json', '.yaml', '.yml'],
+                    'code': ['.py', '.js', '.java', '.go', '.rs', '.ts', '.cpp', '.h'],
+                    'pdf': ['.pdf']
+                }
                 if kind in suffix_map:
-                    query += f" AND RIGHT(path, 4) IN {suffix_map[kind]} OR RIGHT(path, 3) IN {suffix_map[kind]}"
+                    conditions = ' OR '.join([f"path LIKE '%{ext}'" for ext in suffix_map[kind]])
+                    query += f" AND ({conditions})"
             query += f" LIMIT {limit}"
-            cursor.execute(query, params)
+            cursor.execute(query)
             files = cursor.fetchall()
             print(f"\n=== PARSING FILES ===\nProcessing {len(files)} files\n")
@@ -580,30 +584,63 @@ class DiskReorganizer:
             cursor.close()
             conn.close()

-    def enrich_files(self, limit: int = 10, llm_endpoint: str = None, use_local: bool = False):
+    def enrich_files(self, limit: int = 10, use_llm: bool = False, use_local: bool = True, batch_size: int = 100):
         from enrichment.enricher import ContentEnricher
-        enricher = ContentEnricher()
+        from enrichment.llm_client import LLMClient
+        llm_client = LLMClient(use_local=use_local) if use_llm else None
+        enricher = ContentEnricher(llm_client=llm_client)
         conn = self.get_connection()
         cursor = conn.cursor()
         try:
-            cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL LIMIT {limit}")
+            cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL AND (enrichment IS NULL OR enrichment = '{{}}'::jsonb) LIMIT {limit}")
             files = cursor.fetchall()
-            print(f"\n=== ENRICHING CONTENT ===\nProcessing {len(files)} files\n")
+            print(f"\n=== ENRICHING CONTENT ===")
+            print(f"Processing {len(files)} files")
+            if use_llm:
+                print(f"Using LLM: {'Local OLLAMA' if use_local else 'Network LM_STUDIO'}\n")
+            else:
+                print("Using rule-based enrichment only\n")
-            for path, text in files:
-                enrichment = enricher.enrich(text[:5000], use_llm=False)
-                print(f"{path[:60]}")
+            enriched_count = 0
+            batch = []
+            for idx, (path, text) in enumerate(files, 1):
+                if not text:
+                    continue
+                enrichment = enricher.enrich(text[:5000], use_llm=use_llm)
+                print(f"{idx}/{len(files)} {path[:60]}")
                 print(f"  Quality: {enrichment.get('quality')} | Words: {enrichment.get('word_count'):,}")
-                print(f"  PII: {list(enrichment.get('has_pii', {}).keys())}")
-                print(f"  Topics: {', '.join(enrichment.get('topics', [])[:5])}\n")
+                if enrichment.get('security', {}).get('has_pii'):
+                    print(f"  PII: {list(enrichment.get('security', {}).get('pii_details', {}).keys())}")
+                if enrichment.get('tech_stack'):
+                    print(f"  Tech: {', '.join(enrichment['tech_stack'][:5])}")
+                if enrichment.get('topics'):
+                    print(f"  Topics: {', '.join(enrichment['topics'][:5])}")
+                if use_llm and enrichment.get('llm_summary'):
+                    print(f"  LLM Summary: {enrichment['llm_summary'][:100]}...")
+                if use_llm and enrichment.get('llm_intent'):
+                    print(f"  Intent: {enrichment['llm_intent'][:100]}...")
+                print()
-                cursor.execute("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", (json.dumps(enrichment), path))
+                batch.append((json.dumps(enrichment), path))
+                enriched_count += 1
+                if len(batch) >= batch_size:
+                    cursor.executemany("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", batch)
+                    conn.commit()
+                    batch.clear()
+                    print(f"  Committed batch ({enriched_count} files so far)")
-            conn.commit()
-            print(f"Enriched {len(files)} files")
+            if batch:
+                cursor.executemany("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", batch)
+                conn.commit()
+            print(f"\nEnriched {enriched_count} files")
         finally:
             cursor.close()
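The rewritten loop assumes a richer enrichment payload than the old has_pii/topics pair. ContentEnricher itself is not part of this diff, so the shape below is inferred purely from the accessors above; every value is invented for illustration, and llm_summary/llm_intent only appear when use_llm is set.

# Hypothetical enrichment payload, shaped to satisfy the prints above.
enrichment = {
    'quality': 'high',
    'word_count': 1842,
    'security': {'has_pii': True, 'pii_details': {'email': 2, 'phone': 1}},
    'tech_stack': ['python', 'postgresql'],
    'topics': ['disk inventory', 'migration'],
    'llm_summary': '...',  # present only when use_llm is set
    'llm_intent': '...',   # present only when use_llm is set
}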
@@ -695,6 +732,75 @@ class DiskReorganizer:
             cursor.close()
             conn.close()

+    def search_content(self, query: str, limit: int=20, search_type: str='text'):
+        conn = self.get_connection()
+        cursor = conn.cursor()
+        try:
+            if search_type == 'text':
+                cursor.execute('''
+                    SELECT path, disk_label, size, category,
+                           ts_rank(to_tsvector('english', COALESCE(extracted_text, '')), plainto_tsquery('english', %s)) as rank,
+                           LEFT(extracted_text, 200) as snippet
+                    FROM files
+                    WHERE extracted_text IS NOT NULL
+                      AND to_tsvector('english', extracted_text) @@ plainto_tsquery('english', %s)
+                    ORDER BY rank DESC
+                    LIMIT %s
+                ''', (query, query, limit))
+            elif search_type == 'enrichment':
+                cursor.execute('''
+                    SELECT path, disk_label, size, category, enrichment
+                    FROM files
+                    WHERE enrichment IS NOT NULL
+                      AND enrichment::text ILIKE %s
+                    LIMIT %s
+                ''', (f'%{query}%', limit))
+            elif search_type == 'path':
+                cursor.execute('''
+                    SELECT path, disk_label, size, category
+                    FROM files
+                    WHERE path ILIKE %s
+                    LIMIT %s
+                ''', (f'%{query}%', limit))
+            else:
+                logger.error(f'Unknown search type: {search_type}')
+                return
+            results = cursor.fetchall()
+            if not results:
+                print(f'No results found for: {query}')
+                return
+            print(f'\n=== SEARCH RESULTS: {len(results)} matches for "{query}" ===\n')
+            for idx, row in enumerate(results, 1):
+                if search_type == 'text':
+                    path, disk, size, category, rank, snippet = row
+                    print(f'{idx}. {path}')
+                    print(f'   Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')
+                    print(f'   Rank: {rank:.4f}')
+                    if snippet:
+                        print(f'   Snippet: {snippet[:150]}...')
+                elif search_type == 'enrichment':
+                    path, disk, size, category, enrichment = row
+                    print(f'{idx}. {path}')
+                    print(f'   Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')
+                    if enrichment:
+                        import json
+                        enrich_data = json.loads(enrichment) if isinstance(enrichment, str) else enrichment
+                        if 'topics' in enrich_data:
+                            print(f'   Topics: {", ".join(enrich_data["topics"][:5])}')
+                        if 'tech_stack' in enrich_data:
+                            print(f'   Tech: {", ".join(enrich_data["tech_stack"][:5])}')
+                else:
+                    path, disk, size, category = row
+                    print(f'{idx}. {path}')
+                    print(f'   Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')
+                print()
+        finally:
+            cursor.close()
+            conn.close()
+
     def analyze_folders(self, disk: Optional[str]=None, min_files: int=3):
         from analysis.folder_analyzer import FolderAnalyzer
         analyzer = FolderAnalyzer()
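One operational note on the new text mode: the WHERE clause computes to_tsvector('english', extracted_text) per row, so without an expression index every search is a sequential scan over files. A hedged sketch of the matching PostgreSQL GIN index follows; the index name and connection details are illustrative, since get_connection() internals are not part of this diff.

import psycopg2  # assumed driver, consistent with the %s placeholder style above

conn = psycopg2.connect(dbname='files')  # illustrative connection
with conn, conn.cursor() as cursor:
    # The indexed expression must match the WHERE clause exactly for the
    # planner to use it for the @@ predicate.
    cursor.execute("""
        CREATE INDEX IF NOT EXISTS files_extracted_text_fts
        ON files USING GIN (to_tsvector('english', extracted_text))
    """)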
@@ -866,8 +972,8 @@ def main():
     enrich_parser = subparsers.add_parser('enrich', help='Enrich content with LLM analysis')
     enrich_parser.add_argument('--limit', type=int, default=10, help='Limit enrichment batch')
-    enrich_parser.add_argument('--llm-endpoint', default='http://192.168.1.74:1234', help='LLM endpoint')
-    enrich_parser.add_argument('--local', action='store_true', help='Use local Ollama')
+    enrich_parser.add_argument('--use-llm', action='store_true', help='Use LLM for summarization')
+    enrich_parser.add_argument('--network', action='store_true', help='Use network LM_STUDIO instead of local OLLAMA')
     classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization')
     classify_parser.add_argument('--disk', help='Classify specific disk')
@@ -876,6 +982,10 @@ def main():
     folders_parser = subparsers.add_parser('analyze-folders', help='Analyze folder structure and infer project intent')
     folders_parser.add_argument('--disk', help='Analyze specific disk')
     folders_parser.add_argument('--min-files', type=int, default=3, help='Minimum files per folder')
+    search_parser = subparsers.add_parser('search', help='Search indexed content')
+    search_parser.add_argument('query', help='Search query')
+    search_parser.add_argument('--type', choices=['text', 'enrichment', 'path'], default='enrichment', help='Search type')
+    search_parser.add_argument('--limit', type=int, default=20, help='Max results')
     review_parser = subparsers.add_parser('review', help='Review proposed migration structure')
     review_parser.add_argument('--category', help='Review specific category')
     review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts')
@@ -905,11 +1015,13 @@ def main():
     elif args.command == 'parse':
         tool.parse_files(kind=args.kind, limit=args.limit, update_db=args.update)
     elif args.command == 'enrich':
-        tool.enrich_files(limit=args.limit, llm_endpoint=args.llm_endpoint, use_local=args.local)
+        tool.enrich_files(limit=args.limit, use_llm=args.use_llm, use_local=not args.network)
     elif args.command == 'classify':
         tool.classify_files(disk=args.disk, update_db=args.update, resume=not args.no_resume)
     elif args.command == 'analyze-folders':
         tool.analyze_folders(disk=args.disk, min_files=args.min_files)
+    elif args.command == 'search':
+        tool.search_content(query=args.query, limit=args.limit, search_type=args.type)
     elif args.command == 'review':
         tool.review_migration(category=args.category, show_build=args.show_build)
     elif args.command == 'report':
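Taken together with the dispatch above, the updated command surface looks roughly like the invocations below. The script name is assumed; per use_local=not args.network, enrich with --use-llm talks to local OLLAMA by default and --network switches it to the LM_STUDIO endpoint.

# Hypothetical invocations; reorganizer.py stands in for the real entry point.
python reorganizer.py enrich --limit 50 --use-llm             # local OLLAMA
python reorganizer.py enrich --limit 50 --use-llm --network   # network LM_STUDIO
python reorganizer.py search kubernetes                       # --type defaults to 'enrichment'
python reorganizer.py search "docker compose" --type text --limit 10
python reorganizer.py search invoice --type path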