clean up code
This commit is contained in:
150
app/main.py
150
app/main.py
@@ -535,14 +535,18 @@ class DiskReorganizer:
|
||||
|
||||
try:
|
||||
query = "SELECT path, size, disk_label FROM files WHERE 1=1"
|
||||
params = []
|
||||
if kind:
|
||||
suffix_map = {'text': "('.txt', '.md', '.log', '.json')", 'code': "('.py', '.js', '.java', '.go')", 'pdf': "('.pdf',)"}
|
||||
suffix_map = {
|
||||
'text': ['.txt', '.md', '.log', '.json', '.yaml', '.yml'],
|
||||
'code': ['.py', '.js', '.java', '.go', '.rs', '.ts', '.cpp', '.h'],
|
||||
'pdf': ['.pdf']
|
||||
}
|
||||
if kind in suffix_map:
|
||||
query += f" AND RIGHT(path, 4) IN {suffix_map[kind]} OR RIGHT(path, 3) IN {suffix_map[kind]}"
|
||||
conditions = ' OR '.join([f"path LIKE '%{ext}'" for ext in suffix_map[kind]])
|
||||
query += f" AND ({conditions})"
|
||||
query += f" LIMIT {limit}"
|
||||
|
||||
cursor.execute(query, params)
|
||||
cursor.execute(query)
|
||||
files = cursor.fetchall()
|
||||
|
||||
print(f"\n=== PARSING FILES ===\nProcessing {len(files)} files\n")
|
||||
@@ -580,30 +584,63 @@ class DiskReorganizer:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
def enrich_files(self, limit: int = 10, llm_endpoint: str = None, use_local: bool = False):
|
||||
def enrich_files(self, limit: int = 10, use_llm: bool = False, use_local: bool = True, batch_size: int = 100):
|
||||
from enrichment.enricher import ContentEnricher
|
||||
from enrichment.llm_client import LLMClient
|
||||
|
||||
llm_client = LLMClient(use_local=use_local) if use_llm else None
|
||||
enricher = ContentEnricher(llm_client=llm_client)
|
||||
|
||||
enricher = ContentEnricher()
|
||||
conn = self.get_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
try:
|
||||
cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL LIMIT {limit}")
|
||||
cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL AND (enrichment IS NULL OR enrichment = '{{}}'::jsonb) LIMIT {limit}")
|
||||
files = cursor.fetchall()
|
||||
|
||||
print(f"\n=== ENRICHING CONTENT ===\nProcessing {len(files)} files\n")
|
||||
print(f"\n=== ENRICHING CONTENT ===")
|
||||
print(f"Processing {len(files)} files")
|
||||
if use_llm:
|
||||
print(f"Using LLM: {'Local OLLAMA' if use_local else 'Network LM_STUDIO'}\n")
|
||||
else:
|
||||
print("Using rule-based enrichment only\n")
|
||||
|
||||
for path, text in files:
|
||||
enrichment = enricher.enrich(text[:5000], use_llm=False)
|
||||
print(f"{path[:60]}")
|
||||
enriched_count = 0
|
||||
batch = []
|
||||
for idx, (path, text) in enumerate(files, 1):
|
||||
if not text:
|
||||
continue
|
||||
|
||||
enrichment = enricher.enrich(text[:5000], use_llm=use_llm)
|
||||
|
||||
print(f"{idx}/{len(files)} {path[:60]}")
|
||||
print(f" Quality: {enrichment.get('quality')} | Words: {enrichment.get('word_count'):,}")
|
||||
print(f" PII: {list(enrichment.get('has_pii', {}).keys())}")
|
||||
print(f" Topics: {', '.join(enrichment.get('topics', [])[:5])}\n")
|
||||
if enrichment.get('security', {}).get('has_pii'):
|
||||
print(f" PII: {list(enrichment.get('security', {}).get('pii_details', {}).keys())}")
|
||||
if enrichment.get('tech_stack'):
|
||||
print(f" Tech: {', '.join(enrichment['tech_stack'][:5])}")
|
||||
if enrichment.get('topics'):
|
||||
print(f" Topics: {', '.join(enrichment['topics'][:5])}")
|
||||
if use_llm and enrichment.get('llm_summary'):
|
||||
print(f" LLM Summary: {enrichment['llm_summary'][:100]}...")
|
||||
if use_llm and enrichment.get('llm_intent'):
|
||||
print(f" Intent: {enrichment['llm_intent'][:100]}...")
|
||||
print()
|
||||
|
||||
cursor.execute("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", (json.dumps(enrichment), path))
|
||||
batch.append((json.dumps(enrichment), path))
|
||||
enriched_count += 1
|
||||
|
||||
conn.commit()
|
||||
print(f"Enriched {len(files)} files")
|
||||
if len(batch) >= batch_size:
|
||||
cursor.executemany("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", batch)
|
||||
conn.commit()
|
||||
batch.clear()
|
||||
print(f" Committed batch ({enriched_count} files so far)")
|
||||
|
||||
if batch:
|
||||
cursor.executemany("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", batch)
|
||||
conn.commit()
|
||||
|
||||
print(f"\nEnriched {enriched_count} files")
|
||||
|
||||
finally:
|
||||
cursor.close()
|
||||
@@ -695,6 +732,75 @@ class DiskReorganizer:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
def search_content(self, query: str, limit: int=20, search_type: str='text'):
    """Search the ``files`` table and print the matching rows.

    Args:
        query: Search term. For ``'text'`` it is handed to
            ``plainto_tsquery('english', ...)``; for ``'enrichment'`` and
            ``'path'`` it is matched as a case-insensitive literal
            substring via ``ILIKE``.
        limit: Value bound to the SQL ``LIMIT`` clause.
        search_type: One of ``'text'`` (full-text search over
            ``extracted_text``, ordered by ``ts_rank``), ``'enrichment'``
            (substring match on ``enrichment::text``) or ``'path'``
            (substring match on ``path``).

    Returns:
        None. Results are printed to stdout; an unknown ``search_type`` is
        reported through ``logger.error`` and nothing is queried.
    """
    # Function-scope import, hoisted out of the per-row loop.
    import json

    # Validate before touching the database so a typo in the search type
    # does not cost a connection round-trip.
    if search_type not in ('text', 'enrichment', 'path'):
        logger.error('Unknown search type: %s', search_type)
        return

    # Escape LIKE metacharacters so that '%', '_' and '\' typed by the user
    # are matched literally instead of acting as wildcards. Backslash is the
    # default LIKE/ILIKE escape character in PostgreSQL and must be escaped
    # first so the escapes added for '%' and '_' are not doubled.
    like_term = query.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
    like_pattern = f'%{like_term}%'

    if search_type == 'text':
        sql = '''
            SELECT path, disk_label, size, category,
                   ts_rank(to_tsvector('english', COALESCE(extracted_text, '')), plainto_tsquery('english', %s)) as rank,
                   LEFT(extracted_text, 200) as snippet
            FROM files
            WHERE extracted_text IS NOT NULL
              AND to_tsvector('english', extracted_text) @@ plainto_tsquery('english', %s)
            ORDER BY rank DESC
            LIMIT %s
        '''
        params = (query, query, limit)
    elif search_type == 'enrichment':
        sql = '''
            SELECT path, disk_label, size, category, enrichment
            FROM files
            WHERE enrichment IS NOT NULL
              AND enrichment::text ILIKE %s
            LIMIT %s
        '''
        params = (like_pattern, limit)
    else:  # 'path'
        sql = '''
            SELECT path, disk_label, size, category
            FROM files
            WHERE path ILIKE %s
            LIMIT %s
        '''
        params = (like_pattern, limit)

    conn = self.get_connection()
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, params)
            results = cursor.fetchall()
            if not results:
                print(f'No results found for: {query}')
                return

            print(f'\n=== SEARCH RESULTS: {len(results)} matches for "{query}" ===\n')
            for idx, row in enumerate(results, 1):
                # The first four columns are common to every search type;
                # anything after them is type-specific.
                path, disk, size, category, *extra = row
                print(f'{idx}. {path}')
                print(f' Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')

                if search_type == 'text':
                    rank, snippet = extra
                    print(f' Rank: {rank:.4f}')
                    if snippet:
                        print(f' Snippet: {snippet[:150]}...')
                elif search_type == 'enrichment':
                    (enrichment,) = extra
                    if enrichment:
                        # The driver may hand jsonb back either as a JSON
                        # string or as an already-decoded object.
                        enrich_data = json.loads(enrichment) if isinstance(enrichment, str) else enrichment
                        if 'topics' in enrich_data:
                            print(f' Topics: {", ".join(enrich_data["topics"][:5])}')
                        if 'tech_stack' in enrich_data:
                            print(f' Tech: {", ".join(enrich_data["tech_stack"][:5])}')
                print()
        finally:
            cursor.close()
    finally:
        # Close the connection even when cursor creation itself failed.
        conn.close()
|
||||
|
||||
def analyze_folders(self, disk: Optional[str]=None, min_files: int=3):
|
||||
from analysis.folder_analyzer import FolderAnalyzer
|
||||
analyzer = FolderAnalyzer()
|
||||
@@ -866,8 +972,8 @@ def main():
|
||||
|
||||
enrich_parser = subparsers.add_parser('enrich', help='Enrich content with LLM analysis')
|
||||
enrich_parser.add_argument('--limit', type=int, default=10, help='Limit enrichment batch')
|
||||
enrich_parser.add_argument('--llm-endpoint', default='http://192.168.1.74:1234', help='LLM endpoint')
|
||||
enrich_parser.add_argument('--local', action='store_true', help='Use local Ollama')
|
||||
enrich_parser.add_argument('--use-llm', action='store_true', help='Use LLM for summarization')
|
||||
enrich_parser.add_argument('--network', action='store_true', help='Use network LM_STUDIO instead of local OLLAMA')
|
||||
|
||||
classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization')
|
||||
classify_parser.add_argument('--disk', help='Classify specific disk')
|
||||
@@ -876,6 +982,10 @@ def main():
|
||||
folders_parser = subparsers.add_parser('analyze-folders', help='Analyze folder structure and infer project intent')
|
||||
folders_parser.add_argument('--disk', help='Analyze specific disk')
|
||||
folders_parser.add_argument('--min-files', type=int, default=3, help='Minimum files per folder')
|
||||
search_parser = subparsers.add_parser('search', help='Search indexed content')
|
||||
search_parser.add_argument('query', help='Search query')
|
||||
search_parser.add_argument('--type', choices=['text', 'enrichment', 'path'], default='enrichment', help='Search type')
|
||||
search_parser.add_argument('--limit', type=int, default=20, help='Max results')
|
||||
review_parser = subparsers.add_parser('review', help='Review proposed migration structure')
|
||||
review_parser.add_argument('--category', help='Review specific category')
|
||||
review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts')
|
||||
@@ -905,11 +1015,13 @@ def main():
|
||||
elif args.command == 'parse':
|
||||
tool.parse_files(kind=args.kind, limit=args.limit, update_db=args.update)
|
||||
elif args.command == 'enrich':
|
||||
tool.enrich_files(limit=args.limit, llm_endpoint=args.llm_endpoint, use_local=args.local)
|
||||
tool.enrich_files(limit=args.limit, use_llm=args.use_llm, use_local=not args.network)
|
||||
elif args.command == 'classify':
|
||||
tool.classify_files(disk=args.disk, update_db=args.update, resume=not args.no_resume)
|
||||
elif args.command == 'analyze-folders':
|
||||
tool.analyze_folders(disk=args.disk, min_files=args.min_files)
|
||||
elif args.command == 'search':
|
||||
tool.search_content(query=args.query, limit=args.limit, search_type=args.type)
|
||||
elif args.command == 'review':
|
||||
tool.review_migration(category=args.category, show_build=args.show_build)
|
||||
elif args.command == 'report':
|
||||
|
||||
Reference in New Issue
Block a user