clean up code

Author: mike
Date: 2025-12-13 12:24:43 +01:00
parent 7ce8c8c73d
commit 78042ff2a2
4 changed files with 326 additions and 73 deletions


@@ -535,14 +535,18 @@ class DiskReorganizer:
         try:
             query = "SELECT path, size, disk_label FROM files WHERE 1=1"
             params = []
             if kind:
-                suffix_map = {'text': "('.txt', '.md', '.log', '.json')", 'code': "('.py', '.js', '.java', '.go')", 'pdf': "('.pdf',)"}
+                suffix_map = {
+                    'text': ['.txt', '.md', '.log', '.json', '.yaml', '.yml'],
+                    'code': ['.py', '.js', '.java', '.go', '.rs', '.ts', '.cpp', '.h'],
+                    'pdf': ['.pdf']
+                }
                 if kind in suffix_map:
-                    query += f" AND RIGHT(path, 4) IN {suffix_map[kind]} OR RIGHT(path, 3) IN {suffix_map[kind]}"
+                    conditions = ' OR '.join([f"path LIKE '%{ext}'" for ext in suffix_map[kind]])
+                    query += f" AND ({conditions})"
             query += f" LIMIT {limit}"
-            cursor.execute(query, params)
+            cursor.execute(query)
             files = cursor.fetchall()
             print(f"\n=== PARSING FILES ===\nProcessing {len(files)} files\n")
@@ -580,30 +584,63 @@ class DiskReorganizer:
             cursor.close()
             conn.close()

-    def enrich_files(self, limit: int = 10, llm_endpoint: str = None, use_local: bool = False):
+    def enrich_files(self, limit: int = 10, use_llm: bool = False, use_local: bool = True, batch_size: int = 100):
         from enrichment.enricher import ContentEnricher
-        enricher = ContentEnricher()
+        from enrichment.llm_client import LLMClient
+        llm_client = LLMClient(use_local=use_local) if use_llm else None
+        enricher = ContentEnricher(llm_client=llm_client)
         conn = self.get_connection()
         cursor = conn.cursor()
         try:
-            cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL LIMIT {limit}")
+            cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL AND (enrichment IS NULL OR enrichment = '{{}}'::jsonb) LIMIT {limit}")
             files = cursor.fetchall()
-            print(f"\n=== ENRICHING CONTENT ===\nProcessing {len(files)} files\n")
+            print(f"\n=== ENRICHING CONTENT ===")
+            print(f"Processing {len(files)} files")
+            if use_llm:
+                print(f"Using LLM: {'Local OLLAMA' if use_local else 'Network LM_STUDIO'}\n")
+            else:
+                print("Using rule-based enrichment only\n")
-            for path, text in files:
-                enrichment = enricher.enrich(text[:5000], use_llm=False)
-                print(f"{path[:60]}")
+            enriched_count = 0
+            batch = []
+            for idx, (path, text) in enumerate(files, 1):
+                if not text:
+                    continue
+                enrichment = enricher.enrich(text[:5000], use_llm=use_llm)
+                print(f"{idx}/{len(files)} {path[:60]}")
                 print(f"  Quality: {enrichment.get('quality')} | Words: {enrichment.get('word_count'):,}")
-                print(f"  PII: {list(enrichment.get('has_pii', {}).keys())}")
-                print(f"  Topics: {', '.join(enrichment.get('topics', [])[:5])}\n")
+                if enrichment.get('security', {}).get('has_pii'):
+                    print(f"  PII: {list(enrichment.get('security', {}).get('pii_details', {}).keys())}")
+                if enrichment.get('tech_stack'):
+                    print(f"  Tech: {', '.join(enrichment['tech_stack'][:5])}")
+                if enrichment.get('topics'):
+                    print(f"  Topics: {', '.join(enrichment['topics'][:5])}")
+                if use_llm and enrichment.get('llm_summary'):
+                    print(f"  LLM Summary: {enrichment['llm_summary'][:100]}...")
+                if use_llm and enrichment.get('llm_intent'):
+                    print(f"  Intent: {enrichment['llm_intent'][:100]}...")
+                print()
-                cursor.execute("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", (json.dumps(enrichment), path))
+                batch.append((json.dumps(enrichment), path))
+                enriched_count += 1
+                if len(batch) >= batch_size:
+                    cursor.executemany("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", batch)
+                    conn.commit()
+                    batch.clear()
+                    print(f"  Committed batch ({enriched_count} files so far)")
-            conn.commit()
-            print(f"Enriched {len(files)} files")
+            if batch:
+                cursor.executemany("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", batch)
+                conn.commit()
+            print(f"\nEnriched {enriched_count} files")
         finally:
             cursor.close()
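The rewritten loop assumes a richer enrichment payload than the old has_pii/topics pair. ContentEnricher itself is not part of this diff, so the shape below is inferred purely from the accessors above; every value is invented for illustration, and llm_summary/llm_intent only appear when use_llm is set.

# Hypothetical enrichment payload, shaped to satisfy the prints above.
enrichment = {
    'quality': 'high',
    'word_count': 1842,
    'security': {'has_pii': True, 'pii_details': {'email': 2, 'phone': 1}},
    'tech_stack': ['python', 'postgresql'],
    'topics': ['disk inventory', 'migration'],
    'llm_summary': '...',  # present only when use_llm is set
    'llm_intent': '...',   # present only when use_llm is set
}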
@@ -695,6 +732,75 @@ class DiskReorganizer:
             cursor.close()
             conn.close()

+    def search_content(self, query: str, limit: int=20, search_type: str='text'):
+        conn = self.get_connection()
+        cursor = conn.cursor()
+        try:
+            if search_type == 'text':
+                cursor.execute('''
+                    SELECT path, disk_label, size, category,
+                           ts_rank(to_tsvector('english', COALESCE(extracted_text, '')), plainto_tsquery('english', %s)) as rank,
+                           LEFT(extracted_text, 200) as snippet
+                    FROM files
+                    WHERE extracted_text IS NOT NULL
+                      AND to_tsvector('english', extracted_text) @@ plainto_tsquery('english', %s)
+                    ORDER BY rank DESC
+                    LIMIT %s
+                ''', (query, query, limit))
+            elif search_type == 'enrichment':
+                cursor.execute('''
+                    SELECT path, disk_label, size, category, enrichment
+                    FROM files
+                    WHERE enrichment IS NOT NULL
+                      AND enrichment::text ILIKE %s
+                    LIMIT %s
+                ''', (f'%{query}%', limit))
+            elif search_type == 'path':
+                cursor.execute('''
+                    SELECT path, disk_label, size, category
+                    FROM files
+                    WHERE path ILIKE %s
+                    LIMIT %s
+                ''', (f'%{query}%', limit))
+            else:
+                logger.error(f'Unknown search type: {search_type}')
+                return
+            results = cursor.fetchall()
+            if not results:
+                print(f'No results found for: {query}')
+                return
+            print(f'\n=== SEARCH RESULTS: {len(results)} matches for "{query}" ===\n')
+            for idx, row in enumerate(results, 1):
+                if search_type == 'text':
+                    path, disk, size, category, rank, snippet = row
+                    print(f'{idx}. {path}')
+                    print(f'   Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')
+                    print(f'   Rank: {rank:.4f}')
+                    if snippet:
+                        print(f'   Snippet: {snippet[:150]}...')
+                elif search_type == 'enrichment':
+                    path, disk, size, category, enrichment = row
+                    print(f'{idx}. {path}')
+                    print(f'   Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')
+                    if enrichment:
+                        import json
+                        enrich_data = json.loads(enrichment) if isinstance(enrichment, str) else enrichment
+                        if 'topics' in enrich_data:
+                            print(f'   Topics: {", ".join(enrich_data["topics"][:5])}')
+                        if 'tech_stack' in enrich_data:
+                            print(f'   Tech: {", ".join(enrich_data["tech_stack"][:5])}')
+                else:
+                    path, disk, size, category = row
+                    print(f'{idx}. {path}')
+                    print(f'   Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')
+                print()
+        finally:
+            cursor.close()
+            conn.close()
+
     def analyze_folders(self, disk: Optional[str]=None, min_files: int=3):
         from analysis.folder_analyzer import FolderAnalyzer
         analyzer = FolderAnalyzer()
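One operational note on the new text mode: the WHERE clause computes to_tsvector('english', extracted_text) per row, so without an expression index every search is a sequential scan over files. A hedged sketch of the matching PostgreSQL GIN index follows; the index name and connection details are illustrative, since get_connection() internals are not part of this diff.

import psycopg2  # assumed driver, consistent with the %s placeholder style above

conn = psycopg2.connect(dbname='files')  # illustrative connection
with conn, conn.cursor() as cursor:
    # The indexed expression must match the WHERE clause exactly for the
    # planner to use it for the @@ predicate.
    cursor.execute("""
        CREATE INDEX IF NOT EXISTS files_extracted_text_fts
        ON files USING GIN (to_tsvector('english', extracted_text))
    """)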
@@ -866,8 +972,8 @@ def main():
     enrich_parser = subparsers.add_parser('enrich', help='Enrich content with LLM analysis')
     enrich_parser.add_argument('--limit', type=int, default=10, help='Limit enrichment batch')
-    enrich_parser.add_argument('--llm-endpoint', default='http://192.168.1.74:1234', help='LLM endpoint')
-    enrich_parser.add_argument('--local', action='store_true', help='Use local Ollama')
+    enrich_parser.add_argument('--use-llm', action='store_true', help='Use LLM for summarization')
+    enrich_parser.add_argument('--network', action='store_true', help='Use network LM_STUDIO instead of local OLLAMA')
     classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization')
     classify_parser.add_argument('--disk', help='Classify specific disk')
@@ -876,6 +982,10 @@ def main():
     folders_parser = subparsers.add_parser('analyze-folders', help='Analyze folder structure and infer project intent')
     folders_parser.add_argument('--disk', help='Analyze specific disk')
     folders_parser.add_argument('--min-files', type=int, default=3, help='Minimum files per folder')
+    search_parser = subparsers.add_parser('search', help='Search indexed content')
+    search_parser.add_argument('query', help='Search query')
+    search_parser.add_argument('--type', choices=['text', 'enrichment', 'path'], default='enrichment', help='Search type')
+    search_parser.add_argument('--limit', type=int, default=20, help='Max results')
     review_parser = subparsers.add_parser('review', help='Review proposed migration structure')
     review_parser.add_argument('--category', help='Review specific category')
     review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts')
@@ -905,11 +1015,13 @@ def main():
     elif args.command == 'parse':
         tool.parse_files(kind=args.kind, limit=args.limit, update_db=args.update)
     elif args.command == 'enrich':
-        tool.enrich_files(limit=args.limit, llm_endpoint=args.llm_endpoint, use_local=args.local)
+        tool.enrich_files(limit=args.limit, use_llm=args.use_llm, use_local=not args.network)
     elif args.command == 'classify':
         tool.classify_files(disk=args.disk, update_db=args.update, resume=not args.no_resume)
     elif args.command == 'analyze-folders':
         tool.analyze_folders(disk=args.disk, min_files=args.min_files)
+    elif args.command == 'search':
+        tool.search_content(query=args.query, limit=args.limit, search_type=args.type)
     elif args.command == 'review':
         tool.review_migration(category=args.category, show_build=args.show_build)
     elif args.command == 'report':
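Taken together with the dispatch above, the updated command surface looks roughly like the invocations below. The script name is assumed; per use_local=not args.network, enrich with --use-llm talks to local OLLAMA by default and --network switches it to the LM_STUDIO endpoint.

# Hypothetical invocations; reorganizer.py stands in for the real entry point.
python reorganizer.py enrich --limit 50 --use-llm             # local OLLAMA
python reorganizer.py enrich --limit 50 --use-llm --network   # network LM_STUDIO
python reorganizer.py search kubernetes                       # --type defaults to 'enrichment'
python reorganizer.py search "docker compose" --type text --limit 10
python reorganizer.py search invoice --type path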