From 78042ff2a22a1a847d5f050318e3ca08476c1872 Mon Sep 17 00:00:00 2001
From: mike
Date: Sat, 13 Dec 2025 12:24:43 +0100
Subject: [PATCH] clean up code

---
 app/enrichment/enricher.py          | 116 +++++++++++++++------
 app/enrichment/llm_client.py        |  68 ++++++++-----
 app/main.py                         | 150 ++++++++++++++++++++++++----
 app/parsers/transcription_parser.py |  65 ++++++++++++
 4 files changed, 326 insertions(+), 73 deletions(-)
 create mode 100644 app/parsers/transcription_parser.py

diff --git a/app/enrichment/enricher.py b/app/enrichment/enricher.py
index 9185ce0..fe67d9a 100644
--- a/app/enrichment/enricher.py
+++ b/app/enrichment/enricher.py
@@ -1,59 +1,119 @@
-from typing import Dict
+from typing import Dict, List
 import re

 class ContentEnricher:
+    tech_keywords = {'transcribe', 'transcription', 'whisper', 'speech-to-text', 'audio', 'video', 'subtitle', 'caption', 'srt', 'vtt', 'ffmpeg', 'opencv', 'pytorch', 'tensorflow', 'cuda', 'gpu', 'ml', 'nlp', 'llm', 'ollama', 'docker', 'kubernetes', 'postgres', 'database', 'api', 'rest', 'graphql', 'python', 'javascript', 'java', 'rust', 'golang'}
+
     def __init__(self, llm_client=None):
         self.llm_client = llm_client
         self.pii_patterns = {
             'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
             'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
             'ssn': r'\b\d{3}-\d{2}-\d{4}\b',
-            'credit_card': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b'
+            'credit_card': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b',
+            'api_key': r'(?i)(api[_-]?key|token|secret)["\']?\s*[:=]\s*["\']?([a-zA-Z0-9_\-]{20,})',
+            'password': r'(?i)(password|passwd|pwd)["\']?\s*[:=]\s*["\']([^"\']{8,})'
         }

     def enrich(self, text: str, use_llm: bool = False) -> Dict:
         enrichment = {
             'summary': self._basic_summary(text),
             'word_count': len(text.split()),
-            'has_pii': self._detect_pii(text),
-            'quality': self._assess_quality(text),
-            'topics': self._extract_basic_topics(text)
+            'topics': self._extract_topics(text),
+            'entities': self._extract_entities(text),
+            'tech_stack': self._detect_tech(text),
+            'security': {
+                'has_pii': bool(self._detect_pii(text)),
+                'has_credentials': self._detect_credentials(text),
+                'pii_details': self._detect_pii(text)
+            },
+            'quality': self._assess_quality(text)
         }

         if use_llm and self.llm_client:
-            llm_result = self.llm_client.classify_content(text)
-            if llm_result.get('success'):
-                enrichment['llm_classification'] = llm_result['text']
+            summary_result = self.llm_client.summarize(text[:3000], max_length=200)
+            if summary_result.get('success'):
+                enrichment['llm_summary'] = summary_result['text']
+
+            intent_result = self.llm_client.extract_intent(text[:3000])
+            if intent_result.get('success'):
+                enrichment['llm_intent'] = intent_result['text']
+
+            topics_result = self.llm_client.extract_topics(text[:3000])
+            if topics_result.get('success') and topics_result.get('topics'):
+                enrichment['llm_topics'] = topics_result['topics']

         return enrichment

     def _basic_summary(self, text: str) -> str:
-        sentences = re.split(r'[.!?]+', text)
-        return ' '.join(sentences[:3])[:200]
+        if not text:
+            return ''
+        sentences = re.split(r'[.!?\n]+', text)
+        summary = []
+        length = 0
+        for sent in sentences:
+            sent = sent.strip()
+            if not sent:
+                continue
+            if length + len(sent) > 200:
+                break
+            summary.append(sent)
+            length += len(sent)
+        return '. '.join(summary) if summary else text[:200]
+
+    def _extract_topics(self, text: str) -> List[str]:
+        text_lower = text.lower()
+        topics = []
+        for tech in self.tech_keywords:
+            if tech in text_lower:
+                topics.append(tech)
+        words = re.findall(r'\b[A-Z][a-z]+\b', text)
+        word_freq = {}
+        for word in words:
+            if len(word) > 3 and word.lower() not in {'this', 'that', 'with', 'from', 'have'}:
+                word_freq[word] = word_freq.get(word, 0) + 1
+        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
+        topics.extend([w for (w, _) in sorted_words[:5]])
+        return list(set(topics))[:15]
+
+    def _extract_entities(self, text: str) -> Dict[str, List[str]]:
+        entities = {'files': [], 'urls': [], 'paths': []}
+        file_pattern = re.compile(r'\b\w+\.(?:py|js|java|go|rs|cpp|h|md|txt|json|yaml|yml|xml|sql|sh|bat|ts|tsx|jsx)\b')
+        entities['files'] = list(set(file_pattern.findall(text)))[:10]
+        url_pattern = re.compile(r'https?://[^\s<>"{}|\\^`\[\]]+')
+        entities['urls'] = list(set(url_pattern.findall(text)))[:5]
+        path_pattern = re.compile(r'(?:/[a-zA-Z0-9_.-]+)+/?')
+        entities['paths'] = list(set(path_pattern.findall(text)))[:10]
+        return entities
+
+    def _detect_tech(self, text: str) -> List[str]:
+        text_lower = text.lower()
+        return [tech for tech in self.tech_keywords if tech in text_lower]

     def _detect_pii(self, text: str) -> Dict:
         detected = {}
-        for pii_type, pattern in self.pii_patterns.items():
-            matches = re.findall(pattern, text)
+        for pii_type in ['email', 'phone', 'ssn', 'credit_card']:
+            matches = re.findall(self.pii_patterns[pii_type], text)
             if matches:
                 detected[pii_type] = len(matches)
         return detected

+    def _detect_credentials(self, text: str) -> bool:
+        for name in ['api_key', 'password']:
+            if re.search(self.pii_patterns[name], text):
+                return True
+        return False
+
     def _assess_quality(self, text: str) -> str:
-        if len(text.strip()) < 10:
-            return 'low'
-
+        if not text or len(text.strip()) < 10:
+            return 'empty'
+        words = text.split()
+        if not words:
+            return 'empty'
+        avg_word_len = sum(len(w) for w in words) / len(words)
+        if avg_word_len < 2 or avg_word_len > 20:
+            return 'garbled'
         special_char_ratio = sum(1 for c in text if not c.isalnum() and not c.isspace()) / len(text)
-        if special_char_ratio > 0.3:
-            return 'low'
-
-        return 'high' if len(text.split()) > 50 else 'medium'
-
-    def _extract_basic_topics(self, text: str) -> list:
-        words = re.findall(r'\b[A-Z][a-z]+\b', text)
-        word_freq = {}
-        for word in words:
-            if len(word) > 3:
-                word_freq[word] = word_freq.get(word, 0) + 1
-
-        return sorted(word_freq, key=word_freq.get, reverse=True)[:10]
+        if special_char_ratio > 0.4:
+            return 'low_confidence'
+        return 'good'
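
For reference, a minimal sketch of how the reworked enricher is meant to be driven. The sample text, variable names, and the import path (assuming app/ is on PYTHONPATH, matching main.py's imports) are illustrative assumptions, not part of the patch:

    # Illustrative only: rule-based enrichment with no LLM client attached.
    from enrichment.enricher import ContentEnricher

    enricher = ContentEnricher()  # llm_client=None -> the use_llm branch is skipped entirely
    sample = ("Transcribe the audio with whisper and ffmpeg, then mail admin@example.com. "
              "api_key = 'abcdefghijklmnopqrstuv'")
    result = enricher.enrich(sample)

    print(result['quality'])                      # 'good' for ordinary prose of this length
    print(result['tech_stack'])                   # e.g. ['whisper', 'ffmpeg', 'audio', ...] (set order varies)
    print(result['security']['has_pii'])          # True: the email pattern matches
    print(result['security']['has_credentials'])  # True: the api_key assignment matches
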
diff --git a/app/enrichment/llm_client.py b/app/enrichment/llm_client.py
index 526b4a4..d25ead6 100644
--- a/app/enrichment/llm_client.py
+++ b/app/enrichment/llm_client.py
@@ -1,54 +1,70 @@
 import requests
 import json
-from typing import Dict, Optional
+import logging
+from typing import Dict, Optional, List
+
+logger = logging.getLogger(__name__)

 class LLMClient:
-    def __init__(self, endpoint: str = 'http://192.168.1.74:1234', model: str = 'local'):
+    def __init__(self, endpoint: str = 'http://localhost:11434', model: str = 'llama3', use_local: bool = True):
         self.endpoint = endpoint
         self.model = model
-        self.local_ollama = 'http://localhost:11434'
+        self.use_local = use_local
+        self.lm_studio_endpoint = 'http://192.168.1.74:1234'
+        self.lm_studio_model = 'openai/gpt-oss-20b'

     def summarize(self, text: str, max_length: int = 200) -> Dict:
-        prompt = f"Summarize the following in {max_length} chars or less:\n\n{text[:2000]}"
+        prompt = f"Summarize this concisely in under {max_length} characters:\n\n{text[:3000]}"
         return self._query(prompt)

     def extract_topics(self, text: str) -> Dict:
-        prompt = f"Extract 5-10 key topics/tags from this text. Return as comma-separated list:\n\n{text[:2000]}"
+        prompt = f"Extract 5-10 key topics/tags. Return ONLY comma-separated words:\n\n{text[:3000]}"
+        result = self._query(prompt)
+        if result.get('success'):
+            topics = [t.strip() for t in result['text'].split(',')]
+            result['topics'] = topics[:10]
+        return result
+
+    def extract_intent(self, text: str) -> Dict:
+        prompt = f"What is the main purpose/intent of this code/document? Answer in 1-2 sentences:\n\n{text[:3000]}"
         return self._query(prompt)

-    def classify_content(self, text: str) -> Dict:
-        prompt = f"Classify this content. Return: category, topics, has_pii (yes/no), quality (high/medium/low):\n\n{text[:1000]}"
+    def detect_project_type(self, text: str, file_list: List[str]) -> Dict:
+        files_str = ', '.join(file_list[:20])
+        prompt = f"Based on these files: {files_str}\nAnd this content:\n{text[:2000]}\n\nWhat type of project is this? (e.g. web app, ml/ai, transcription, data processing, etc.)"
         return self._query(prompt)

-    def _query(self, prompt: str, use_local: bool = False) -> Dict:
+    def _query(self, prompt: str, timeout: int = 30) -> Dict:
         try:
-            endpoint = self.local_ollama if use_local else self.endpoint
-
-            if use_local:
+            if self.use_local:
                 response = requests.post(
-                    f'{endpoint}/api/generate',
-                    json={'model': 'llama3.2', 'prompt': prompt, 'stream': False},
-                    timeout=30
+                    f'{self.endpoint}/api/generate',
+                    json={'model': self.model, 'prompt': prompt, 'stream': False},
+                    timeout=timeout
                 )
+                if response.status_code == 200:
+                    data = response.json()
+                    return {'success': True, 'text': data.get('response', '').strip()}
             else:
                 response = requests.post(
-                    f'{endpoint}/v1/chat/completions',
+                    f'{self.lm_studio_endpoint}/v1/chat/completions',
                     json={
-                        'model': self.model,
+                        'model': self.lm_studio_model,
                         'messages': [{'role': 'user', 'content': prompt}],
-                        'max_tokens': 500
+                        'max_tokens': 500,
+                        'temperature': 0.7
                     },
-                    timeout=30
+                    timeout=timeout
                 )
+                if response.status_code == 200:
+                    data = response.json()
+                    return {'success': True, 'text': data['choices'][0]['message']['content'].strip()}

-            if response.status_code == 200:
-                data = response.json()
-                if use_local:
-                    return {'success': True, 'text': data.get('response', '')}
-                else:
-                    return {'success': True, 'text': data['choices'][0]['message']['content']}
-            else:
-                return {'success': False, 'error': f'HTTP {response.status_code}'}
+            return {'success': False, 'error': f'HTTP {response.status_code}'}
+        except requests.Timeout:
+            logger.warning(f'LLM request timeout after {timeout}s')
+            return {'success': False, 'error': 'timeout'}
         except Exception as e:
+            logger.error(f'LLM query failed: {e}')
             return {'success': False, 'error': str(e)}
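
A small sketch of the intended call pattern for the reworked client. The endpoint and model are just the new defaults from this patch, and a reachable Ollama server is an assumption:

    # Illustrative only: assumes an Ollama server on http://localhost:11434 serving 'llama3'.
    from enrichment.llm_client import LLMClient

    client = LLMClient(use_local=True)
    summary = client.summarize('Long transcript or document text ...', max_length=200)
    if summary.get('success'):
        print(summary['text'])
    else:
        print('LLM unavailable:', summary.get('error'))  # 'timeout', 'HTTP 404', etc.

    topics = client.extract_topics('Whisper transcription pipeline with ffmpeg and PostgreSQL')
    print(topics.get('topics', []))  # parsed from the comma-separated reply

Passing use_local=False routes the same prompts to the LM Studio endpoint through its OpenAI-compatible /v1/chat/completions API instead.
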
diff --git a/app/main.py b/app/main.py
index 95ab25e..8f5a1e4 100644
--- a/app/main.py
+++ b/app/main.py
@@ -535,14 +535,18 @@ class DiskReorganizer:
         try:
             query = "SELECT path, size, disk_label FROM files WHERE 1=1"
-            params = []
             if kind:
-                suffix_map = {'text': "('.txt', '.md', '.log', '.json')", 'code': "('.py', '.js', '.java', '.go')", 'pdf': "('.pdf',)"}
+                suffix_map = {
+                    'text': ['.txt', '.md', '.log', '.json', '.yaml', '.yml'],
+                    'code': ['.py', '.js', '.java', '.go', '.rs', '.ts', '.cpp', '.h'],
+                    'pdf': ['.pdf']
+                }
                 if kind in suffix_map:
-                    query += f" AND RIGHT(path, 4) IN {suffix_map[kind]} OR RIGHT(path, 3) IN {suffix_map[kind]}"
+                    conditions = ' OR '.join([f"path LIKE '%{ext}'" for ext in suffix_map[kind]])
+                    query += f" AND ({conditions})"

             query += f" LIMIT {limit}"
-            cursor.execute(query, params)
+            cursor.execute(query)
             files = cursor.fetchall()

             print(f"\n=== PARSING FILES ===\nProcessing {len(files)} files\n")
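
The LIKE-based filter replaces the old RIGHT(path, ...) comparison, which mixed 3- and 4-character suffixes and carried an unused params list. As a sketch, the clause assembled for kind='code' now comes out like this (standalone reproduction for illustration, not part of the patch):

    # Illustrative only: reproduces the query string built in parse_files for kind='code'.
    suffix_map = {'code': ['.py', '.js', '.java', '.go', '.rs', '.ts', '.cpp', '.h']}
    conditions = ' OR '.join([f"path LIKE '%{ext}'" for ext in suffix_map['code']])
    limit = 10
    query = "SELECT path, size, disk_label FROM files WHERE 1=1" + f" AND ({conditions})" + f" LIMIT {limit}"
    print(query)
    # SELECT path, size, disk_label FROM files WHERE 1=1
    #   AND (path LIKE '%.py' OR path LIKE '%.js' OR ... OR path LIKE '%.h') LIMIT 10
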
@@ -580,30 +584,63 @@ class DiskReorganizer:
             cursor.close()
             conn.close()

-    def enrich_files(self, limit: int = 10, llm_endpoint: str = None, use_local: bool = False):
+    def enrich_files(self, limit: int = 10, use_llm: bool = False, use_local: bool = True, batch_size: int = 100):
         from enrichment.enricher import ContentEnricher
+        from enrichment.llm_client import LLMClient
+
+        llm_client = LLMClient(use_local=use_local) if use_llm else None
+        enricher = ContentEnricher(llm_client=llm_client)

-        enricher = ContentEnricher()
         conn = self.get_connection()
         cursor = conn.cursor()

         try:
-            cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL LIMIT {limit}")
+            cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL AND (enrichment IS NULL OR enrichment = '{{}}'::jsonb) LIMIT {limit}")
             files = cursor.fetchall()

-            print(f"\n=== ENRICHING CONTENT ===\nProcessing {len(files)} files\n")
+            print(f"\n=== ENRICHING CONTENT ===")
+            print(f"Processing {len(files)} files")
+            if use_llm:
+                print(f"Using LLM: {'Local OLLAMA' if use_local else 'Network LM_STUDIO'}\n")
+            else:
+                print("Using rule-based enrichment only\n")

-            for path, text in files:
-                enrichment = enricher.enrich(text[:5000], use_llm=False)
-                print(f"{path[:60]}")
+            enriched_count = 0
+            batch = []
+            for idx, (path, text) in enumerate(files, 1):
+                if not text:
+                    continue
+
+                enrichment = enricher.enrich(text[:5000], use_llm=use_llm)
+
+                print(f"{idx}/{len(files)} {path[:60]}")
                 print(f" Quality: {enrichment.get('quality')} | Words: {enrichment.get('word_count'):,}")
-                print(f" PII: {list(enrichment.get('has_pii', {}).keys())}")
-                print(f" Topics: {', '.join(enrichment.get('topics', [])[:5])}\n")
+                if enrichment.get('security', {}).get('has_pii'):
+                    print(f" PII: {list(enrichment.get('security', {}).get('pii_details', {}).keys())}")
+                if enrichment.get('tech_stack'):
+                    print(f" Tech: {', '.join(enrichment['tech_stack'][:5])}")
+                if enrichment.get('topics'):
+                    print(f" Topics: {', '.join(enrichment['topics'][:5])}")
+                if use_llm and enrichment.get('llm_summary'):
+                    print(f" LLM Summary: {enrichment['llm_summary'][:100]}...")
+                if use_llm and enrichment.get('llm_intent'):
+                    print(f" Intent: {enrichment['llm_intent'][:100]}...")
+                print()

-                cursor.execute("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", (json.dumps(enrichment), path))
+                batch.append((json.dumps(enrichment), path))
+                enriched_count += 1

-            conn.commit()
-            print(f"Enriched {len(files)} files")
+                if len(batch) >= batch_size:
+                    cursor.executemany("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", batch)
+                    conn.commit()
+                    batch.clear()
+                    print(f" Committed batch ({enriched_count} files so far)")
+
+            if batch:
+                cursor.executemany("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", batch)
+                conn.commit()
+
+            print(f"\nEnriched {enriched_count} files")

         finally:
             cursor.close()
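
From the command line this path is reached through the reworked enrich subcommand (for example, python main.py enrich --limit 50 --use-llm run from the app directory); called directly, the batched flow above looks roughly like this, with tool standing in for an already-configured DiskReorganizer instance:

    # Illustrative only: 'tool' is assumed to be a configured DiskReorganizer.
    tool.enrich_files(limit=50, use_llm=True, use_local=True, batch_size=100)  # local Ollama summaries + intent
    tool.enrich_files(limit=500, use_llm=False)                                # rule-based pass only, no LLM calls

Because the query skips rows whose enrichment column is already populated, re-running the command only picks up files that have not been enriched yet.
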
@@ -695,6 +732,75 @@ class DiskReorganizer:
             cursor.close()
             conn.close()

+    def search_content(self, query: str, limit: int=20, search_type: str='text'):
+        conn = self.get_connection()
+        cursor = conn.cursor()
+        try:
+            if search_type == 'text':
+                cursor.execute('''
+                    SELECT path, disk_label, size, category,
+                           ts_rank(to_tsvector('english', COALESCE(extracted_text, '')), plainto_tsquery('english', %s)) as rank,
+                           LEFT(extracted_text, 200) as snippet
+                    FROM files
+                    WHERE extracted_text IS NOT NULL
+                      AND to_tsvector('english', extracted_text) @@ plainto_tsquery('english', %s)
+                    ORDER BY rank DESC
+                    LIMIT %s
+                ''', (query, query, limit))
+            elif search_type == 'enrichment':
+                cursor.execute('''
+                    SELECT path, disk_label, size, category, enrichment
+                    FROM files
+                    WHERE enrichment IS NOT NULL
+                      AND enrichment::text ILIKE %s
+                    LIMIT %s
+                ''', (f'%{query}%', limit))
+            elif search_type == 'path':
+                cursor.execute('''
+                    SELECT path, disk_label, size, category
+                    FROM files
+                    WHERE path ILIKE %s
+                    LIMIT %s
+                ''', (f'%{query}%', limit))
+            else:
+                logger.error(f'Unknown search type: {search_type}')
+                return
+
+            results = cursor.fetchall()
+            if not results:
+                print(f'No results found for: {query}')
+                return
+
+            print(f'\n=== SEARCH RESULTS: {len(results)} matches for "{query}" ===\n')
+            for idx, row in enumerate(results, 1):
+                if search_type == 'text':
+                    path, disk, size, category, rank, snippet = row
+                    print(f'{idx}. {path}')
+                    print(f' Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')
+                    print(f' Rank: {rank:.4f}')
+                    if snippet:
+                        print(f' Snippet: {snippet[:150]}...')
+                elif search_type == 'enrichment':
+                    path, disk, size, category, enrichment = row
+                    print(f'{idx}. {path}')
+                    print(f' Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')
+                    if enrichment:
+                        import json
+                        enrich_data = json.loads(enrichment) if isinstance(enrichment, str) else enrichment
+                        if 'topics' in enrich_data:
+                            print(f' Topics: {", ".join(enrich_data["topics"][:5])}')
+                        if 'tech_stack' in enrich_data:
+                            print(f' Tech: {", ".join(enrich_data["tech_stack"][:5])}')
+                else:
+                    path, disk, size, category = row
+                    print(f'{idx}. {path}')
+                    print(f' Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')
+                print()
+
+        finally:
+            cursor.close()
+            conn.close()
+
     def analyze_folders(self, disk: Optional[str]=None, min_files: int=3):
         from analysis.folder_analyzer import FolderAnalyzer
         analyzer = FolderAnalyzer()
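
A sketch of the three search modes exposed by the new method, again with tool standing in for a configured DiskReorganizer instance (results are printed rather than returned):

    # Illustrative only: 'tool' is assumed to be a configured DiskReorganizer.
    tool.search_content('whisper', limit=10, search_type='enrichment')  # ILIKE over the enrichment JSON
    tool.search_content('transcription pipeline', search_type='text')   # Postgres full-text search, ranked by ts_rank
    tool.search_content('app/parsers', search_type='path')              # substring match on the stored path

The text mode evaluates to_tsvector over extracted_text per row; if that becomes a hot path, a precomputed tsvector column backed by a GIN index is the usual follow-up.
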
@@ -866,8 +972,8 @@ def main():
     enrich_parser = subparsers.add_parser('enrich', help='Enrich content with LLM analysis')
     enrich_parser.add_argument('--limit', type=int, default=10, help='Limit enrichment batch')
-    enrich_parser.add_argument('--llm-endpoint', default='http://192.168.1.74:1234', help='LLM endpoint')
-    enrich_parser.add_argument('--local', action='store_true', help='Use local Ollama')
+    enrich_parser.add_argument('--use-llm', action='store_true', help='Use LLM for summarization')
+    enrich_parser.add_argument('--network', action='store_true', help='Use network LM_STUDIO instead of local OLLAMA')

     classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization')
     classify_parser.add_argument('--disk', help='Classify specific disk')
@@ -876,6 +982,10 @@ def main():
     folders_parser = subparsers.add_parser('analyze-folders', help='Analyze folder structure and infer project intent')
     folders_parser.add_argument('--disk', help='Analyze specific disk')
     folders_parser.add_argument('--min-files', type=int, default=3, help='Minimum files per folder')
+    search_parser = subparsers.add_parser('search', help='Search indexed content')
+    search_parser.add_argument('query', help='Search query')
+    search_parser.add_argument('--type', choices=['text', 'enrichment', 'path'], default='enrichment', help='Search type')
+    search_parser.add_argument('--limit', type=int, default=20, help='Max results')
     review_parser = subparsers.add_parser('review', help='Review proposed migration structure')
     review_parser.add_argument('--category', help='Review specific category')
     review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts')
@@ -905,11 +1015,13 @@ def main():
     elif args.command == 'parse':
         tool.parse_files(kind=args.kind, limit=args.limit, update_db=args.update)
     elif args.command == 'enrich':
-        tool.enrich_files(limit=args.limit, llm_endpoint=args.llm_endpoint, use_local=args.local)
+        tool.enrich_files(limit=args.limit, use_llm=args.use_llm, use_local=not args.network)
     elif args.command == 'classify':
         tool.classify_files(disk=args.disk, update_db=args.update, resume=not args.no_resume)
     elif args.command == 'analyze-folders':
         tool.analyze_folders(disk=args.disk, min_files=args.min_files)
+    elif args.command == 'search':
+        tool.search_content(query=args.query, limit=args.limit, search_type=args.type)
     elif args.command == 'review':
         tool.review_migration(category=args.category, show_build=args.show_build)
     elif args.command == 'report':
diff --git a/app/parsers/transcription_parser.py b/app/parsers/transcription_parser.py
new file mode 100644
index 0000000..3909240
--- /dev/null
+++ b/app/parsers/transcription_parser.py
@@ -0,0 +1,65 @@
+import os
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import Dict, Optional
+import logging
+
+logger = logging.getLogger(__name__)
+
+class TranscriptionParser:
+    def __init__(self, model: str = 'base'):
+        self.model = model
+        self.whisper_available = self._check_whisper()
+
+    def _check_whisper(self) -> bool:
+        try:
+            import whisper
+            return True
+        except ImportError:
+            logger.warning('Whisper not installed. Install with: pip install openai-whisper')
+            return False
+
+    def parse(self, file_path: Path) -> Dict:
+        if not self.whisper_available:
+            return {'success': False, 'error': 'Whisper not available', 'text': ''}
+
+        if not self._is_supported(file_path):
+            return {'success': False, 'error': 'Unsupported file type', 'text': ''}
+
+        try:
+            import whisper
+            logger.info(f'Transcribing {file_path} with Whisper model={self.model}')
+
+            model = whisper.load_model(self.model)
+            result = model.transcribe(str(file_path))
+
+            return {
+                'success': True,
+                'text': result['text'],
+                'segments': result.get('segments', []),
+                'language': result.get('language', 'unknown')
+            }
+        except Exception as e:
+            logger.error(f'Transcription failed for {file_path}: {e}')
+            return {'success': False, 'error': str(e), 'text': ''}
+
+    def _is_supported(self, file_path: Path) -> bool:
+        supported = {'.mp3', '.mp4', '.wav', '.m4a', '.flac', '.ogg', '.avi', '.mkv', '.webm'}
+        return file_path.suffix.lower() in supported
+
+    def parse_with_timestamps(self, file_path: Path) -> Dict:
+        result = self.parse(file_path)
+        if not result['success']:
+            return result
+
+        segments = result.get('segments', [])
+        timestamped_text = []
+        for seg in segments:
+            start = seg.get('start', 0)
+            end = seg.get('end', 0)
+            text = seg.get('text', '').strip()
+            timestamped_text.append(f'[{start:.2f}s - {end:.2f}s] {text}')
+
+        result['timestamped_text'] = '\n'.join(timestamped_text)
+        return result
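
Finally, a short sketch of how the new parser is expected to be wired up. The media path is purely illustrative, the import path assumes app/ is on PYTHONPATH, and openai-whisper must be installed for parse() to do anything:

    # Illustrative only: requires 'pip install openai-whisper' and a real media file.
    from pathlib import Path
    from parsers.transcription_parser import TranscriptionParser

    parser = TranscriptionParser(model='base')
    result = parser.parse_with_timestamps(Path('/data/media/interview.mp4'))
    if result['success']:
        print(result['language'])
        print(result['timestamped_text'][:500])
    else:
        print('Transcription skipped:', result['error'])

Note that whisper.load_model() is called inside parse() for every file; caching the loaded model on the instance would be the natural follow-up if this is run over large batches.
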