clean up code

2025-12-13 12:24:43 +01:00
parent 7ce8c8c73d
commit 78042ff2a2
4 changed files with 326 additions and 73 deletions
--- a/app/enrichment/enricher.py
+++ b/app/enrichment/enricher.py
@@ -1,59 +1,119 @@
-from typing import Dict
+from typing import Dict, List
 import re

 class ContentEnricher:
+    tech_keywords = {'transcribe', 'transcription', 'whisper', 'speech-to-text', 'audio', 'video', 'subtitle', 'caption', 'srt', 'vtt', 'ffmpeg', 'opencv', 'pytorch', 'tensorflow', 'cuda', 'gpu', 'ml', 'nlp', 'llm', 'ollama', 'docker', 'kubernetes', 'postgres', 'database', 'api', 'rest', 'graphql', 'python', 'javascript', 'java', 'rust', 'golang'}  # lowercase keywords, matched as substrings of the lowercased text
+
    def __init__(self, llm_client=None):
        self.llm_client = llm_client
        self.pii_patterns = {
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
            'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
            'ssn': r'\b\d{3}-\d{2}-\d{4}\b',
-            'credit_card': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b'
+            'credit_card': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b',
+            'api_key': r'(?i)(api[_-]?key|token|secret)["\']?\s*[:=]\s*["\']?([a-zA-Z0-9_\-]{20,})',  # key name, ':' or '=', then a 20+ char value
+            'password': r'(?i)(password|passwd|pwd)["\']?\s*[:=]\s*["\']([^"\']{8,})'  # requires an opening quote before the 8+ char value
        }

    def enrich(self, text: str, use_llm: bool = False) -> Dict:
        enrichment = {
            'summary': self._basic_summary(text),
            'word_count': len(text.split()),
-            'has_pii': self._detect_pii(text),
-            'quality': self._assess_quality(text),
-            'topics': self._extract_basic_topics(text)
+            'topics': self._extract_topics(text),
+            'entities': self._extract_entities(text),
+            'tech_stack': self._detect_tech(text),
+            'security': {
+                'has_pii': bool(self._detect_pii(text)),  # True when at least one PII type matched
+                'has_credentials': self._detect_credentials(text),
+                'pii_details': self._detect_pii(text)  # per-type match counts (second scan of the same text)
+            },
+            'quality': self._assess_quality(text)
        }

        if use_llm and self.llm_client:
-            llm_result = self.llm_client.classify_content(text)
-            if llm_result.get('success'):
-                enrichment['llm_classification'] = llm_result['text']
+            summary_result = self.llm_client.summarize(text[:3000], max_length=200)  # each LLM field below is added only on success
+            if summary_result.get('success'):
+                enrichment['llm_summary'] = summary_result['text']
+
+            intent_result = self.llm_client.extract_intent(text[:3000])
+            if intent_result.get('success'):
+                enrichment['llm_intent'] = intent_result['text']
+
+            topics_result = self.llm_client.extract_topics(text[:3000])
+            if topics_result.get('success') and topics_result.get('topics'):  # an empty topic list is not stored
+                enrichment['llm_topics'] = topics_result['topics']

        return enrichment

    def _basic_summary(self, text: str) -> str:
-        sentences = re.split(r'[.!?]+', text)
-        return ' '.join(sentences[:3])[:200]
+        """Join leading sentences with '. ' into a summary of at most 200 characters."""
+        if not text:
+            return ''
+        summary, length = [], 0
+        for sent in re.split(r'[.!?\n]+', text):
+            sent = sent.strip()
+            if not sent:
+                continue
+            if length + len(sent) > 200:
+                break
+            summary.append(sent)
+            # Count the '. ' separator too, so the joined summary cannot exceed 200 characters.
+            length += len(sent) + 2
+        return '. '.join(summary) if summary else text[:200]
+
+    def _extract_topics(self, text: str) -> List[str]:
+        """Return up to 15 unique topics: matched tech keywords first, then frequent capitalised words."""
+        text_lower = text.lower()
+        # Sorted so the result does not depend on set iteration order (keywords match as substrings).
+        topics = [tech for tech in sorted(self.tech_keywords) if tech in text_lower]
+        stopwords = {'this', 'that', 'with', 'from', 'have'}
+        word_freq = {}
+        for word in re.findall(r'\b[A-Z][a-z]+\b', text):
+            if len(word) > 3 and word.lower() not in stopwords:
+                word_freq[word] = word_freq.get(word, 0) + 1
+        # sorted() is stable, so equally frequent words keep first-seen order.
+        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
+        topics.extend(w for (w, _) in sorted_words[:5])
+        return list(dict.fromkeys(topics))[:15]  # ordered de-duplication; list(set(...)) truncated arbitrarily
+
+    def _extract_entities(self, text: str) -> Dict[str, List[str]]:
+        """Return up to 10 file names, 5 URLs and 10 slash-separated paths found in text, in first-seen order."""
+        # Non-capturing group: with a capturing group findall() would return only the extensions.
+        file_pattern = re.compile(r'\b\w+\.(?:py|js|java|go|rs|cpp|h|md|txt|json|yaml|yml|xml|sql|sh|bat|ts|tsx|jsx)\b')
+        url_pattern = re.compile(r'https?://[^\s<>"{}|\\^`\[\]]+')
+        path_pattern = re.compile(r'(?:/[a-zA-Z0-9_.-]+)+/?')
+        # dict.fromkeys de-duplicates while keeping first-seen order, so the capped slices are deterministic.
+        limits = (('files', file_pattern, 10), ('urls', url_pattern, 5), ('paths', path_pattern, 10))
+        return {key: list(dict.fromkeys(pattern.findall(text)))[:cap] for key, pattern, cap in limits}
+
+    def _detect_tech(self, text: str) -> List[str]:
+        text_lower = text.lower()  # keywords are lowercase, so compare case-insensitively
+        return sorted(tech for tech in self.tech_keywords if tech in text_lower)  # sorted: set order varies per run

    def _detect_pii(self, text: str) -> Dict:
        detected = {}
-        for pii_type, pattern in self.pii_patterns.items():
-            matches = re.findall(pattern, text)
+        for pii_type in ['email', 'phone', 'ssn', 'credit_card']:  # 'api_key'/'password' are checked by _detect_credentials
+            matches = re.findall(self.pii_patterns[pii_type], text)
            if matches:
                detected[pii_type] = len(matches)
        return detected

+    def _detect_credentials(self, text: str) -> bool:
+        """Report whether text matches the 'api_key' or 'password' pattern."""
+        credential_names = ('api_key', 'password')
+        # any() stops at the first pattern that matches.
+        return any(re.search(self.pii_patterns[name], text) is not None for name in credential_names)
+
    def _assess_quality(self, text: str) -> str:
-        if len(text.strip()) < 10:
-            return 'low'
-
+        if not text or len(text.strip()) < 10:  # fewer than 10 non-padding characters counts as empty
+            return 'empty'
+        words = text.split()
+        if not words:  # defensive; cannot trigger once the stripped text has 10+ characters
+            return 'empty'
+        avg_word_len = sum(len(w) for w in words) / len(words)
+        if avg_word_len < 2 or avg_word_len > 20:  # average word length outside 2..20 is treated as garbled
+            return 'garbled'
        special_char_ratio = sum(1 for c in text if not c.isalnum() and not c.isspace()) / len(text)
-        if special_char_ratio > 0.3:
-            return 'low'
-
-        return 'high' if len(text.split()) > 50 else 'medium'
-
-    def _extract_basic_topics(self, text: str) -> list:
-        words = re.findall(r'\b[A-Z][a-z]+\b', text)
-        word_freq = {}
-        for word in words:
-            if len(word) > 3:
-                word_freq[word] = word_freq.get(word, 0) + 1
-
-        return sorted(word_freq, key=word_freq.get, reverse=True)[:10]
+        if special_char_ratio > 0.4:  # more than 40% of characters are neither alphanumeric nor whitespace
+            return 'low_confidence'
+        return 'good'
--- a/app/enrichment/llm_client.py
+++ b/app/enrichment/llm_client.py
@@ -1,54 +1,70 @@
 import requests
 import json
-from typing import Dict, Optional
+import logging
+from typing import Dict, Optional, List
+
+logger = logging.getLogger(__name__)

 class LLMClient:
-    def __init__(self, endpoint: str = 'http://192.168.1.74:1234', model: str = 'local'):
+    def __init__(self, endpoint: str = 'http://localhost:11434', model: str = 'llama3', use_local: bool = True):
        self.endpoint = endpoint
        self.model = model
-        self.local_ollama = 'http://localhost:11434'
+        self.use_local = use_local  # selects the /api/generate branch in _query
+        self.lm_studio_endpoint = 'http://192.168.1.74:1234'  # used with lm_studio_model when use_local is False
+        self.lm_studio_model = 'openai/gpt-oss-20b'

    def summarize(self, text: str, max_length: int = 200) -> Dict:
-        prompt = f"Summarize the following in {max_length} chars or less:\n\n{text[:2000]}"
+        prompt = f"Summarize this concisely in under {max_length} characters:\n\n{text[:3000]}"  # only the first 3000 characters are sent
        return self._query(prompt)

    def extract_topics(self, text: str) -> Dict:
-        prompt = f"Extract 5-10 key topics/tags from this text. Return as comma-separated list:\n\n{text[:2000]}"
+        prompt = f"Extract 5-10 key topics/tags. Return ONLY comma-separated words:\n\n{text[:3000]}"
+        result = self._query(prompt)
+        if result.get('success'):
+            # Drop blank entries: an empty reply or a trailing comma would otherwise yield '' topics.
+            result['topics'] = [t.strip() for t in result['text'].split(',') if t.strip()][:10]
+        return result
+
+    def extract_intent(self, text: str) -> Dict:
+        prompt = f"What is the main purpose/intent of this code/document? Answer in 1-2 sentences:\n\n{text[:3000]}"  # only the first 3000 characters are sent
        return self._query(prompt)

-    def classify_content(self, text: str) -> Dict:
-        prompt = f"Classify this content. Return: category, topics, has_pii (yes/no), quality (high/medium/low):\n\n{text[:1000]}"
+    def detect_project_type(self, text: str, file_list: List[str]) -> Dict:
+        files_str = ', '.join(file_list[:20])  # at most 20 file names go into the prompt
+        prompt = f"Based on these files: {files_str}\nAnd this content:\n{text[:2000]}\n\nWhat type of project is this? (e.g. web app, ml/ai, transcription, data processing, etc.)"
        return self._query(prompt)

-    def _query(self, prompt: str, use_local: bool = False) -> Dict:
+    def _query(self, prompt: str, timeout: int = 30) -> Dict:  # returns {'success': True, 'text': ...} or {'success': False, 'error': ...}
        try:
-            endpoint = self.local_ollama if use_local else self.endpoint
-
-            if use_local:
+            if self.use_local:  # generate-style API on self.endpoint using self.model
                response = requests.post(
-                    f'{endpoint}/api/generate',
-                    json={'model': 'llama3.2', 'prompt': prompt, 'stream': False},
-                    timeout=30
+                    f'{self.endpoint}/api/generate',
+                    json={'model': self.model, 'prompt': prompt, 'stream': False},
+                    timeout=timeout
                )
+                if response.status_code == 200:
+                    data = response.json()
+                    return {'success': True, 'text': data.get('response', '').strip()}  # a missing 'response' field yields ''
            else:
                response = requests.post(
-                    f'{endpoint}/v1/chat/completions',
+                    f'{self.lm_studio_endpoint}/v1/chat/completions',
                    json={
-                        'model': self.model,
+                        'model': self.lm_studio_model,
                        'messages': [{'role': 'user', 'content': prompt}],
-                        'max_tokens': 500
+                        'max_tokens': 500,
+                        'temperature': 0.7
                    },
-                    timeout=30
+                    timeout=timeout
                )
+                if response.status_code == 200:
+                    data = response.json()
+                    return {'success': True, 'text': data['choices'][0]['message']['content'].strip()}  # a malformed payload raises and is handled below

-            if response.status_code == 200:
-                data = response.json()
-                if use_local:
-                    return {'success': True, 'text': data.get('response', '')}
-                else:
-                    return {'success': True, 'text': data['choices'][0]['message']['content']}
-            else:
-                return {'success': False, 'error': f'HTTP {response.status_code}'}
+            return {'success': False, 'error': f'HTTP {response.status_code}'}  # reached only for non-200 responses

+        except requests.Timeout:  # listed before the generic handler so timeouts get their own error value
+            logger.warning(f'LLM request timeout after {timeout}s')
+            return {'success': False, 'error': 'timeout'}
        except Exception as e:
+            logger.error(f'LLM query failed: {e}')
            return {'success': False, 'error': str(e)}