from collections import Counter
from typing import Dict, List
import re


class ContentEnricher:
    """Derive lightweight metadata from a piece of free text.

    Everything except the optional LLM step is heuristic (regexes and
    keyword lookups): a short extractive summary, topic and technology
    keywords, file/URL/path entities, PII and credential flags, and a
    coarse quality label.
    """

    # Lower-case keywords looked up as plain substrings of the lower-cased
    # text.  NOTE(review): substring matching means short keywords such as
    # 'ml' or 'api' also fire inside longer words ('html', 'rapid').
    tech_keywords = {
        'transcribe', 'transcription', 'whisper', 'speech-to-text', 'audio',
        'video', 'subtitle', 'caption', 'srt', 'vtt', 'ffmpeg', 'opencv',
        'pytorch', 'tensorflow', 'cuda', 'gpu', 'ml', 'nlp', 'llm', 'ollama',
        'docker', 'kubernetes', 'postgres', 'database', 'api', 'rest',
        'graphql', 'python', 'javascript', 'java', 'rust', 'golang',
    }

    # Maximum number of characters of the text handed to the LLM client.
    _LLM_INPUT_LIMIT = 3000

    # Capitalised words that are never reported as topics.
    _TOPIC_STOPWORDS = frozenset({'this', 'that', 'with', 'from', 'have'})

    # The extension group is non-capturing so that findall() yields the full
    # file name rather than just the extension.
    _FILE_PATTERN = re.compile(
        r'\b\w+\.(?:py|js|java|go|rs|cpp|h|md|txt|json|yaml|yml|xml|sql|sh'
        r'|bat|ts|tsx|jsx)\b'
    )
    _URL_PATTERN = re.compile(r'https?://[^\s<>"{}|\\^`\[\]]+')
    _PATH_PATTERN = re.compile(r'(?:/[a-zA-Z0-9_.-]+)+/?')

    def __init__(self, llm_client=None):
        """Create an enricher.

        Args:
            llm_client: Optional object exposing ``summarize``,
                ``extract_intent`` and ``extract_topics``; only used when
                ``enrich`` is called with ``use_llm=True``.
        """
        self.llm_client = llm_client
        self.pii_patterns = {
            # TLD class is [A-Za-z]; the former [A-Z|a-z] also accepted '|'.
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
            'ssn': r'\b\d{3}-\d{2}-\d{4}\b',
            'credit_card': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b',
            'api_key': r'(?i)(api[_-]?key|token|secret)["\']?\s*[:=]\s*["\']?([a-zA-Z0-9_\-]{20,})',
            'password': r'(?i)(password|passwd|pwd)["\']?\s*[:=]\s*["\']([^"\']{8,})'
        }

    def enrich(self, text: str, use_llm: bool = False) -> Dict:
        """Build the enrichment dictionary for ``text``.

        Args:
            text: The content to analyse.  ``None`` is treated as an empty
                string.
            use_llm: When true and an ``llm_client`` was supplied, add
                ``llm_summary``, ``llm_intent`` and ``llm_topics`` for every
                LLM call that reports success.

        Returns:
            A dict with the keys ``summary``, ``word_count``, ``topics``,
            ``entities``, ``tech_stack``, ``security`` and ``quality``, plus
            the optional ``llm_*`` keys.
        """
        if text is None:
            # The helpers already tolerate falsy text; do the same here
            # instead of failing on text.split().
            text = ''

        # Scan for PII once and reuse the result for both security fields.
        pii_details = self._detect_pii(text)

        enrichment = {
            'summary': self._basic_summary(text),
            'word_count': len(text.split()),
            'topics': self._extract_topics(text),
            'entities': self._extract_entities(text),
            'tech_stack': self._detect_tech(text),
            'security': {
                'has_pii': bool(pii_details),
                'has_credentials': self._detect_credentials(text),
                'pii_details': pii_details,
            },
            'quality': self._assess_quality(text),
        }

        if use_llm and self.llm_client:
            excerpt = text[:self._LLM_INPUT_LIMIT]

            summary_result = self.llm_client.summarize(excerpt, max_length=200)
            if summary_result.get('success'):
                enrichment['llm_summary'] = summary_result['text']

            intent_result = self.llm_client.extract_intent(excerpt)
            if intent_result.get('success'):
                enrichment['llm_intent'] = intent_result['text']

            topics_result = self.llm_client.extract_topics(excerpt)
            if topics_result.get('success') and topics_result.get('topics'):
                enrichment['llm_topics'] = topics_result['topics']

        return enrichment

    def _basic_summary(self, text: str) -> str:
        """Return the leading sentences of ``text``, about 200 characters.

        Sentences are split on ``.``, ``!``, ``?`` and newlines and collected
        until the next one would push the running length past 200 characters
        (the ``'. '`` joiners are not counted).  If not even the first
        sentence fits, the first 200 characters of ``text`` are returned.
        """
        if not text:
            return ''

        picked = []
        used = 0
        for raw in re.split(r'[.!?\n]+', text):
            sentence = raw.strip()
            if not sentence:
                continue
            if used + len(sentence) > 200:
                break
            picked.append(sentence)
            used += len(sentence)

        return '. '.join(picked) if picked else text[:200]

    def _extract_topics(self, text: str) -> List[str]:
        """Return up to 15 topics for ``text`` in a deterministic order.

        Topics are the matching technology keywords (alphabetical) followed
        by the five most frequent capitalised words longer than three
        letters, excluding a few stopwords.  Frequency ties keep order of
        first appearance.
        """
        topics = self._detect_tech(text)

        frequencies = Counter(
            word
            for word in re.findall(r'\b[A-Z][a-z]+\b', text)
            if len(word) > 3 and word.lower() not in self._TOPIC_STOPWORDS
        )
        topics.extend(word for word, _ in frequencies.most_common(5))

        # dict.fromkeys removes duplicates while keeping insertion order, so
        # the truncation below always keeps the same items.
        return list(dict.fromkeys(topics))[:15]

    def _extract_entities(self, text: str) -> Dict[str, List[str]]:
        """Collect file names, URLs and filesystem-like paths from ``text``.

        Each list is de-duplicated in order of first appearance and capped
        (10 files, 5 URLs, 10 paths).

        Returns:
            A dict with the keys ``files``, ``urls`` and ``paths``.
        """
        urls = self._URL_PATTERN.findall(text)

        # Blank out URLs before looking for paths; otherwise the tail of
        # every URL ('/example.com/docs') is reported as a path too.
        text_without_urls = self._URL_PATTERN.sub(' ', text)

        return {
            'files': self._unique(self._FILE_PATTERN.findall(text))[:10],
            'urls': self._unique(urls)[:5],
            'paths': self._unique(
                self._PATH_PATTERN.findall(text_without_urls)
            )[:10],
        }

    def _detect_tech(self, text: str) -> List[str]:
        """Return the technology keywords in ``text``, alphabetically.

        Matching is a case-insensitive substring test.
        """
        lowered = text.lower()
        return [kw for kw in sorted(self.tech_keywords) if kw in lowered]

    def _detect_pii(self, text: str) -> Dict:
        """Count PII matches in ``text`` by type.

        Returns:
            A dict mapping ``email``, ``phone``, ``ssn`` or ``credit_card``
            to the number of matches; types with no match are omitted.
        """
        detected = {}
        for pii_type in ('email', 'phone', 'ssn', 'credit_card'):
            hits = re.findall(self.pii_patterns[pii_type], text)
            if hits:
                detected[pii_type] = len(hits)
        return detected

    def _detect_credentials(self, text: str) -> bool:
        """Return True if ``text`` appears to contain an API key or password."""
        return any(
            re.search(self.pii_patterns[name], text)
            for name in ('api_key', 'password')
        )

    def _assess_quality(self, text: str) -> str:
        """Classify ``text`` as 'empty', 'garbled', 'low_confidence' or 'good'.

        * 'empty': fewer than 10 non-padding characters, or no words.
        * 'garbled': average word length below 2 or above 20.
        * 'low_confidence': more than 40% of characters are neither
          alphanumeric nor whitespace.
        * 'good': everything else.
        """
        if not text or len(text.strip()) < 10:
            return 'empty'

        words = text.split()
        if not words:
            return 'empty'

        avg_word_len = sum(len(w) for w in words) / len(words)
        if avg_word_len < 2 or avg_word_len > 20:
            return 'garbled'

        special_chars = sum(
            1 for c in text if not c.isalnum() and not c.isspace()
        )
        if special_chars / len(text) > 0.4:
            return 'low_confidence'

        return 'good'

    @staticmethod
    def _unique(items: List[str]) -> List[str]:
        """Return ``items`` without duplicates, keeping first-seen order."""
        return list(dict.fromkeys(items))