import re
from collections import Counter
from typing import Dict


class ContentEnricher:
    """Derive lightweight metadata (summary, PII counts, quality, topics) from text.

    All heuristics are regex/string based. An optional LLM client can add a
    classification on top of the basic enrichment.
    """

    def __init__(self, llm_client=None):
        """Store the optional LLM client and set up the PII regex patterns.

        Args:
            llm_client: Optional object exposing ``classify_content(text)``,
                which is expected to return a dict with ``'success'`` and
                ``'text'`` keys (only used by :meth:`enrich`).
        """
        self.llm_client = llm_client
        self.pii_patterns = {
            # The TLD class is [A-Za-z]; a '|' inside a character class is a
            # literal pipe, not alternation, so it must not appear here.
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
            'ssn': r'\b\d{3}-\d{2}-\d{4}\b',
            'credit_card': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b',
        }

    def enrich(self, text: str, use_llm: bool = False) -> Dict:
        """Build the enrichment record for ``text``.

        Args:
            text: The content to analyse.
            use_llm: When true and an LLM client was supplied, also ask the
                client to classify the content.

        Returns:
            A dict with keys ``'summary'``, ``'word_count'``, ``'has_pii'``,
            ``'quality'`` and ``'topics'``; ``'llm_classification'`` is added
            only when the LLM call reports success.
        """
        enrichment = {
            'summary': self._basic_summary(text),
            'word_count': len(text.split()),
            'has_pii': self._detect_pii(text),
            'quality': self._assess_quality(text),
            'topics': self._extract_basic_topics(text),
        }

        if use_llm and self.llm_client:
            llm_result = self.llm_client.classify_content(text)
            if llm_result.get('success'):
                enrichment['llm_classification'] = llm_result['text']

        return enrichment

    def _basic_summary(self, text: str) -> str:
        """Return the first three sentences of ``text``, capped at 200 chars.

        Sentences are split on runs of ``.``, ``!`` or ``?`` (the terminators
        themselves are dropped). Fragments are stripped and empty ones are
        skipped so the summary has single spaces between sentences and no
        leading/trailing padding.
        """
        fragments = (piece.strip() for piece in re.split(r'[.!?]+', text))
        sentences = [piece for piece in fragments if piece]
        return ' '.join(sentences[:3])[:200]

    def _detect_pii(self, text: str) -> Dict:
        """Count matches of each PII pattern in ``text``.

        Returns:
            A dict mapping PII type name to its number of matches; types with
            no match are omitted, so an empty dict means nothing was detected.
        """
        detected = {}
        for pii_type, pattern in self.pii_patterns.items():
            hits = sum(1 for _ in re.finditer(pattern, text))
            if hits:
                detected[pii_type] = hits
        return detected

    def _assess_quality(self, text: str) -> str:
        """Classify ``text`` as ``'low'``, ``'medium'`` or ``'high'`` quality.

        ``'low'`` when fewer than 10 non-padding characters remain after
        stripping, or when more than 30% of all characters are neither
        alphanumeric nor whitespace. Otherwise ``'high'`` for more than 50
        whitespace-separated words, else ``'medium'``.
        """
        if len(text.strip()) < 10:
            return 'low'

        # len(text) >= 10 here, so the division below cannot hit zero.
        special_chars = sum(
            1 for c in text if not c.isalnum() and not c.isspace()
        )
        if special_chars / len(text) > 0.3:
            return 'low'

        return 'high' if len(text.split()) > 50 else 'medium'

    def _extract_basic_topics(self, text: str) -> list:
        """Return up to 10 capitalised words, most frequent first.

        A candidate is a word made of one uppercase ASCII letter followed by
        lowercase ASCII letters, longer than three characters. Ties keep the
        order in which the words first appear in ``text``.
        """
        candidates = re.findall(r'\b[A-Z][a-z]+\b', text)
        counts = Counter(word for word in candidates if len(word) > 3)
        return [word for word, _ in counts.most_common(10)]