60 lines
2.0 KiB
Python
60 lines
2.0 KiB
Python
from typing import Dict
|
|
import re
|
|
|
|
class ContentEnricher:
    """Derive lightweight metadata from a piece of text.

    The enrichment is heuristic and regex based: a short summary, a word
    count, counts of PII-looking substrings, a coarse quality label and a
    list of frequently occurring capitalized words. An optional LLM client
    can contribute an additional classification.
    """

    # Tunables for the heuristics below, named so the intent is visible.
    _SUMMARY_SENTENCES = 3
    _SUMMARY_MAX_CHARS = 200
    _MIN_QUALITY_CHARS = 10
    _MAX_SPECIAL_CHAR_RATIO = 0.3
    _HIGH_QUALITY_MIN_WORDS = 50
    _MAX_TOPICS = 10

    def __init__(self, llm_client=None):
        """Create an enricher.

        Args:
            llm_client: Optional object exposing ``classify_content(text)``.
                It is only consulted when ``enrich(..., use_llm=True)`` is
                called; when it is ``None`` the LLM step is skipped.
        """
        self.llm_client = llm_client
        # Pattern strings (not compiled objects) keyed by PII type.
        self.pii_patterns = {
            # NOTE: inside a character class ``|`` is a literal pipe, not
            # alternation, so the TLD class must be ``[A-Za-z]``.
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
            'ssn': r'\b\d{3}-\d{2}-\d{4}\b',
            'credit_card': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b',
        }

    def enrich(self, text: str, use_llm: bool = False) -> Dict:
        """Compute all enrichment fields for ``text``.

        Args:
            text: The content to analyse.
            use_llm: When true and an ``llm_client`` was supplied, also ask
                the client to classify the content.

        Returns:
            A dict with the keys ``summary``, ``word_count``, ``has_pii``,
            ``quality`` and ``topics``. Despite its name, ``has_pii`` is a
            dict mapping each detected PII type to its match count (empty
            when nothing was detected). ``llm_classification`` is added only
            when the LLM call reports success.
        """
        enrichment = {
            'summary': self._basic_summary(text),
            'word_count': len(text.split()),
            'has_pii': self._detect_pii(text),
            'quality': self._assess_quality(text),
            'topics': self._extract_basic_topics(text),
        }

        if use_llm and self.llm_client:
            # The client's result is read as a mapping with a truthy
            # 'success' flag and a 'text' payload.
            llm_result = self.llm_client.classify_content(text)
            if llm_result.get('success'):
                enrichment['llm_classification'] = llm_result['text']

        return enrichment

    def _basic_summary(self, text: str) -> str:
        """Return the first few sentences of ``text``, capped in length.

        Sentences are split on runs of ``.``, ``!`` or ``?`` (the terminators
        themselves are dropped). Fragments are stripped and empty ones are
        discarded so the summary has no doubled, leading or trailing spaces
        and empty fragments do not use up the sentence budget.
        """
        stripped = (part.strip() for part in re.split(r'[.!?]+', text))
        sentences = [part for part in stripped if part]
        summary = ' '.join(sentences[:self._SUMMARY_SENTENCES])
        return summary[:self._SUMMARY_MAX_CHARS]

    def _detect_pii(self, text: str) -> Dict:
        """Count matches of every configured PII pattern in ``text``.

        Returns:
            A dict mapping PII type to the number of matches; types with no
            match are omitted.
        """
        detected = {}
        for pii_type, pattern in self.pii_patterns.items():
            match_count = len(re.findall(pattern, text))
            if match_count:
                detected[pii_type] = match_count
        return detected

    def _assess_quality(self, text: str) -> str:
        """Label ``text`` as ``'low'``, ``'medium'`` or ``'high'`` quality.

        Text is ``'low'`` when it is very short after stripping or when more
        than 30% of its characters are neither alphanumeric nor whitespace.
        Otherwise it is ``'high'`` if it has more than 50 words, else
        ``'medium'``.
        """
        # This guard also guarantees len(text) > 0 for the division below.
        if len(text.strip()) < self._MIN_QUALITY_CHARS:
            return 'low'

        special_chars = sum(
            1 for char in text if not (char.isalnum() or char.isspace())
        )
        if special_chars / len(text) > self._MAX_SPECIAL_CHAR_RATIO:
            return 'low'

        if len(text.split()) > self._HIGH_QUALITY_MIN_WORDS:
            return 'high'
        return 'medium'

    def _extract_basic_topics(self, text: str) -> list:
        """Return up to ten capitalized words, most frequent first.

        A candidate is an ASCII word made of one uppercase letter followed by
        at least three lowercase letters. Ties keep first-occurrence order
        because the sort is stable.
        """
        word_freq = {}
        # ``{3,}`` folds the "longer than three characters" filter into the
        # pattern itself.
        for word in re.findall(r'\b[A-Z][a-z]{3,}\b', text):
            word_freq[word] = word_freq.get(word, 0) + 1

        ranked = sorted(word_freq, key=word_freq.get, reverse=True)
        return ranked[:self._MAX_TOPICS]
|