clean up code
This commit is contained in:
@@ -1,59 +1,119 @@
|
||||
from typing import Dict
|
||||
from typing import Dict, List
|
||||
import re
|
||||
|
||||
class ContentEnricher:
    """Derive lightweight, rule-based metadata from a piece of text.

    ``enrich`` combines a short extractive summary, keyword/topic detection,
    file/URL/path extraction, PII and credential detection and a coarse
    quality label. When an ``llm_client`` is supplied and ``use_llm`` is set,
    the client's ``summarize``, ``extract_intent`` and ``extract_topics``
    results are added as well.
    """

    tech_keywords = {
        'transcribe', 'transcription', 'whisper', 'speech-to-text', 'audio',
        'video', 'subtitle', 'caption', 'srt', 'vtt', 'ffmpeg', 'opencv',
        'pytorch', 'tensorflow', 'cuda', 'gpu', 'ml', 'nlp', 'llm', 'ollama',
        'docker', 'kubernetes', 'postgres', 'database', 'api', 'rest',
        'graphql', 'python', 'javascript', 'java', 'rust', 'golang',
    }

    # Keys of ``pii_patterns`` that describe personal data vs. credentials.
    _PII_TYPES = ('email', 'phone', 'ssn', 'credit_card')
    _CREDENTIAL_TYPES = ('api_key', 'password')

    # Capitalised words that are too generic to count as a topic.
    _TOPIC_STOPWORDS = frozenset({'this', 'that', 'with', 'from', 'have'})

    _SUMMARY_LIMIT = 200     # maximum characters in the rule-based summary
    _LLM_INPUT_LIMIT = 3000  # characters of text handed to the LLM client

    # The extension group is non-capturing so ``findall`` yields the whole
    # file name ("main.py") rather than just the extension ("py").
    _FILE_RE = re.compile(
        r'\b\w+\.(?:py|js|java|go|rs|cpp|h|md|txt|json|yaml|yml|xml|sql|sh|bat|ts|tsx|jsx)\b'
    )
    _URL_RE = re.compile(r'https?://[^\s<>"{}|\\^`\[\]]+')
    _PATH_RE = re.compile(r'(?:/[a-zA-Z0-9_.-]+)+/?')
    _CAPITALISED_WORD_RE = re.compile(r'\b[A-Z][a-z]+\b')
    _SENTENCE_SPLIT_RE = re.compile(r'[.!?\n]+')

    def __init__(self, llm_client=None):
        """Store the optional LLM client and the detection patterns.

        Args:
            llm_client: Optional object providing ``summarize``,
                ``extract_intent`` and ``extract_topics``; only used by
                ``enrich`` when ``use_llm`` is true.
        """
        self.llm_client = llm_client
        self.pii_patterns = {
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
            'ssn': r'\b\d{3}-\d{2}-\d{4}\b',
            'credit_card': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b',
            'api_key': r'(?i)(api[_-]?key|token|secret)["\']?\s*[:=]\s*["\']?([a-zA-Z0-9_\-]{20,})',
            'password': r'(?i)(password|passwd|pwd)["\']?\s*[:=]\s*["\']([^"\']{8,})',
        }

    def enrich(self, text: str, use_llm: bool = False) -> Dict:
        """Return a dictionary of metadata describing ``text``.

        Args:
            text: The content to analyse. ``None`` is treated as ``''``.
            use_llm: When true and an LLM client was supplied, add the
                ``llm_summary``, ``llm_intent`` and ``llm_topics`` entries
                for every client call that reports success.

        Returns:
            A dict with the keys ``summary``, ``word_count``, ``topics``,
            ``entities``, ``tech_stack``, ``security`` and ``quality``, plus
            the optional ``llm_*`` keys.
        """
        text = text or ''
        # Scan for PII once and reuse the result for both security fields.
        pii_details = self._detect_pii(text)

        enrichment = {
            'summary': self._basic_summary(text),
            'word_count': len(text.split()),
            'topics': self._extract_topics(text),
            'entities': self._extract_entities(text),
            'tech_stack': self._detect_tech(text),
            'security': {
                'has_pii': bool(pii_details),
                'has_credentials': self._detect_credentials(text),
                'pii_details': pii_details,
            },
            'quality': self._assess_quality(text),
        }

        if use_llm and self.llm_client:
            snippet = text[:self._LLM_INPUT_LIMIT]

            summary_result = self.llm_client.summarize(snippet, max_length=200)
            if summary_result.get('success'):
                enrichment['llm_summary'] = summary_result['text']

            intent_result = self.llm_client.extract_intent(snippet)
            if intent_result.get('success'):
                enrichment['llm_intent'] = intent_result['text']

            topics_result = self.llm_client.extract_topics(snippet)
            if topics_result.get('success') and topics_result.get('topics'):
                enrichment['llm_topics'] = topics_result['topics']

        return enrichment

    @staticmethod
    def _unique(items, limit: int) -> List[str]:
        """Return the first ``limit`` distinct items, keeping first-seen order."""
        return list(dict.fromkeys(items))[:limit]

    def _basic_summary(self, text: str) -> str:
        """Join leading sentences of ``text`` into a summary of at most 200 chars.

        Sentences are split on ``.``, ``!``, ``?`` and newlines and re-joined
        with ``'. '``. If not even the first sentence fits, the first 200
        characters of the raw text are returned instead.
        """
        if not text:
            return ''

        separator = '. '
        limit = self._SUMMARY_LIMIT
        chosen = []
        used = 0
        for fragment in self._SENTENCE_SPLIT_RE.split(text):
            sentence = fragment.strip()
            if not sentence:
                continue
            # Count the separator too, so the joined result stays within limit.
            cost = len(sentence) + (len(separator) if chosen else 0)
            if used + cost > limit:
                break
            chosen.append(sentence)
            used += cost

        return separator.join(chosen) if chosen else text[:limit]

    def _extract_topics(self, text: str) -> List[str]:
        """Return up to 15 topics: tech keywords first, then frequent proper words.

        Tech keywords are reported in alphabetical order. They are followed by
        the five most frequent capitalised words longer than three letters,
        most frequent first.
        """
        topics = self._detect_tech(text)

        word_freq = {}
        for word in self._CAPITALISED_WORD_RE.findall(text):
            if len(word) > 3 and word.lower() not in self._TOPIC_STOPWORDS:
                word_freq[word] = word_freq.get(word, 0) + 1

        # sorted() is stable, so equally frequent words keep first-seen order.
        ranked = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
        topics.extend(word for word, _ in ranked[:5])

        return self._unique(topics, 15)

    def _extract_entities(self, text: str) -> Dict[str, List[str]]:
        """Return distinct file names, URLs and slash-separated paths in ``text``.

        Each list keeps first-seen order and is capped (10 files, 5 URLs,
        10 paths).
        """
        return {
            'files': self._unique(self._FILE_RE.findall(text), 10),
            'urls': self._unique(self._URL_RE.findall(text), 5),
            'paths': self._unique(self._PATH_RE.findall(text), 10),
        }

    def _detect_tech(self, text: str) -> List[str]:
        """Return the tech keywords contained in ``text``, alphabetically sorted.

        Matching is a case-insensitive substring test, so a short keyword such
        as ``'ml'`` also matches inside a longer word such as ``'html'``.
        """
        text_lower = text.lower()
        # Sorted because iterating the keyword set has no stable order.
        return sorted(tech for tech in self.tech_keywords if tech in text_lower)

    def _detect_pii(self, text: str) -> Dict:
        """Return a ``{pii_type: match_count}`` dict for PII types found in ``text``."""
        detected = {}
        for pii_type in self._PII_TYPES:
            matches = re.findall(self.pii_patterns[pii_type], text)
            if matches:
                detected[pii_type] = len(matches)
        return detected

    def _detect_credentials(self, text: str) -> bool:
        """Return True when ``text`` contains an API key/token or password assignment."""
        return any(
            re.search(self.pii_patterns[name], text)
            for name in self._CREDENTIAL_TYPES
        )

    def _assess_quality(self, text: str) -> str:
        """Classify ``text`` as ``'empty'``, ``'garbled'``, ``'low_confidence'`` or ``'good'``.

        * ``'empty'``: fewer than 10 non-padding characters.
        * ``'garbled'``: average word length below 2 or above 20.
        * ``'low_confidence'``: more than 40% of the characters are neither
          alphanumeric nor whitespace.
        * ``'good'``: everything else.
        """
        if not text or len(text.strip()) < 10:
            return 'empty'

        # The guard above guarantees at least one word and a non-empty text,
        # so neither division below can hit zero.
        words = text.split()
        avg_word_len = sum(len(word) for word in words) / len(words)
        if avg_word_len < 2 or avg_word_len > 20:
            return 'garbled'

        special_count = sum(
            1 for char in text if not char.isalnum() and not char.isspace()
        )
        if special_count / len(text) > 0.4:
            return 'low_confidence'

        return 'good'
|
||||
|
||||
Reference in New Issue
Block a user