initial
This commit is contained in:
59
app/enrichment/enricher.py
Normal file
59
app/enrichment/enricher.py
Normal file
@@ -0,0 +1,59 @@
|
||||
from typing import Dict
|
||||
import re
|
||||
|
||||
class ContentEnricher:
|
||||
def __init__(self, llm_client=None):
|
||||
self.llm_client = llm_client
|
||||
self.pii_patterns = {
|
||||
'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
|
||||
'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
|
||||
'ssn': r'\b\d{3}-\d{2}-\d{4}\b',
|
||||
'credit_card': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b'
|
||||
}
|
||||
|
||||
def enrich(self, text: str, use_llm: bool = False) -> Dict:
|
||||
enrichment = {
|
||||
'summary': self._basic_summary(text),
|
||||
'word_count': len(text.split()),
|
||||
'has_pii': self._detect_pii(text),
|
||||
'quality': self._assess_quality(text),
|
||||
'topics': self._extract_basic_topics(text)
|
||||
}
|
||||
|
||||
if use_llm and self.llm_client:
|
||||
llm_result = self.llm_client.classify_content(text)
|
||||
if llm_result.get('success'):
|
||||
enrichment['llm_classification'] = llm_result['text']
|
||||
|
||||
return enrichment
|
||||
|
||||
def _basic_summary(self, text: str) -> str:
|
||||
sentences = re.split(r'[.!?]+', text)
|
||||
return ' '.join(sentences[:3])[:200]
|
||||
|
||||
def _detect_pii(self, text: str) -> Dict:
|
||||
detected = {}
|
||||
for pii_type, pattern in self.pii_patterns.items():
|
||||
matches = re.findall(pattern, text)
|
||||
if matches:
|
||||
detected[pii_type] = len(matches)
|
||||
return detected
|
||||
|
||||
def _assess_quality(self, text: str) -> str:
|
||||
if len(text.strip()) < 10:
|
||||
return 'low'
|
||||
|
||||
special_char_ratio = sum(1 for c in text if not c.isalnum() and not c.isspace()) / len(text)
|
||||
if special_char_ratio > 0.3:
|
||||
return 'low'
|
||||
|
||||
return 'high' if len(text.split()) > 50 else 'medium'
|
||||
|
||||
def _extract_basic_topics(self, text: str) -> list:
|
||||
words = re.findall(r'\b[A-Z][a-z]+\b', text)
|
||||
word_freq = {}
|
||||
for word in words:
|
||||
if len(word) > 3:
|
||||
word_freq[word] = word_freq.get(word, 0) + 1
|
||||
|
||||
return sorted(word_freq, key=word_freq.get, reverse=True)[:10]
|
||||
54
app/enrichment/llm_client.py
Normal file
54
app/enrichment/llm_client.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import requests
|
||||
import json
|
||||
from typing import Dict, Optional
|
||||
|
||||
class LLMClient:
|
||||
def __init__(self, endpoint: str = 'http://192.168.1.74:1234', model: str = 'local'):
|
||||
self.endpoint = endpoint
|
||||
self.model = model
|
||||
self.local_ollama = 'http://localhost:11434'
|
||||
|
||||
def summarize(self, text: str, max_length: int = 200) -> Dict:
|
||||
prompt = f"Summarize the following in {max_length} chars or less:\n\n{text[:2000]}"
|
||||
return self._query(prompt)
|
||||
|
||||
def extract_topics(self, text: str) -> Dict:
|
||||
prompt = f"Extract 5-10 key topics/tags from this text. Return as comma-separated list:\n\n{text[:2000]}"
|
||||
return self._query(prompt)
|
||||
|
||||
def classify_content(self, text: str) -> Dict:
|
||||
prompt = f"Classify this content. Return: category, topics, has_pii (yes/no), quality (high/medium/low):\n\n{text[:1000]}"
|
||||
return self._query(prompt)
|
||||
|
||||
def _query(self, prompt: str, use_local: bool = False) -> Dict:
|
||||
try:
|
||||
endpoint = self.local_ollama if use_local else self.endpoint
|
||||
|
||||
if use_local:
|
||||
response = requests.post(
|
||||
f'{endpoint}/api/generate',
|
||||
json={'model': 'llama3.2', 'prompt': prompt, 'stream': False},
|
||||
timeout=30
|
||||
)
|
||||
else:
|
||||
response = requests.post(
|
||||
f'{endpoint}/v1/chat/completions',
|
||||
json={
|
||||
'model': self.model,
|
||||
'messages': [{'role': 'user', 'content': prompt}],
|
||||
'max_tokens': 500
|
||||
},
|
||||
timeout=30
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
if use_local:
|
||||
return {'success': True, 'text': data.get('response', '')}
|
||||
else:
|
||||
return {'success': True, 'text': data['choices'][0]['message']['content']}
|
||||
else:
|
||||
return {'success': False, 'error': f'HTTP {response.status_code}'}
|
||||
|
||||
except Exception as e:
|
||||
return {'success': False, 'error': str(e)}
|
||||
Reference in New Issue
Block a user