clean up code
@@ -1,59 +1,119 @@
-from typing import Dict
+from typing import Dict, List
 import re
 
 class ContentEnricher:
+    tech_keywords = {'transcribe', 'transcription', 'whisper', 'speech-to-text', 'audio', 'video', 'subtitle', 'caption', 'srt', 'vtt', 'ffmpeg', 'opencv', 'pytorch', 'tensorflow', 'cuda', 'gpu', 'ml', 'nlp', 'llm', 'ollama', 'docker', 'kubernetes', 'postgres', 'database', 'api', 'rest', 'graphql', 'python', 'javascript', 'java', 'rust', 'golang'}
 
     def __init__(self, llm_client=None):
         self.llm_client = llm_client
         self.pii_patterns = {
             'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
             'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
             'ssn': r'\b\d{3}-\d{2}-\d{4}\b',
-            'credit_card': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b'
+            'credit_card': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b',
+            'api_key': r'(?i)(api[_-]?key|token|secret)["\']?\s*[:=]\s*["\']?([a-zA-Z0-9_\-]{20,})',
+            'password': r'(?i)(password|passwd|pwd)["\']?\s*[:=]\s*["\']([^"\']{8,})'
         }
 
     def enrich(self, text: str, use_llm: bool = False) -> Dict:
         enrichment = {
             'summary': self._basic_summary(text),
             'word_count': len(text.split()),
-            'has_pii': self._detect_pii(text),
-            'quality': self._assess_quality(text),
-            'topics': self._extract_basic_topics(text)
+            'topics': self._extract_topics(text),
+            'entities': self._extract_entities(text),
+            'tech_stack': self._detect_tech(text),
+            'security': {
+                'has_pii': bool(self._detect_pii(text)),
+                'has_credentials': self._detect_credentials(text),
+                'pii_details': self._detect_pii(text)
+            },
+            'quality': self._assess_quality(text)
         }
 
         if use_llm and self.llm_client:
-            llm_result = self.llm_client.classify_content(text)
-            if llm_result.get('success'):
-                enrichment['llm_classification'] = llm_result['text']
+            summary_result = self.llm_client.summarize(text[:3000], max_length=200)
+            if summary_result.get('success'):
+                enrichment['llm_summary'] = summary_result['text']
+
+            intent_result = self.llm_client.extract_intent(text[:3000])
+            if intent_result.get('success'):
+                enrichment['llm_intent'] = intent_result['text']
+
+            topics_result = self.llm_client.extract_topics(text[:3000])
+            if topics_result.get('success') and topics_result.get('topics'):
+                enrichment['llm_topics'] = topics_result['topics']
 
         return enrichment
 
     def _basic_summary(self, text: str) -> str:
-        sentences = re.split(r'[.!?]+', text)
-        return ' '.join(sentences[:3])[:200]
+        if not text:
+            return ''
+        sentences = re.split(r'[.!?\n]+', text)
+        summary = []
+        length = 0
+        for sent in sentences:
+            sent = sent.strip()
+            if not sent:
+                continue
+            if length + len(sent) > 200:
+                break
+            summary.append(sent)
+            length += len(sent)
+        return '. '.join(summary) if summary else text[:200]
+
+    def _extract_topics(self, text: str) -> List[str]:
+        text_lower = text.lower()
+        topics = []
+        for tech in self.tech_keywords:
+            if tech in text_lower:
+                topics.append(tech)
+        words = re.findall(r'\b[A-Z][a-z]+\b', text)
+        word_freq = {}
+        for word in words:
+            if len(word) > 3 and word.lower() not in {'this', 'that', 'with', 'from', 'have'}:
+                word_freq[word] = word_freq.get(word, 0) + 1
+        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
+        topics.extend([w for (w, _) in sorted_words[:5]])
+        return list(set(topics))[:15]
+
+    def _extract_entities(self, text: str) -> Dict[str, List[str]]:
+        entities = {'files': [], 'urls': [], 'paths': []}
+        file_pattern = re.compile(r'\b\w+\.(py|js|java|go|rs|cpp|h|md|txt|json|yaml|yml|xml|sql|sh|bat|ts|tsx|jsx)\b')
+        entities['files'] = list(set(file_pattern.findall(text)))[:10]
+        url_pattern = re.compile(r'https?://[^\s<>"{}|\\^`\[\]]+')
+        entities['urls'] = list(set(url_pattern.findall(text)))[:5]
+        path_pattern = re.compile(r'(?:/[a-zA-Z0-9_.-]+)+/?')
+        entities['paths'] = list(set(path_pattern.findall(text)))[:10]
+        return entities
+
+    def _detect_tech(self, text: str) -> List[str]:
+        text_lower = text.lower()
+        return [tech for tech in self.tech_keywords if tech in text_lower]
 
     def _detect_pii(self, text: str) -> Dict:
         detected = {}
-        for pii_type, pattern in self.pii_patterns.items():
-            matches = re.findall(pattern, text)
+        for pii_type in ['email', 'phone', 'ssn', 'credit_card']:
+            matches = re.findall(self.pii_patterns[pii_type], text)
             if matches:
                 detected[pii_type] = len(matches)
         return detected
 
+    def _detect_credentials(self, text: str) -> bool:
+        for name in ['api_key', 'password']:
+            if re.search(self.pii_patterns[name], text):
+                return True
+        return False
+
     def _assess_quality(self, text: str) -> str:
-        if len(text.strip()) < 10:
-            return 'low'
+        if not text or len(text.strip()) < 10:
+            return 'empty'
+        words = text.split()
+        if not words:
+            return 'empty'
+        avg_word_len = sum(len(w) for w in words) / len(words)
+        if avg_word_len < 2 or avg_word_len > 20:
+            return 'garbled'
         special_char_ratio = sum(1 for c in text if not c.isalnum() and not c.isspace()) / len(text)
-        if special_char_ratio > 0.3:
-            return 'low'
-        return 'high' if len(text.split()) > 50 else 'medium'
-
-    def _extract_basic_topics(self, text: str) -> list:
-        words = re.findall(r'\b[A-Z][a-z]+\b', text)
-        word_freq = {}
-        for word in words:
-            if len(word) > 3:
-                word_freq[word] = word_freq.get(word, 0) + 1
-
-        return sorted(word_freq, key=word_freq.get, reverse=True)[:10]
+        if special_char_ratio > 0.4:
+            return 'low_confidence'
+        return 'good'
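
Not part of the diff: a minimal usage sketch of the reworked enricher, for review context. The module path enrichment/enricher.py is inferred from the import in app/main.py below; the sample text and the values in the comments are illustrative only.

# Rule-based path only (no llm_client), so the use_llm branch is skipped.
from enrichment.enricher import ContentEnricher

enricher = ContentEnricher()
sample = "Contact admin@example.com about the whisper transcription pipeline and ffmpeg presets."
result = enricher.enrich(sample)

print(result['security']['has_pii'])   # True - the email regex matches
print(result['tech_stack'])            # e.g. ['whisper', 'transcription', 'ffmpeg'] (set order varies)
print(result['quality'])               # 'good' for ordinary prose like this
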
@@ -1,54 +1,70 @@
 import requests
 import json
-from typing import Dict, Optional
+import logging
+from typing import Dict, Optional, List
+
+logger = logging.getLogger(__name__)
 
 class LLMClient:
-    def __init__(self, endpoint: str = 'http://192.168.1.74:1234', model: str = 'local'):
+    def __init__(self, endpoint: str = 'http://localhost:11434', model: str = 'llama3', use_local: bool = True):
         self.endpoint = endpoint
         self.model = model
-        self.local_ollama = 'http://localhost:11434'
+        self.use_local = use_local
+        self.lm_studio_endpoint = 'http://192.168.1.74:1234'
+        self.lm_studio_model = 'openai/gpt-oss-20b'
 
     def summarize(self, text: str, max_length: int = 200) -> Dict:
-        prompt = f"Summarize the following in {max_length} chars or less:\n\n{text[:2000]}"
+        prompt = f"Summarize this concisely in under {max_length} characters:\n\n{text[:3000]}"
         return self._query(prompt)
 
     def extract_topics(self, text: str) -> Dict:
-        prompt = f"Extract 5-10 key topics/tags from this text. Return as comma-separated list:\n\n{text[:2000]}"
+        prompt = f"Extract 5-10 key topics/tags. Return ONLY comma-separated words:\n\n{text[:3000]}"
+        result = self._query(prompt)
+        if result.get('success'):
+            topics = [t.strip() for t in result['text'].split(',')]
+            result['topics'] = topics[:10]
+        return result
+
+    def extract_intent(self, text: str) -> Dict:
+        prompt = f"What is the main purpose/intent of this code/document? Answer in 1-2 sentences:\n\n{text[:3000]}"
         return self._query(prompt)
 
-    def classify_content(self, text: str) -> Dict:
-        prompt = f"Classify this content. Return: category, topics, has_pii (yes/no), quality (high/medium/low):\n\n{text[:1000]}"
+    def detect_project_type(self, text: str, file_list: List[str]) -> Dict:
+        files_str = ', '.join(file_list[:20])
+        prompt = f"Based on these files: {files_str}\nAnd this content:\n{text[:2000]}\n\nWhat type of project is this? (e.g. web app, ml/ai, transcription, data processing, etc.)"
         return self._query(prompt)
 
-    def _query(self, prompt: str, use_local: bool = False) -> Dict:
+    def _query(self, prompt: str, timeout: int = 30) -> Dict:
         try:
-            endpoint = self.local_ollama if use_local else self.endpoint
-
-            if use_local:
+            if self.use_local:
                 response = requests.post(
-                    f'{endpoint}/api/generate',
-                    json={'model': 'llama3.2', 'prompt': prompt, 'stream': False},
-                    timeout=30
+                    f'{self.endpoint}/api/generate',
+                    json={'model': self.model, 'prompt': prompt, 'stream': False},
+                    timeout=timeout
                 )
+                if response.status_code == 200:
+                    data = response.json()
+                    return {'success': True, 'text': data.get('response', '').strip()}
             else:
                 response = requests.post(
-                    f'{endpoint}/v1/chat/completions',
+                    f'{self.lm_studio_endpoint}/v1/chat/completions',
                     json={
-                        'model': self.model,
+                        'model': self.lm_studio_model,
                         'messages': [{'role': 'user', 'content': prompt}],
-                        'max_tokens': 500
+                        'max_tokens': 500,
+                        'temperature': 0.7
                     },
-                    timeout=30
+                    timeout=timeout
                 )
+                if response.status_code == 200:
+                    data = response.json()
+                    return {'success': True, 'text': data['choices'][0]['message']['content'].strip()}
 
-            if response.status_code == 200:
-                data = response.json()
-                if use_local:
-                    return {'success': True, 'text': data.get('response', '')}
-                else:
-                    return {'success': True, 'text': data['choices'][0]['message']['content']}
-            else:
-                return {'success': False, 'error': f'HTTP {response.status_code}'}
+            return {'success': False, 'error': f'HTTP {response.status_code}'}
 
+        except requests.Timeout:
+            logger.warning(f'LLM request timeout after {timeout}s')
+            return {'success': False, 'error': 'timeout'}
         except Exception as e:
+            logger.error(f'LLM query failed: {e}')
             return {'success': False, 'error': str(e)}
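
Not part of the diff: a sketch of how the reworked client is expected to be called. The module path enrichment/llm_client.py is inferred from the import in app/main.py below; the endpoints and model names are the defaults added above, and whether a server is actually listening depends on the environment.

from enrichment.llm_client import LLMClient

client = LLMClient(use_local=True)   # default: Ollama-style /api/generate at http://localhost:11434
summary = client.summarize("Long extracted text ...", max_length=200)
if summary.get('success'):
    print(summary['text'])
else:
    # covers connection errors, timeouts and non-200 responses
    print('LLM unavailable:', summary.get('error'))

topics = client.extract_topics("PyTorch training scripts plus Docker compose files")
print(topics.get('topics', []))      # parsed from the comma-separated reply when the call succeeds
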
app/main.py
@@ -535,14 +535,18 @@ class DiskReorganizer:
 
         try:
             query = "SELECT path, size, disk_label FROM files WHERE 1=1"
-            params = []
             if kind:
-                suffix_map = {'text': "('.txt', '.md', '.log', '.json')", 'code': "('.py', '.js', '.java', '.go')", 'pdf': "('.pdf',)"}
+                suffix_map = {
+                    'text': ['.txt', '.md', '.log', '.json', '.yaml', '.yml'],
+                    'code': ['.py', '.js', '.java', '.go', '.rs', '.ts', '.cpp', '.h'],
+                    'pdf': ['.pdf']
+                }
                 if kind in suffix_map:
-                    query += f" AND RIGHT(path, 4) IN {suffix_map[kind]} OR RIGHT(path, 3) IN {suffix_map[kind]}"
+                    conditions = ' OR '.join([f"path LIKE '%{ext}'" for ext in suffix_map[kind]])
+                    query += f" AND ({conditions})"
             query += f" LIMIT {limit}"
 
-            cursor.execute(query, params)
+            cursor.execute(query)
             files = cursor.fetchall()
 
             print(f"\n=== PARSING FILES ===\nProcessing {len(files)} files\n")
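
Not part of the diff: the WHERE clause the new suffix_map logic builds, shown standalone (values copied from the hunk above; pure string assembly, no database access).

suffix_map = {'code': ['.py', '.js', '.java', '.go', '.rs', '.ts', '.cpp', '.h']}
conditions = ' OR '.join([f"path LIKE '%{ext}'" for ext in suffix_map['code']])
query = "SELECT path, size, disk_label FROM files WHERE 1=1" + f" AND ({conditions})" + " LIMIT 10"
print(query)
# SELECT path, size, disk_label FROM files WHERE 1=1 AND (path LIKE '%.py' OR path LIKE '%.js' OR ...) LIMIT 10
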
@@ -580,30 +584,63 @@ class DiskReorganizer:
             cursor.close()
             conn.close()
 
-    def enrich_files(self, limit: int = 10, llm_endpoint: str = None, use_local: bool = False):
+    def enrich_files(self, limit: int = 10, use_llm: bool = False, use_local: bool = True, batch_size: int = 100):
         from enrichment.enricher import ContentEnricher
+        from enrichment.llm_client import LLMClient
+
+        llm_client = LLMClient(use_local=use_local) if use_llm else None
+        enricher = ContentEnricher(llm_client=llm_client)
 
-        enricher = ContentEnricher()
         conn = self.get_connection()
         cursor = conn.cursor()
 
         try:
-            cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL LIMIT {limit}")
+            cursor.execute(f"SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL AND (enrichment IS NULL OR enrichment = '{{}}'::jsonb) LIMIT {limit}")
             files = cursor.fetchall()
 
-            print(f"\n=== ENRICHING CONTENT ===\nProcessing {len(files)} files\n")
+            print(f"\n=== ENRICHING CONTENT ===")
+            print(f"Processing {len(files)} files")
+            if use_llm:
+                print(f"Using LLM: {'Local OLLAMA' if use_local else 'Network LM_STUDIO'}\n")
+            else:
+                print("Using rule-based enrichment only\n")
 
-            for path, text in files:
-                enrichment = enricher.enrich(text[:5000], use_llm=False)
-                print(f"{path[:60]}")
+            enriched_count = 0
+            batch = []
+            for idx, (path, text) in enumerate(files, 1):
+                if not text:
+                    continue
+
+                enrichment = enricher.enrich(text[:5000], use_llm=use_llm)
+
+                print(f"{idx}/{len(files)} {path[:60]}")
                 print(f" Quality: {enrichment.get('quality')} | Words: {enrichment.get('word_count'):,}")
-                print(f" PII: {list(enrichment.get('has_pii', {}).keys())}")
-                print(f" Topics: {', '.join(enrichment.get('topics', [])[:5])}\n")
+                if enrichment.get('security', {}).get('has_pii'):
+                    print(f" PII: {list(enrichment.get('security', {}).get('pii_details', {}).keys())}")
+                if enrichment.get('tech_stack'):
+                    print(f" Tech: {', '.join(enrichment['tech_stack'][:5])}")
+                if enrichment.get('topics'):
+                    print(f" Topics: {', '.join(enrichment['topics'][:5])}")
+                if use_llm and enrichment.get('llm_summary'):
+                    print(f" LLM Summary: {enrichment['llm_summary'][:100]}...")
+                if use_llm and enrichment.get('llm_intent'):
+                    print(f" Intent: {enrichment['llm_intent'][:100]}...")
+                print()
 
-                cursor.execute("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", (json.dumps(enrichment), path))
+                batch.append((json.dumps(enrichment), path))
+                enriched_count += 1
 
-            conn.commit()
-            print(f"Enriched {len(files)} files")
+                if len(batch) >= batch_size:
+                    cursor.executemany("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", batch)
+                    conn.commit()
+                    batch.clear()
+                    print(f" Committed batch ({enriched_count} files so far)")
+
+            if batch:
+                cursor.executemany("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", batch)
+                conn.commit()
+
+            print(f"\nEnriched {enriched_count} files")
 
         finally:
             cursor.close()
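
Not part of the diff: the batch/commit pattern introduced above, isolated into a runnable sketch. FakeCursor and FakeConn are stand-ins for the psycopg2 objects; the UPDATE string is the one from the hunk and is not actually executed here.

import json

class FakeCursor:
    def executemany(self, sql, rows):
        print(f'executemany: {len(rows)} rows')

class FakeConn:
    def commit(self):
        print('commit')

cursor, conn = FakeCursor(), FakeConn()
batch_size = 2
batch = []
for path, enrichment in [('/a.txt', {'quality': 'good'}), ('/b.txt', {'quality': 'empty'}), ('/c.txt', {'quality': 'good'})]:
    batch.append((json.dumps(enrichment), path))
    if len(batch) >= batch_size:
        cursor.executemany("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", batch)
        conn.commit()
        batch.clear()
if batch:   # flush the remainder, as enrich_files does after its loop
    cursor.executemany("UPDATE files SET enrichment = %s::jsonb WHERE path = %s", batch)
    conn.commit()
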
@@ -695,6 +732,75 @@ class DiskReorganizer:
             cursor.close()
             conn.close()
 
+    def search_content(self, query: str, limit: int=20, search_type: str='text'):
+        conn = self.get_connection()
+        cursor = conn.cursor()
+        try:
+            if search_type == 'text':
+                cursor.execute('''
+                    SELECT path, disk_label, size, category,
+                           ts_rank(to_tsvector('english', COALESCE(extracted_text, '')), plainto_tsquery('english', %s)) as rank,
+                           LEFT(extracted_text, 200) as snippet
+                    FROM files
+                    WHERE extracted_text IS NOT NULL
+                    AND to_tsvector('english', extracted_text) @@ plainto_tsquery('english', %s)
+                    ORDER BY rank DESC
+                    LIMIT %s
+                ''', (query, query, limit))
+            elif search_type == 'enrichment':
+                cursor.execute('''
+                    SELECT path, disk_label, size, category, enrichment
+                    FROM files
+                    WHERE enrichment IS NOT NULL
+                    AND enrichment::text ILIKE %s
+                    LIMIT %s
+                ''', (f'%{query}%', limit))
+            elif search_type == 'path':
+                cursor.execute('''
+                    SELECT path, disk_label, size, category
+                    FROM files
+                    WHERE path ILIKE %s
+                    LIMIT %s
+                ''', (f'%{query}%', limit))
+            else:
+                logger.error(f'Unknown search type: {search_type}')
+                return
+
+            results = cursor.fetchall()
+            if not results:
+                print(f'No results found for: {query}')
+                return
+
+            print(f'\n=== SEARCH RESULTS: {len(results)} matches for "{query}" ===\n')
+            for idx, row in enumerate(results, 1):
+                if search_type == 'text':
+                    path, disk, size, category, rank, snippet = row
+                    print(f'{idx}. {path}')
+                    print(f' Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')
+                    print(f' Rank: {rank:.4f}')
+                    if snippet:
+                        print(f' Snippet: {snippet[:150]}...')
+                elif search_type == 'enrichment':
+                    path, disk, size, category, enrichment = row
+                    print(f'{idx}. {path}')
+                    print(f' Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')
+                    if enrichment:
+                        import json
+                        enrich_data = json.loads(enrichment) if isinstance(enrichment, str) else enrichment
+                        if 'topics' in enrich_data:
+                            print(f' Topics: {", ".join(enrich_data["topics"][:5])}')
+                        if 'tech_stack' in enrich_data:
+                            print(f' Tech: {", ".join(enrich_data["tech_stack"][:5])}')
+                else:
+                    path, disk, size, category = row
+                    print(f'{idx}. {path}')
+                    print(f' Disk: {disk}, Size: {self.format_size(int(size))}, Category: {category}')
+                print()
+
+        finally:
+            cursor.close()
+            conn.close()
 
     def analyze_folders(self, disk: Optional[str]=None, min_files: int=3):
         from analysis.folder_analyzer import FolderAnalyzer
         analyzer = FolderAnalyzer()
@@ -866,8 +972,8 @@ def main():
 
     enrich_parser = subparsers.add_parser('enrich', help='Enrich content with LLM analysis')
     enrich_parser.add_argument('--limit', type=int, default=10, help='Limit enrichment batch')
-    enrich_parser.add_argument('--llm-endpoint', default='http://192.168.1.74:1234', help='LLM endpoint')
-    enrich_parser.add_argument('--local', action='store_true', help='Use local Ollama')
+    enrich_parser.add_argument('--use-llm', action='store_true', help='Use LLM for summarization')
+    enrich_parser.add_argument('--network', action='store_true', help='Use network LM_STUDIO instead of local OLLAMA')
 
     classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization')
     classify_parser.add_argument('--disk', help='Classify specific disk')
@@ -876,6 +982,10 @@ def main():
     folders_parser = subparsers.add_parser('analyze-folders', help='Analyze folder structure and infer project intent')
     folders_parser.add_argument('--disk', help='Analyze specific disk')
     folders_parser.add_argument('--min-files', type=int, default=3, help='Minimum files per folder')
+    search_parser = subparsers.add_parser('search', help='Search indexed content')
+    search_parser.add_argument('query', help='Search query')
+    search_parser.add_argument('--type', choices=['text', 'enrichment', 'path'], default='enrichment', help='Search type')
+    search_parser.add_argument('--limit', type=int, default=20, help='Max results')
     review_parser = subparsers.add_parser('review', help='Review proposed migration structure')
     review_parser.add_argument('--category', help='Review specific category')
     review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts')
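
Not part of the diff: the new search subcommand reproduced as a standalone argparse sketch, to show the defaults it resolves to. The argument definitions are copied from the hunk above; the prog name is made up.

import argparse

parser = argparse.ArgumentParser(prog='disk-reorganizer')   # prog name is illustrative
subparsers = parser.add_subparsers(dest='command')
search_parser = subparsers.add_parser('search', help='Search indexed content')
search_parser.add_argument('query', help='Search query')
search_parser.add_argument('--type', choices=['text', 'enrichment', 'path'], default='enrichment', help='Search type')
search_parser.add_argument('--limit', type=int, default=20, help='Max results')

args = parser.parse_args(['search', 'whisper transcription', '--type', 'text', '--limit', '5'])
print(args.command, args.type, args.limit)   # search text 5
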
@@ -905,11 +1015,13 @@ def main():
     elif args.command == 'parse':
         tool.parse_files(kind=args.kind, limit=args.limit, update_db=args.update)
     elif args.command == 'enrich':
-        tool.enrich_files(limit=args.limit, llm_endpoint=args.llm_endpoint, use_local=args.local)
+        tool.enrich_files(limit=args.limit, use_llm=args.use_llm, use_local=not args.network)
     elif args.command == 'classify':
         tool.classify_files(disk=args.disk, update_db=args.update, resume=not args.no_resume)
     elif args.command == 'analyze-folders':
         tool.analyze_folders(disk=args.disk, min_files=args.min_files)
+    elif args.command == 'search':
+        tool.search_content(query=args.query, limit=args.limit, search_type=args.type)
     elif args.command == 'review':
         tool.review_migration(category=args.category, show_build=args.show_build)
     elif args.command == 'report':

app/parsers/transcription_parser.py (new file)
@@ -0,0 +1,65 @@
+import os
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import Dict, Optional
+import logging
+
+logger = logging.getLogger(__name__)
+
+class TranscriptionParser:
+    def __init__(self, model: str = 'base'):
+        self.model = model
+        self.whisper_available = self._check_whisper()
+
+    def _check_whisper(self) -> bool:
+        try:
+            import whisper
+            return True
+        except ImportError:
+            logger.warning('Whisper not installed. Install with: pip install openai-whisper')
+            return False
+
+    def parse(self, file_path: Path) -> Dict:
+        if not self.whisper_available:
+            return {'success': False, 'error': 'Whisper not available', 'text': ''}
+
+        if not self._is_supported(file_path):
+            return {'success': False, 'error': 'Unsupported file type', 'text': ''}
+
+        try:
+            import whisper
+            logger.info(f'Transcribing {file_path} with Whisper model={self.model}')
+
+            model = whisper.load_model(self.model)
+            result = model.transcribe(str(file_path))
+
+            return {
+                'success': True,
+                'text': result['text'],
+                'segments': result.get('segments', []),
+                'language': result.get('language', 'unknown')
+            }
+        except Exception as e:
+            logger.error(f'Transcription failed for {file_path}: {e}')
+            return {'success': False, 'error': str(e), 'text': ''}
+
+    def _is_supported(self, file_path: Path) -> bool:
+        supported = {'.mp3', '.mp4', '.wav', '.m4a', '.flac', '.ogg', '.avi', '.mkv', '.webm'}
+        return file_path.suffix.lower() in supported
+
+    def parse_with_timestamps(self, file_path: Path) -> Dict:
+        result = self.parse(file_path)
+        if not result['success']:
+            return result
+
+        segments = result.get('segments', [])
+        timestamped_text = []
+        for seg in segments:
+            start = seg.get('start', 0)
+            end = seg.get('end', 0)
+            text = seg.get('text', '').strip()
+            timestamped_text.append(f'[{start:.2f}s - {end:.2f}s] {text}')
+
+        result['timestamped_text'] = '\n'.join(timestamped_text)
+        return result
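
Not part of the diff: a usage sketch for the new parser. The module path parsers.transcription_parser mirrors how app/main.py imports the enrichment modules and is an assumption; the media path is a placeholder, and openai-whisper (plus ffmpeg on PATH) must be installed for a real transcription.

from pathlib import Path
from parsers.transcription_parser import TranscriptionParser

parser = TranscriptionParser(model='base')   # larger Whisper models trade speed for accuracy
result = parser.parse_with_timestamps(Path('meeting_recording.mp4'))

if result['success']:
    print(result['language'])
    print(result['timestamped_text'][:500])
else:
    print('Skipped:', result['error'])       # e.g. 'Whisper not available' or 'Unsupported file type'
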