This commit is contained in:
mike
2025-12-17 16:33:19 +01:00
commit ae818f0b4b
10 changed files with 2206 additions and 0 deletions

792
transcribe_speakers.py Executable file
View File

@@ -0,0 +1,792 @@
#!/usr/bin/env python3
"""
Real-time transcription of Windows speaker output using loopback capture.
Captures system audio and transcribes with Whisper in near real-time.
"""
import argparse
import json
import os
import queue
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

import numpy as np
import sounddevice as sd
# Whisper transcription (using faster-whisper for optimal performance)
from faster_whisper import WhisperModel

# Sentence extraction for stitching chunks
from sentence_extractor import SentenceExtractor, SentenceCleaner

# LLM integration (optional)
try:
    import ollama
    OLLAMA_AVAILABLE = True
except ImportError:
    OLLAMA_AVAILABLE = False
class WindowsLoopbackAudioCapture:
    """Capture Windows speaker output from an input-capable loopback device.

    On construction the class picks a device, opens a mono 16-bit
    ``sounddevice.InputStream`` on it and starts it. Every block delivered to
    the stream callback is copied onto an internal queue that
    :meth:`read_chunk` drains.

    Args:
        device_name: Optional case-insensitive substring of the device name.
            When omitted, or when nothing matches, a device is auto-detected.
        sample_rate: Capture sample rate in Hz.
        chunk_size: Number of frames per callback block.

    Raises:
        RuntimeError: If no usable device is found, or if the stream cannot
            be opened or started.
    """

    # Name fragments tried, in this order, when auto-detecting a device.
    _OUTPUT_NAME_HINTS = ('Speakers', 'Headphones', 'Output')
    _FALLBACK_NAME_HINT = 'Stereo Mix'

    def __init__(self, device_name=None, sample_rate=16000, chunk_size=2048):
        self.sample_rate = sample_rate
        self.chunk_size = chunk_size

        # Find loopback device
        self.device_info = self._find_loopback_device(device_name)
        if not self.device_info:
            raise RuntimeError(
                "No loopback device found.\n"
                "1. Ensure your speakers/headphones are connected\n"
                "2. Enable 'Stereo Mix' in Sound settings\n"
                "3. Or install VB-Cable virtual audio device"
            )
        print(f"✓ Using device: {self.device_info['name']} (index {self.device_info['index']})")

        # Queue filled by the stream callback and drained by read_chunk().
        self.audio_queue = queue.Queue()
        # Kept for callers; nothing in this class consults it.
        self.stop_event = threading.Event()

        # Start the stream
        stream = None
        try:
            stream = sd.InputStream(
                device=self.device_info['index'],
                channels=1,
                samplerate=sample_rate,
                blocksize=chunk_size,
                dtype='int16',
                latency='low',
                callback=self._audio_callback
            )
            stream.start()
        except Exception as e:
            # Do not leak a stream that was opened but could not be started.
            if stream is not None:
                stream.close()
            raise RuntimeError(f"Failed to start audio stream: {e}") from e
        self.stream = stream
        print("✓ Audio capture stream started")

    @classmethod
    def _select_device(cls, devices, device_name=None):
        """Pick a capture device from *devices* without touching the audio API.

        Only entries with ``max_input_channels > 0`` are considered at every
        stage, because the stream is opened as an input stream.

        Args:
            devices: Iterable of device dicts with at least the keys
                ``'name'`` and ``'max_input_channels'``.
            device_name: Optional case-insensitive name substring to prefer.

        Returns:
            The chosen device dict, or ``None`` when nothing qualifies.
        """
        candidates = [dev for dev in devices if dev['max_input_channels'] > 0]

        # 1. Explicit request: first candidate whose name contains the text.
        if device_name:
            wanted = device_name.lower()
            for dev in candidates:
                if wanted in dev['name'].lower():
                    return dev

        # 2. Auto-detect: names that look like a speaker/headphone endpoint.
        for dev in candidates:
            if any(hint in dev['name'] for hint in cls._OUTPUT_NAME_HINTS):
                return dev

        # 3. Fallback: Stereo Mix or similar.
        for dev in candidates:
            if cls._FALLBACK_NAME_HINT in dev['name']:
                return dev

        return None

    def _find_loopback_device(self, device_name):
        """Query the audio devices and return the one to capture from.

        Returns:
            A device dict from ``sd.query_devices()``, or ``None``.
        """
        chosen = self._select_device(sd.query_devices(), device_name)
        if (device_name and chosen is not None
                and device_name.lower() not in chosen['name'].lower()):
            # Make the fallback to auto-detection visible to the user.
            print(f"⚠ No input device matching '{device_name}'; "
                  f"auto-detected a device instead")
        return chosen

    def _audio_callback(self, indata, frames, time_info, status):
        """Stream callback: report status flags and enqueue a copy of the block."""
        if status:
            print(f"⚠ Audio status: {status}")
        # Copy so the queued data does not alias the callback's buffer.
        self.audio_queue.put(indata.copy())

    def read_chunk(self):
        """Return the next queued block as a flat array, or ``None``.

        Waits up to 50 ms for data before giving up.
        """
        try:
            return self.audio_queue.get(timeout=0.05).flatten()
        except queue.Empty:
            return None

    def close(self):
        """Stop and close the stream if one was successfully started."""
        if hasattr(self, 'stream'):
            self.stream.stop()
            self.stream.close()
class WhisperStreamTranscriber:
    """Buffer audio chunks and transcribe them with faster-whisper.

    Audio is accumulated as float32 samples in ``audio_buffer`` under a lock
    and handed to the model once enough of it has been collected.

    Args:
        model_name: Model size/name passed to ``WhisperModel``.
        language: Language code passed to every ``transcribe`` call.
        force_cpu: When true, skip GPU probing entirely and run on CPU.
        device_index: Preferred CUDA device index; replaced by 0 when it is
            outside the range CTranslate2 reports.
    """

    # Durations are computed assuming 16 kHz samples, which is the rate the
    # capture stream in this file is opened with.
    SAMPLE_RATE = 16000

    def __init__(self, model_name="base", language="en", force_cpu=False, device_index=0):
        print(f"Loading Whisper model '{model_name}'...")

        device, compute_type = "cpu", "int8"
        if force_cpu:
            # No probing at all, so neither torch nor CUDA libraries are needed.
            print("Using device: cpu")
        else:
            device, compute_type, device_index = self._select_compute_device(device_index)

        # FASTER-WHISPER (recommended):
        model_kwargs = {
            "device": device,
            "compute_type": compute_type
        }
        if device == "cuda":
            model_kwargs["device_index"] = device_index
        else:
            model_kwargs["cpu_threads"] = 4

        self.model = WhisperModel(model_name, **model_kwargs)
        self.language = language
        self.audio_buffer = np.array([], dtype=np.float32)
        self.lock = threading.Lock()

    @staticmethod
    def _select_compute_device(device_index):
        """Probe for a usable CUDA device.

        torch is optional: when it is installed its CUDA check must pass and
        it supplies the device name for the log line; when it is missing, the
        CTranslate2 device count alone decides.

        Returns:
            ``(device, compute_type, device_index)``: ``("cuda", "float16",
            idx)`` when a GPU is usable, otherwise ``("cpu", "int8", idx)``.
        """
        cpu_choice = ("cpu", "int8", device_index)

        try:
            import torch
        except ImportError:
            torch = None

        if torch is not None and not torch.cuda.is_available():
            print("Using device: cpu")
            return cpu_choice

        try:
            # Test if CTranslate2 can actually use CUDA
            import ctranslate2
            cuda_count = ctranslate2.get_cuda_device_count()
            if cuda_count <= 0:
                if torch is not None:
                    print("CUDA available in PyTorch but not in CTranslate2. Using CPU.")
                else:
                    print("Using device: cpu")
                return cpu_choice

            # Validate device index (negative values are rejected as well).
            if not 0 <= device_index < cuda_count:
                print(f"⚠️  GPU index {device_index} not available. Found {cuda_count} GPU(s). Using GPU 0.")
                device_index = 0

            # CTranslate2 uses "cuda" + device_index parameter, not "cuda:N"
            gpu_name = (torch.cuda.get_device_name(device_index)
                        if torch is not None else "name unavailable")
            print(f"Using device: cuda:{device_index} ({gpu_name})")
            return ("cuda", "float16", device_index)
        except Exception as e:
            print(f"CUDA libraries not found ({e}). Using CPU.")
            return cpu_choice

    def add_audio(self, audio_chunk):
        """Append a chunk to the buffer, scaling by 1/32768 to float32.

        The scale factor maps int16 PCM, the capture dtype used in this file,
        onto the range [-1, 1).
        """
        with self.lock:
            audio_float = audio_chunk.astype(np.float32) / 32768.0
            self.audio_buffer = np.concatenate([self.audio_buffer, audio_float])

    def transcribe_chunk(self, min_duration=5.0, fast_mode=False):
        """Transcribe and clear the buffered audio if there is enough of it.

        Args:
            min_duration: Minimum buffered duration in seconds. Below it the
                buffer is left untouched.
            fast_mode: Use greedy decoding without VAD instead of the
                balanced settings (beam size 3 with VAD filtering).

        Returns:
            The stripped transcript, or ``None`` when the buffer is empty or
            too short, the result is empty, or transcription raised (the
            error is printed, and the audio already taken from the buffer is
            discarded).
        """
        with self.lock:
            sample_count = len(self.audio_buffer)
            # An empty buffer is never sent to the model, even with
            # min_duration=0 (the value used by the shutdown path).
            if sample_count == 0 or sample_count / self.SAMPLE_RATE < min_duration:
                return None
            audio_to_process = self.audio_buffer.copy()
            self.audio_buffer = np.array([], dtype=np.float32)

        # Optimize parameters for speed vs accuracy
        if fast_mode:
            # Fast mode: greedy decoding (fastest), no VAD
            options = dict(
                beam_size=1,
                best_of=1,
                temperature=0.0,
                vad_filter=False,
                word_timestamps=False,
            )
        else:
            # Balanced mode: moderate beam size with VAD
            options = dict(
                beam_size=3,
                vad_filter=True,
                vad_parameters=dict(min_silence_duration_ms=500),
                word_timestamps=False,
            )

        try:
            segments, _ = self.model.transcribe(
                audio_to_process,
                language=self.language,
                **options
            )
            # Joining stays inside the try so errors raised while iterating
            # the segments are reported the same way.
            text = " ".join([segment.text for segment in segments]).strip()
            return text if text else None
        except Exception as e:
            print(f"❌ Transcription error: {e}")
            return None
class LocalLLMAnalyzer:
    """Local LLM for fact-checking and question generation using Ollama.

    Args:
        model: Name of the Ollama model passed to ``ollama.generate``.
        debug: When true, raw model responses and errors are printed.

    Raises:
        RuntimeError: If the ``ollama`` package is not installed or
            ``ollama.list()`` fails.
    """

    # Patterns are compiled once; they use the module-level ``re`` import.
    _VERDICT_RE = re.compile(r'VERDICT:\s*([^\r\n]*)', re.IGNORECASE)
    _KNOWN_VERDICT_RE = re.compile(r'(not[\s_\-]*factual|factual|dubious)\b')
    _CONFIDENCE_RE = re.compile(r'CONFIDENCE:\s*\[?\s*([\d.]+)\s*(%)?', re.IGNORECASE)
    _EXPLANATION_RE = re.compile(r'EXPLANATION:\s*(.+?)(?:\n\n|\Z)', re.IGNORECASE | re.DOTALL)
    _LABELLED_LINE_RE = re.compile(r'^Q\d+:', re.IGNORECASE)
    _LIST_PREFIX_RE = re.compile(r'^(?:\d+[.)]|[-*•])\s*')

    # Used to pad a parsed reply that yields fewer than three questions.
    _DEFAULT_QUESTIONS = (
        "What are the key points here?",
        "What evidence supports this?",
        "What are the implications?",
    )

    def __init__(self, model="llama3.2", debug=False):
        if not OLLAMA_AVAILABLE:
            raise RuntimeError(
                "Ollama package not installed.\n"
                "Install with: pip install ollama"
            )
        self.model = model
        self.debug = debug
        self._test_connection()

    def _test_connection(self):
        """Test connection to Ollama service.

        Raises:
            RuntimeError: If ``ollama.list()`` raises; the original error is
                chained as the cause.
        """
        try:
            ollama.list()
            print(f"✓ Ollama connected using model: {self.model}")
        except Exception as e:
            raise RuntimeError(
                f"Cannot connect to Ollama. Ensure it's installed and running.\n"
                f"Error: {e}\n"
                f"Install from: https://ollama.ai\n"
                f"Then run: ollama pull {self.model}"
            ) from e

    def _extract_json(self, text):
        """Extract JSON from text that might contain markdown or other formatting.

        Returns the object inside a fenced code block if there is one, else
        the span from the first ``{`` to the last ``}``, else *text* unchanged.
        """
        # Try to find JSON block in markdown code fence
        json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
        if json_match:
            return json_match.group(1)
        # Try to find raw JSON object
        json_match = re.search(r'\{.*\}', text, re.DOTALL)
        if json_match:
            return json_match.group(0)
        return text

    @classmethod
    def _parse_fact_check(cls, response_text):
        """Turn a VERDICT/CONFIDENCE/EXPLANATION reply into a result dict.

        Missing fields fall back to ``"dubious"``, ``0.5`` and
        ``"No explanation provided"``.
        """
        verdict = "dubious"
        confidence = 0.5
        explanation = "No explanation provided"

        # Extract VERDICT, ignoring leading decoration such as "[" or "**".
        verdict_match = cls._VERDICT_RE.search(response_text)
        if verdict_match:
            candidate = re.sub(r'^\W+', '', verdict_match.group(1).lower())
            known = cls._KNOWN_VERDICT_RE.match(candidate)
            if known:
                # "not factual" / "not-factual" collapse to the canonical key.
                verdict = "not_factual" if known.group(1).startswith("not") else known.group(1)
            else:
                # Unrecognised verdicts are passed through as their first word.
                first_word = re.match(r'\w+', candidate)
                if first_word:
                    verdict = first_word.group(0)

        # Extract CONFIDENCE
        conf_match = cls._CONFIDENCE_RE.search(response_text)
        if conf_match:
            try:
                value = float(conf_match.group(1))
            except ValueError:
                # Text such as "0.8." is not a number; keep the default.
                pass
            else:
                if conf_match.group(2):
                    # A percentage such as "85%" means 0.85, not 85.
                    value /= 100.0
                confidence = max(0.0, min(1.0, value))  # Clamp to 0-1

        # Extract EXPLANATION
        expl_match = cls._EXPLANATION_RE.search(response_text)
        if expl_match:
            explanation = expl_match.group(1).strip()
            # Clean up incomplete sentences
            if explanation and explanation[-1] not in '.!?':
                # Try to find last complete sentence
                last_end = max(explanation.rfind(mark) for mark in '.!?')
                if last_end > 20:  # Keep at least some text
                    explanation = explanation[:last_end + 1]

        return {
            "verdict": verdict,
            "confidence": confidence,
            "explanation": explanation[:250] if explanation else "Analysis completed",
            "sources": [],
            "corrections": ""
        }

    @classmethod
    def _parse_questions(cls, response_text):
        """Extract exactly three questions from a model reply.

        ``Q1:``..``Q3:`` labelled lines are preferred. If fewer than three
        are found, other non-empty lines are used (list prefixes stripped,
        header lines ending in ":" and duplicates skipped), and any remaining
        slots are filled from the default questions.
        """
        questions = []
        for i in range(1, 4):
            q_match = re.search(rf'Q{i}:\s*(.+?)(?:\n|$)', response_text, re.IGNORECASE)
            if not q_match:
                continue
            question = q_match.group(1).strip()
            # Clean up incomplete questions
            if question and question[-1] != '?':
                # Try to find last complete question
                last_q = question.rfind('?')
                if last_q > 10:
                    question = question[:last_q + 1]
                else:
                    question = question + "?"
            questions.append(question)

        # If we couldn't parse three, fall back to unlabelled lines.
        if len(questions) < 3:
            for raw_line in response_text.split('\n'):
                if len(questions) >= 3:
                    break
                line = raw_line.strip()
                # Labelled lines were handled above (or are bare markers).
                if not line or cls._LABELLED_LINE_RE.match(line):
                    continue
                line = cls._LIST_PREFIX_RE.sub('', line).strip()
                # Skip headers such as "Here are three questions:".
                if not line or line.endswith(':'):
                    continue
                if not line.endswith('?'):
                    line = line + "?"
                if line not in questions:
                    questions.append(line)

        # Ensure we have exactly 3 questions
        while len(questions) < 3:
            questions.append(cls._DEFAULT_QUESTIONS[len(questions)])
        return questions[:3]

    def fact_check(self, text, context=""):
        """Analyze text for factual accuracy.

        Args:
            text: Statement to analyse.
            context: Accepted for interface compatibility; it is not
                included in the prompt.

        Returns:
            Dict with keys ``verdict``, ``confidence``, ``explanation``,
            ``sources`` and ``corrections``. Any exception yields a dict
            whose verdict is ``"error"`` instead of propagating.
        """
        # Try simple structured format first
        prompt = f"""Analyze this for accuracy. Reply in this exact format:
VERDICT: [factual/dubious/not_factual]
CONFIDENCE: [0.0-1.0]
EXPLANATION: [one sentence]
Statement: "{text}"
"""
        try:
            response = ollama.generate(
                model=self.model,
                prompt=prompt,
                options={"temperature": 0.1, "num_predict": 250}
            )
            response_text = response['response'].strip()
            if self.debug:
                print(f"\n[DEBUG] Fact-check response:\n{response_text}\n")
            return self._parse_fact_check(response_text)
        except Exception as e:
            if self.debug:
                print(f"[DEBUG] Fact-check error: {e}")
            return {
                "verdict": "error",
                "confidence": 0.0,
                "explanation": f"Analysis failed: {str(e)}",
                "sources": [],
                "corrections": ""
            }

    def generate_augmenting_questions(self, text, context=""):
        """Generate three questions based on the text.

        Args:
            text: Statement to ask questions about.
            context: Accepted for interface compatibility; it is not
                included in the prompt.

        Returns:
            Dict with a three-item ``questions`` list and an empty
            ``topics`` list. Any exception yields three stock questions.
        """
        prompt = f"""Generate 3 questions about this. Reply in this exact format:
Q1: [question]
Q2: [question]
Q3: [question]
Statement: "{text}"
"""
        try:
            response = ollama.generate(
                model=self.model,
                prompt=prompt,
                options={"temperature": 0.7, "num_predict": 250}
            )
            response_text = response['response'].strip()
            if self.debug:
                print(f"\n[DEBUG] Questions response:\n{response_text}\n")
            return {
                "questions": self._parse_questions(response_text),
                "topics": []
            }
        except Exception as e:
            if self.debug:
                print(f"[DEBUG] Questions error: {e}")
            return {
                "questions": [
                    "What are the key points?",
                    "What supports this claim?",
                    "What are the implications?"
                ],
                "topics": []
            }
def list_audio_devices():
    """Print every device that reports at least one input channel.

    For each such device the index, name, input channel count and default
    sample rate are printed; a blank line closes the listing.
    """
    print("\nAvailable audio capture devices:")
    for index, info in enumerate(sd.query_devices()):
        # Output-only devices cannot be captured from, so skip them.
        if info['max_input_channels'] <= 0:
            continue
        print(f" [{index}] {info['name']}")
        print(f" Channels: {info['max_input_channels']} | Sample Rate: {info['default_samplerate']}")
    print()
def save_transcript(text, timestamp, filename):
    """Append one ``[timestamp] text`` line to *filename*.

    The parent directory is created first if it does not exist; a bare file
    name is written relative to the current directory.
    """
    parent_dir = os.path.dirname(filename) or '.'
    os.makedirs(parent_dir, exist_ok=True)
    entry = f"[{timestamp}] {text}\n"
    with open(filename, "a", encoding="utf-8") as handle:
        handle.write(entry)
def save_enriched_transcript(data, filename):
    """Append a transcript entry together with its LLM analysis to *filename*.

    The entry starts with a 70-character rule and the ``[timestamp] text``
    line. A fact-check section is written when *data* has a ``'fact_check'``
    key, and a numbered question list when ``data['questions']['questions']``
    is non-empty. The parent directory is created if it does not exist.
    """
    parent_dir = os.path.dirname(filename) or '.'
    os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "a", encoding="utf-8") as out:
        emit = out.write
        emit(f"\n{'='*70}\n")
        emit(f"[{data['timestamp']}] {data['text']}\n\n")

        if 'fact_check' in data:
            check = data['fact_check']
            verdict = check.get('verdict', 'N/A').upper()
            confidence = check.get('confidence', 0)
            emit(f"📊 Fact Check: {verdict} (confidence: {confidence:.2f})\n")
            emit(f"💡 {check.get('explanation', 'N/A')}\n")
            # The correction line is written only when it carries text.
            if check.get('corrections'):
                emit(f"✏️  Correction: {check['corrections']}\n")
            emit("\n")

        has_questions = 'questions' in data and data['questions'].get('questions')
        if has_questions:
            emit("❓ Questions:\n")
            for number, question in enumerate(data['questions']['questions'], 1):
                emit(f"{number}. {question}\n")
            emit("\n")
def display_enriched_output(text, timestamp, fact_check=None, questions=None):
    """Print a transcript line followed by its optional LLM analysis.

    Args:
        text: Transcribed text.
        timestamp: Label printed in brackets before the text.
        fact_check: Optional dict; ``verdict``, ``confidence``,
            ``explanation`` and ``corrections`` are read with defaults.
        questions: Optional dict whose ``'questions'`` list is printed as a
            numbered list.
    """
    print(f"\n[{timestamp}] {text}")

    if fact_check:
        # Marker per verdict; unknown verdicts get no marker.
        markers = {
            'factual': '',
            'dubious': '⚠️',
            'not_factual': '',
            'error': '⚠️'
        }
        marker = markers.get(fact_check.get('verdict', 'error'), '')
        verdict_label = fact_check.get('verdict', 'N/A').upper()
        confidence = fact_check.get('confidence', 0)
        print(f"\n{marker} Fact Check: {verdict_label} (confidence: {confidence:.2f})")
        print(f"💡 {fact_check.get('explanation', 'N/A')}")
        correction = fact_check.get('corrections')
        if correction:
            print(f"✏️  Correction: {correction}")

    question_list = questions.get('questions') if questions else None
    if question_list:
        print(f"\n❓ Questions:")
        for number, question in enumerate(question_list, 1):
            print(f" {number}. {question}")
def _build_arg_parser():
    """Create the command-line parser for the transcription script."""
    parser = argparse.ArgumentParser(
        description="Real-time transcription of Windows speaker output",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python transcribe_speakers.py
python transcribe_speakers.py --model small --language es --interval 5
python transcribe_speakers.py --device "Speakers" --output "meeting.txt"
python transcribe_speakers.py --model medium --interval 10 --output transcripts/live.txt
"""
    )
    parser.add_argument("--model", default="base",
                        choices=["tiny", "base", "small", "medium", "large"],
                        help="Whisper model size (default: base)")
    parser.add_argument("--language", default="en",
                        help="Language code (default: en)")
    parser.add_argument("--device", metavar="NAME",
                        help="Audio device name (partial match). If not specified, auto-detects")
    parser.add_argument("--interval", type=float, default=8.0,
                        help="Processing interval in seconds (default: 8.0)")
    parser.add_argument("--min-duration", type=float, default=3.0,
                        help="Minimum audio duration before transcription (default: 3.0)")
    parser.add_argument("--fast-mode", action="store_true",
                        help="Enable fast mode (lower accuracy, faster transcription)")
    parser.add_argument("--output", "-o", metavar="FILE",
                        help="Save transcript to file (e.g., transcript.txt)")
    parser.add_argument("--list-devices", action="store_true",
                        help="List all available audio devices and exit")
    parser.add_argument("--force-cpu", action="store_true",
                        help="Force CPU processing (disable GPU acceleration)")
    parser.add_argument("--gpu-index", type=int, default=0,
                        help="GPU device index to use (default: 0)")
    parser.add_argument("--enable-llm", action="store_true",
                        help="Enable LLM analysis (fact-checking and questions)")
    parser.add_argument("--llm-model", default="gpt-oss:20b",
                        help="Ollama model to use for LLM analysis (default: gpt-oss:20b)")
    parser.add_argument("--llm-debug", action="store_true",
                        help="Show LLM raw responses for debugging")
    parser.add_argument("--sentence-mode", action="store_true",
                        help="Extract complete sentences by stitching chunks together")
    return parser


def _run_llm_analysis(llm_analyzer, txt, ctx, ts, seg_num):
    """Run fact-checking and question generation for one piece of text."""
    fc = llm_analyzer.fact_check(txt, ctx)
    qs = llm_analyzer.generate_augmenting_questions(txt, ctx)
    return {
        'timestamp': ts,
        'text': txt,
        'segment_count': seg_num,
        'fact_check': fc,
        'questions': qs
    }


def _publish_llm_result(result, output_path):
    """Display one analysis result and append it to *output_path* if given."""
    display_enriched_output(
        result['text'],
        result['timestamp'],
        result['fact_check'],
        result['questions']
    )
    if output_path:
        save_enriched_transcript(result, output_path)


def _publish_finished_llm_tasks(pending, output_path):
    """Publish every finished task in *pending* and return the unfinished ones.

    *pending* is a list of ``(segment_number, future)`` pairs. Failed tasks
    are reported and dropped; the check never blocks.
    """
    still_running = []
    for seg_num, future in pending:
        if not future.done():
            still_running.append((seg_num, future))
            continue
        try:
            _publish_llm_result(future.result(), output_path)
        except Exception as e:
            print(f"⚠️  LLM processing error for segment {seg_num}: {e}")
    return still_running


def _clean_sentences(sentences, sentence_cleaner):
    """Yield the cleaned, non-empty form of each sentence."""
    for sentence in sentences:
        cleaned = sentence_cleaner.clean(sentence) if sentence_cleaner else sentence
        if cleaned:
            yield cleaned


def _finalize_text(text, timestamp, context, seg_num, llm_analyzer, output_path):
    """Analyse (synchronously) or plainly save one piece of text at shutdown."""
    if llm_analyzer:
        result = _run_llm_analysis(llm_analyzer, text, context, timestamp, seg_num)
        _publish_llm_result(result, output_path)
    elif output_path:
        save_transcript(text, timestamp, output_path)


def _run_session(args, capturer):
    """Load the models, run the capture/transcribe loop and handle Ctrl+C.

    The caller owns *capturer* and is responsible for closing it.
    """
    # Initialize transcriber
    try:
        transcriber = WhisperStreamTranscriber(
            model_name=args.model,
            language=args.language,
            force_cpu=args.force_cpu,
            device_index=args.gpu_index
        )
    except Exception as e:
        print(f"\n❌ Model Error: {e}")
        print("Make sure you installed Whisper correctly")
        return

    # Initialize LLM analyzer (optional)
    llm_analyzer = None
    if args.enable_llm:
        try:
            llm_analyzer = LocalLLMAnalyzer(model=args.llm_model, debug=args.llm_debug)
        except RuntimeError as e:
            print(f"\n❌ LLM Error: {e}")
            print("Continuing without LLM analysis...")
            llm_analyzer = None

    # Initialize sentence extractor (optional)
    sentence_extractor = None
    sentence_cleaner = None
    if args.sentence_mode:
        sentence_extractor = SentenceExtractor(max_buffer_words=150)
        sentence_cleaner = SentenceCleaner()
        print("✓ Sentence extraction initialized")

    print(f"\n✅ Started transcription. Press Ctrl+C to stop.\n{'=' * 50}")
    last_process_time = time.time()
    total_duration = 0
    segment_count = 0

    # Thread pool for concurrent LLM processing
    llm_executor = ThreadPoolExecutor(max_workers=2) if llm_analyzer else None
    # (segment number, future) pairs. A list rather than a dict keyed by
    # segment number, because one segment can yield several sentences and
    # each of them needs its own entry.
    pending_llm_tasks = []

    def submit_llm(txt, ctx, ts):
        future = llm_executor.submit(
            _run_llm_analysis, llm_analyzer, txt, ctx, ts, segment_count)
        pending_llm_tasks.append((segment_count, future))

    try:
        try:
            while True:
                # Collect audio (read_chunk waits briefly, so this is not a busy loop)
                chunk = capturer.read_chunk()
                if chunk is not None:
                    transcriber.add_audio(chunk)
                    total_duration += len(chunk) / 16000

                # Process at intervals
                current_time = time.time()
                if current_time - last_process_time >= args.interval:
                    text = transcriber.transcribe_chunk(
                        min_duration=args.min_duration,
                        fast_mode=args.fast_mode
                    )
                    if text:
                        segment_count += 1
                        timestamp = datetime.now().strftime("%H:%M:%S")
                        if sentence_extractor:
                            # Sentence mode: emit only completed sentences.
                            sentences = sentence_extractor.add_chunk(text)
                            for cleaned in _clean_sentences(sentences, sentence_cleaner):
                                print(f"[{timestamp}] 📝 {cleaned}")
                                if llm_analyzer:
                                    submit_llm(cleaned, f"Sentence from segment {segment_count}", timestamp)
                                elif args.output:
                                    save_transcript(cleaned, timestamp, args.output)
                        else:
                            # Standard mode: display the chunk immediately
                            # (don't wait for the LLM).
                            print(f"[{timestamp}] {text}")
                            if llm_analyzer:
                                submit_llm(text, f"Segment {segment_count}", timestamp)
                            elif args.output:
                                save_transcript(text, timestamp, args.output)
                    last_process_time = current_time

                # Check for completed LLM tasks (non-blocking)
                if pending_llm_tasks:
                    pending_llm_tasks[:] = _publish_finished_llm_tasks(
                        pending_llm_tasks, args.output)
        except KeyboardInterrupt:
            print(f"\n{'=' * 50}\n🛑 Stopping transcription...")

        # Only reached after Ctrl+C; any other exception propagates past here.

        # Wait for pending LLM tasks to complete
        if pending_llm_tasks:
            print(f"\n⏳ Waiting for {len(pending_llm_tasks)} pending LLM tasks to complete...")
            for seg_num, future in pending_llm_tasks:
                try:
                    _publish_llm_result(future.result(timeout=30), args.output)
                except Exception as e:
                    print(f"⚠️  LLM task {seg_num} failed: {e}")

        # Transcribe the remaining audio BEFORE flushing the sentence buffer,
        # so the last words are stitched onto the sentence they belong to.
        print("\nProcessing remaining audio...")
        final_text = transcriber.transcribe_chunk(min_duration=0)

        if sentence_extractor:
            print("\n📝 Flushing sentence buffer...")
            final_sentences = []
            if final_text:
                final_sentences.extend(sentence_extractor.add_chunk(final_text))
            final_sentences.extend(sentence_extractor.flush())
            for cleaned in _clean_sentences(final_sentences, sentence_cleaner):
                timestamp = datetime.now().strftime("%H:%M:%S")
                print(f"[{timestamp}] 📝 {cleaned}")
                _finalize_text(cleaned, timestamp, "Final sentence",
                               segment_count, llm_analyzer, args.output)
        elif final_text:
            timestamp = datetime.now().strftime("%H:%M:%S")
            print(f"[{timestamp}] {final_text}")
            # Synchronous analysis since we're shutting down.
            _finalize_text(final_text, timestamp, "Final segment",
                           segment_count, llm_analyzer, args.output)

        # Summary
        print(f"\n✅ Complete! Processed {total_duration:.1f}s of audio")
        print(f" Generated {segment_count} transcript segments")
        if args.output and os.path.exists(args.output):
            abs_path = os.path.abspath(args.output)
            print(f"💾 Transcript saved to: {abs_path}")
    finally:
        # Release the worker threads on every exit path, not only Ctrl+C.
        if llm_executor:
            llm_executor.shutdown(wait=True)


def main():
    """Parse the command line and run real-time transcription until Ctrl+C."""
    args = _build_arg_parser().parse_args()

    if args.list_devices:
        list_audio_devices()
        return

    print("=== Windows Real-Time Audio Transcription ===")
    print(f"Model: {args.model} | Language: {args.language} | Interval: {args.interval}s")
    if args.output:
        print(f"Output: {args.output}")
    if args.enable_llm:
        print(f"LLM Analysis: Enabled ({args.llm_model})")
    if args.sentence_mode:
        print(f"Sentence Mode: Enabled (stitching chunks into complete sentences)")

    # Initialize audio capture
    try:
        capturer = WindowsLoopbackAudioCapture(
            device_name=args.device,
            sample_rate=16000,
            chunk_size=2048
        )
    except RuntimeError as e:
        print(f"\n❌ Audio Error: {e}")
        print("\nTo fix this:")
        print("1. Right-click speaker icon → Sounds → Recording tab")
        print("2. Right-click in empty area → Show Disabled Devices")
        print("3. Enable 'Stereo Mix' → Set as Default Device")
        print("\nAlternative: Install VB-Cable (free) from vb-audio.com")
        print(" Then use: --device 'CABLE Output'")
        list_audio_devices()
        return

    try:
        _run_session(args, capturer)
    finally:
        # Close the stream exactly once, whether the session ended normally,
        # failed to load a model, or raised.
        capturer.close()


if __name__ == "__main__":
    main()