Files
verbatim-dicta/transcribe_No_llm.py
2025-12-17 13:07:01 +01:00

596 lines
21 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Real-time transcription of Windows speaker output using loopback capture.
Captures system audio and transcribes with Whisper in near real-time.
"""
import sounddevice as sd
import numpy as np
import threading
import queue
import time
import os
import argparse
import json
from datetime import datetime
# Choose your Whisper backend here:
# For faster-whisper (recommended):
from faster_whisper import WhisperModel
# LLM integration
try:
import ollama
OLLAMA_AVAILABLE = True
except ImportError:
OLLAMA_AVAILABLE = False
# # For regular whisper (comment out the line above and uncomment these):
# import whisper
class WindowsLoopbackAudioCapture:
    """Capture system audio from a loopback-capable input device.

    A ``sounddevice.InputStream`` is opened on the selected device; its
    callback pushes a copy of every captured block onto ``audio_queue`` and
    consumers pull blocks with :meth:`read_chunk`.

    Args:
        device_name: Optional case-insensitive substring of the device name.
            When omitted, or when nothing matches, a device is auto-detected.
        sample_rate: Sample rate (Hz) requested from the stream.
        chunk_size: Block size (frames) requested from the stream.

    Raises:
        RuntimeError: If no suitable device is found or the stream cannot be
            created/started.
    """

    def __init__(self, device_name=None, sample_rate=16000, chunk_size=2048):
        self.sample_rate = sample_rate
        self.chunk_size = chunk_size

        # Find loopback device
        self.device_info = self._find_loopback_device(device_name)
        if not self.device_info:
            raise RuntimeError(
                "No loopback device found.\n"
                "1. Ensure your speakers/headphones are connected\n"
                "2. Enable 'Stereo Mix' in Sound settings\n"
                "3. Or install VB-Cable virtual audio device"
            )
        print(f"✓ Using device: {self.device_info['name']} (index {self.device_info['index']})")

        # Queue for audio data
        self.audio_queue = queue.Queue()
        # Not used inside this class; kept for callers that may rely on it.
        self.stop_event = threading.Event()

        # Start the stream
        stream = None
        try:
            stream = sd.InputStream(
                device=self.device_info['index'],
                channels=1,
                samplerate=sample_rate,
                blocksize=chunk_size,
                dtype='int16',
                latency='low',
                callback=self._audio_callback
            )
            stream.start()
        except Exception as e:
            # A stream that was created but failed to start must still be
            # released, otherwise the device handle leaks.
            if stream is not None:
                try:
                    stream.close()
                except Exception:
                    # Best-effort cleanup; the start-up failure is the error
                    # worth reporting.
                    pass
            raise RuntimeError(f"Failed to start audio stream: {e}") from e
        self.stream = stream
        print("✓ Audio capture stream started")

    def _find_loopback_device(self, device_name, devices=None):
        """Pick the capture device.

        Search order: explicit name match, then a name containing
        'Speakers'/'Headphones'/'Output', then a name containing 'Stereo Mix'.
        Only devices with at least one input channel are considered.

        Args:
            device_name: Optional case-insensitive substring to look for.
            devices: Optional iterable of device dicts (``name`` and
                ``max_input_channels`` keys); defaults to
                ``sd.query_devices()``.

        Returns:
            The first matching device dict, or ``None`` if nothing matches.
        """
        if devices is None:
            devices = sd.query_devices()
        # A device without input channels cannot be opened by InputStream.
        candidates = [dev for dev in devices if dev['max_input_channels'] > 0]

        # If device name specified, find a (partial) match
        if device_name:
            wanted = device_name.lower()
            for dev in candidates:
                if wanted in dev['name'].lower():
                    return dev
            print(f"⚠ No input device matching '{device_name}'; falling back to auto-detection")

        # Auto-detect: look for speakers/headphones exposed as inputs
        for dev in candidates:
            if any(tag in dev['name'] for tag in ('Speakers', 'Headphones', 'Output')):
                return dev

        # Fallback: Stereo Mix or similar
        for dev in candidates:
            if 'Stereo Mix' in dev['name']:
                return dev
        return None

    def _audio_callback(self, indata, frames, time_info, status):
        """Stream callback: report any status flag and enqueue a copy of the block."""
        if status:
            print(f"⚠ Audio status: {status}")
        # Copy before queueing so the consumer never shares the callback's buffer.
        self.audio_queue.put(indata.copy())

    def read_chunk(self):
        """Return the next queued block flattened to 1-D, or ``None`` after a 50 ms wait."""
        try:
            return self.audio_queue.get(timeout=0.05).flatten()
        except queue.Empty:
            return None

    def close(self):
        """Stop and close the stream; safe to call repeatedly or before a stream exists."""
        stream = getattr(self, 'stream', None)
        if stream is None:
            return
        self.stream = None
        try:
            stream.stop()
        finally:
            stream.close()
class WhisperStreamTranscriber:
    """Buffer incoming audio and transcribe it in batches with faster-whisper.

    Args:
        model_name: Model identifier passed to ``WhisperModel``.
        language: Language code passed to every ``transcribe`` call.
        force_cpu: When true, skip GPU detection and run on the CPU.
    """

    # Sample rate (Hz) used to convert buffered sample counts into seconds.
    SAMPLE_RATE = 16000

    def __init__(self, model_name="base", language="en", force_cpu=False):
        print(f"Loading Whisper model '{model_name}'...")
        device, compute_type = self._select_device(force_cpu)

        # FASTER-WHISPER (recommended):
        model_kwargs = {
            "device": device,
            "compute_type": compute_type
        }
        # Key the thread setting on the device actually selected, so a CUDA
        # machine that fell back to CPU still gets it.
        if device == "cpu":
            model_kwargs["cpu_threads"] = 4
        self.model = WhisperModel(model_name, **model_kwargs)
        self.language = language
        self.audio_buffer = np.array([], dtype=np.float32)
        self.lock = threading.Lock()
        # # REGULAR WHISPER:
        # self.model = whisper.load_model(model_name)
        # self.language = language
        # self.audio_buffer = np.array([], dtype=np.float32)
        # self.lock = threading.Lock()

    @staticmethod
    def _select_device(force_cpu):
        """Return ``(device, compute_type)``: CUDA/float16 when usable, else CPU/int8.

        CUDA is selected only when PyTorch reports it available and
        CTranslate2 reports at least one CUDA device. Any failure along the
        way (missing torch, missing CUDA libraries) falls back to the CPU.
        """
        cpu_choice = ("cpu", "int8")
        if force_cpu:
            print("Using device: cpu")
            return cpu_choice
        try:
            import torch
            has_cuda = torch.cuda.is_available()
        except ImportError:
            # torch is only used for detection; without it, stay on the CPU.
            has_cuda = False
        if not has_cuda:
            print("Using device: cpu")
            return cpu_choice
        try:
            # Test if CTranslate2 can actually use CUDA
            import ctranslate2
            if ctranslate2.get_cuda_device_count() > 0:
                # Return only after the name lookup succeeded, so an error
                # here cannot leave CUDA selected while reporting CPU.
                print(f"Using device: cuda ({torch.cuda.get_device_name(0)})")
                return "cuda", "float16"
            print("CUDA available in PyTorch but not in CTranslate2. Using CPU.")
        except Exception as e:
            print(f"CUDA libraries not found ({e}). Using CPU.")
        return cpu_choice

    def add_audio(self, audio_chunk):
        """Append a chunk to the buffer, scaled by 1/32768 to float32.

        The scale factor assumes 16-bit PCM samples.
        """
        with self.lock:
            audio_float = audio_chunk.astype(np.float32) / 32768.0
            self.audio_buffer = np.concatenate([self.audio_buffer, audio_float])

    def transcribe_chunk(self, min_duration=5.0):
        """Transcribe and clear the buffered audio.

        Args:
            min_duration: Minimum buffered duration in seconds required
                before a transcription is attempted.

        Returns:
            The stripped transcript, or ``None`` when the buffer is empty or
            too short, the result is blank, or transcription raised (the
            error is printed and the audio is dropped).
        """
        with self.lock:
            sample_count = len(self.audio_buffer)
            # An empty buffer is never worth a model call, even with
            # min_duration=0 (the final flush).
            if sample_count == 0 or sample_count / self.SAMPLE_RATE < min_duration:
                return None
            # add_audio rebinds the attribute instead of mutating in place,
            # so taking the reference is enough (no copy needed).
            audio_to_process = self.audio_buffer
            self.audio_buffer = np.array([], dtype=np.float32)

        # Process with FASTER-WHISPER:
        try:
            segments, _ = self.model.transcribe(
                audio_to_process,
                language=self.language,
                beam_size=5,
                vad_filter=True,
                vad_parameters=dict(min_silence_duration_ms=500),
                word_timestamps=False
            )
            # Consumed inside the try so errors raised while iterating the
            # segments are handled too.
            text = " ".join([segment.text for segment in segments]).strip()
            return text if text else None
        except Exception as e:
            print(f"❌ Transcription error: {e}")
            return None
        # # REGULAR WHISPER:
        # try:
        # result = self.model.transcribe(
        # audio_to_process,
        # language=self.language,
        # task="transcribe",
        # fp16=False
        # )
        # return result["text"].strip()
        # except Exception as e:
        # print(f"❌ Transcription error: {e}")
        # return None
class LocalLLMAnalyzer:
    """Fact-check statements and generate follow-up questions via a local Ollama model.

    Args:
        model: Name of the Ollama model used for every request.

    Raises:
        RuntimeError: If the ``ollama`` package is missing or the Ollama
            service cannot be reached.
    """

    def __init__(self, model="llama3.2"):
        if not OLLAMA_AVAILABLE:
            raise RuntimeError(
                "Ollama package not installed.\n"
                "Install with: pip install ollama"
            )
        self.model = model
        self._test_connection()

    def _test_connection(self):
        """Call ``ollama.list()`` once; raise ``RuntimeError`` with setup hints on failure."""
        try:
            ollama.list()
            print(f"✓ Ollama connected using model: {self.model}")
        except Exception as e:
            raise RuntimeError(
                f"Cannot connect to Ollama. Ensure it's installed and running.\n"
                f"Error: {e}\n"
                f"Install from: https://ollama.ai\n"
                f"Then run: ollama pull {self.model}"
            ) from e

    def _extract_json(self, text):
        """Return the JSON-object substring of an LLM reply.

        Tried in order: an object inside a markdown code fence, the first
        substring starting at a ``{`` that parses as JSON, the widest
        ``{...}`` span, and finally the text itself.
        """
        import re
        # Try to find JSON block in markdown code fence
        fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
        if fenced:
            return fenced.group(1)

        # Scan for the first '{' that starts a parseable value. A greedy
        # first-'{'-to-last-'}' span breaks as soon as the reply contains a
        # second brace group.
        decoder = json.JSONDecoder()
        start = text.find('{')
        while start != -1:
            try:
                _, end = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                start = text.find('{', start + 1)
            else:
                return text[start:end]

        # Nothing parsed: hand back the widest span so the caller's
        # json.loads reports the failure.
        widest = re.search(r'\{.*\}', text, re.DOTALL)
        return widest.group(0) if widest else text

    def _normalize_fact_check(self, result):
        """Validate a parsed fact-check reply and coerce its fields.

        ``verdict`` becomes a stripped lowercase string, ``confidence`` a
        float clamped to [0.0, 1.0], and the optional fields get defaults.

        Raises:
            ValueError: If the reply is not an object, lacks a required
                field, or has a non-numeric confidence.
        """
        if not isinstance(result, dict):
            raise ValueError("Response is not a JSON object")
        if 'verdict' not in result or 'confidence' not in result:
            raise ValueError("Missing required fields")
        result['verdict'] = str(result['verdict']).strip().lower()
        try:
            confidence = float(result['confidence'])
        except (TypeError, ValueError):
            raise ValueError("Confidence is not a number") from None
        result['confidence'] = max(0.0, min(1.0, confidence))

        # Ensure defaults for optional fields
        result.setdefault('explanation', 'No explanation provided')
        result.setdefault('sources', [])
        result.setdefault('corrections', '')
        if isinstance(result['sources'], str):
            result['sources'] = [result['sources']]
        elif not isinstance(result['sources'], list):
            result['sources'] = []
        return result

    def fact_check(self, text, context=""):
        """Ask the model to judge a statement.

        Returns:
            A dict with ``verdict``, ``confidence``, ``explanation``,
            ``sources`` and ``corrections``. An unparseable or malformed
            reply yields a ``"dubious"`` placeholder; any other failure
            yields verdict ``"error"``. This method does not raise.
        """
        prompt = f"""You are a fact-checking assistant. Analyze this statement for factual accuracy.
Context: {context}
Statement: "{text}"
You must respond with ONLY valid JSON in this exact format, no other text:
{{
"verdict": "factual",
"confidence": 0.95,
"explanation": "Brief explanation here",
"sources": ["source1"],
"corrections": ""
}}
Valid verdict values: "factual", "dubious", "not_factual"
Confidence must be a number between 0.0 and 1.0."""
        try:
            response = ollama.generate(
                model=self.model,
                prompt=prompt,
                options={"temperature": 0.1, "num_predict": 200}
            )
            # Extract and parse JSON
            json_text = self._extract_json(response['response'])
            return self._normalize_fact_check(json.loads(json_text))
        except (json.JSONDecodeError, ValueError):
            # Return a simple analysis without JSON parsing
            return {
                "verdict": "dubious",
                "confidence": 0.5,
                "explanation": "Could not parse LLM response properly. Model may need JSON format support.",
                "sources": [],
                "corrections": ""
            }
        except Exception as e:
            return {
                "verdict": "error",
                "confidence": 0.0,
                "explanation": f"Analysis failed: {str(e)}",
                "sources": [],
                "corrections": ""
            }

    def generate_augmenting_questions(self, text, context=""):
        """Ask the model for three follow-up questions about a statement.

        Returns:
            A dict whose ``questions`` and ``topics`` values are always
            lists; on failure ``questions`` holds a single error message.
            This method does not raise.
        """
        prompt = f"""Based on this statement, generate 3 insightful questions that would help understand the topic better.
Statement: "{text}"
Context: {context}
Respond with JSON only:
{{
"questions": ["Question 1", "Question 2", "Question 3"],
"topics": ["key_topic_1", "key_topic_2"]
}}"""
        try:
            response = ollama.generate(
                model=self.model,
                prompt=prompt,
                format="json",
                options={"temperature": 0.7}
            )
            parsed = json.loads(self._extract_json(response['response']))
            if not isinstance(parsed, dict):
                # Valid JSON but not the requested object; report it like a
                # parse failure so callers always get a dict.
                raise json.JSONDecodeError("not a JSON object", str(parsed), 0)
            for key in ('questions', 'topics'):
                value = parsed.get(key)
                if isinstance(value, str):
                    parsed[key] = [value]
                elif not isinstance(value, list):
                    parsed[key] = []
            return parsed
        except json.JSONDecodeError:
            return {
                "questions": ["Error: LLM response was not valid JSON"],
                "topics": []
            }
        except Exception as e:
            return {
                "questions": [f"Error: {str(e)}"],
                "topics": []
            }
def list_audio_devices():
    """Print index, name, channel count and default sample rate of each input-capable device."""
    print("\nAvailable audio capture devices:")
    for index, info in enumerate(sd.query_devices()):
        # Skip devices that expose no input channels.
        if not info['max_input_channels'] > 0:
            continue
        print(f" [{index}] {info['name']}")
        print(f" Channels: {info['max_input_channels']} | Sample Rate: {info['default_samplerate']}")
    print()
def save_transcript(text, timestamp, filename):
    """Append ``[timestamp] text`` as one line to *filename*, creating its directory if needed."""
    # A bare filename has no directory part; fall back to the current directory.
    target_dir = os.path.dirname(filename) or '.'
    os.makedirs(target_dir, exist_ok=True)
    entry = f"[{timestamp}] {text}\n"
    with open(filename, "a", encoding="utf-8") as out:
        out.write(entry)
def save_enriched_transcript(data, filename):
    """Append a transcript entry plus its LLM analysis to *filename*.

    Args:
        data: Dict with required ``timestamp`` and ``text`` keys and
            optional ``fact_check`` / ``questions`` dicts. Analysis values
            that are missing, ``None`` or not dicts are skipped; a
            non-numeric confidence is written as 0.00.
        filename: Output path; its directory is created if needed.

    Raises:
        KeyError: If ``timestamp`` or ``text`` is missing from *data*.
    """
    os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)

    lines = [
        f"\n{'='*70}\n",
        f"[{data['timestamp']}] {data['text']}\n\n",
    ]

    fc = data.get('fact_check')
    if isinstance(fc, dict):
        # LLM output is untrusted: the verdict may not be a string and the
        # confidence may not be a number.
        verdict = str(fc.get('verdict') or 'N/A').upper()
        try:
            confidence = float(fc.get('confidence', 0))
        except (TypeError, ValueError):
            confidence = 0.0
        lines.append(f"📊 Fact Check: {verdict} "
                     f"(confidence: {confidence:.2f})\n")
        lines.append(f"💡 {fc.get('explanation', 'N/A')}\n")
        if fc.get('corrections'):
            lines.append(f"✏️ Correction: {fc['corrections']}\n")
        lines.append("\n")

    questions = data.get('questions')
    items = questions.get('questions') if isinstance(questions, dict) else None
    if isinstance(items, str):
        # A single string would otherwise be enumerated character by character.
        items = [items]
    if items:
        lines.append("❓ Questions:\n")
        lines.extend(f"{i}. {q}\n" for i, q in enumerate(items, 1))
        lines.append("\n")

    with open(filename, "a", encoding="utf-8") as f:
        f.writelines(lines)
def display_enriched_output(text, timestamp, fact_check=None, questions=None):
    """Print a transcript line followed by its optional LLM analysis.

    Args:
        text: Transcribed text.
        timestamp: Label printed in brackets before the text.
        fact_check: Optional dict with ``verdict``, ``confidence``,
            ``explanation`` and ``corrections``. A non-numeric confidence is
            shown as 0.00 and a missing verdict as N/A.
        questions: Optional dict whose ``questions`` entry lists the
            follow-up questions. Values of any other shape are ignored.
    """
    print(f"\n[{timestamp}] {text}")

    if isinstance(fact_check, dict) and fact_check:
        verdict_emoji = {
            'factual': '✅',
            'dubious': '⚠️',
            'not_factual': '❌',
            'error': '⚠️'
        }
        # LLM output is untrusted: the verdict may not be a string and the
        # confidence may not be a number.
        raw_verdict = fact_check.get('verdict')
        if raw_verdict is None:
            lookup_key, label = 'error', 'N/A'
        else:
            lookup_key, label = str(raw_verdict).lower(), str(raw_verdict).upper()
        emoji = verdict_emoji.get(lookup_key, '❓')
        try:
            confidence = float(fact_check.get('confidence', 0))
        except (TypeError, ValueError):
            confidence = 0.0
        print(f"\n{emoji} Fact Check: {label} "
              f"(confidence: {confidence:.2f})")
        print(f"💡 {fact_check.get('explanation', 'N/A')}")
        if fact_check.get('corrections'):
            print(f"✏️ Correction: {fact_check['corrections']}")

    items = questions.get('questions') if isinstance(questions, dict) else None
    if isinstance(items, str):
        # A single string would otherwise be enumerated character by character.
        items = [items]
    if items:
        print("\n❓ Questions:")
        for i, q in enumerate(items, 1):
            print(f" {i}. {q}")
def _process_segment(text, llm_analyzer, output_path, context):
    """Analyze (optionally), print and (optionally) save one transcript segment."""
    timestamp = datetime.now().strftime("%H:%M:%S")

    if not llm_analyzer:
        print(f"[{timestamp}] {text}")
        if output_path:
            save_transcript(text, timestamp, output_path)
        return

    # LLM Analysis
    fact_check = llm_analyzer.fact_check(text, context)
    questions = llm_analyzer.generate_augmenting_questions(text, context)
    display_enriched_output(text, timestamp, fact_check, questions)
    if output_path:
        data = {
            'timestamp': timestamp,
            'text': text,
            'fact_check': fact_check,
            'questions': questions
        }
        save_enriched_transcript(data, output_path)


def main():
    """Command-line entry point: capture audio, transcribe at intervals, print/save results.

    Runs until Ctrl+C, then flushes the remaining audio as a final segment
    and prints a summary. The capture stream is closed on every exit path.
    """
    parser = argparse.ArgumentParser(
        description="Real-time transcription of Windows speaker output",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python transcribe_speakers.py
python transcribe_speakers.py --model small --language es --interval 5
python transcribe_speakers.py --device "Speakers" --output "meeting.txt"
python transcribe_speakers.py --model medium --interval 10 --output transcripts/live.txt
"""
    )
    parser.add_argument("--model", default="base",
                        choices=["tiny", "base", "small", "medium", "large"],
                        help="Whisper model size (default: base)")
    parser.add_argument("--language", default="en",
                        help="Language code (default: en)")
    parser.add_argument("--device", metavar="NAME",
                        help="Audio device name (partial match). If not specified, auto-detects")
    parser.add_argument("--interval", type=float, default=8.0,
                        help="Processing interval in seconds (default: 8.0)")
    parser.add_argument("--output", "-o", metavar="FILE",
                        help="Save transcript to file (e.g., transcript.txt)")
    parser.add_argument("--list-devices", action="store_true",
                        help="List all available audio devices and exit")
    parser.add_argument("--force-cpu", action="store_true",
                        help="Force CPU processing (disable GPU acceleration)")
    parser.add_argument("--enable-llm", action="store_true",
                        help="Enable LLM analysis (fact-checking and questions)")
    parser.add_argument("--llm-model", default="gpt-oss:20b",
                        help="Ollama model to use for LLM analysis (default: gpt-oss:20b)")
    args = parser.parse_args()

    if args.list_devices:
        list_audio_devices()
        return

    print("=== Windows Real-Time Audio Transcription ===")
    print(f"Model: {args.model} | Language: {args.language} | Interval: {args.interval}s")
    if args.output:
        print(f"Output: {args.output}")
    if args.enable_llm:
        print(f"LLM Analysis: Enabled ({args.llm_model})")

    # Capture rate requested from the device; also used to convert sample
    # counts into seconds for the summary.
    sample_rate = 16000

    # Initialize audio capture
    try:
        capturer = WindowsLoopbackAudioCapture(
            device_name=args.device,
            sample_rate=sample_rate,
            chunk_size=2048
        )
    except RuntimeError as e:
        print(f"\n❌ Audio Error: {e}")
        print("\nTo fix this:")
        print("1. Right-click speaker icon → Sounds → Recording tab")
        print("2. Right-click in empty area → Show Disabled Devices")
        print("3. Enable 'Stereo Mix' → Set as Default Device")
        print("\nAlternative: Install VB-Cable (free) from vb-audio.com")
        print(" Then use: --device 'CABLE Output'")
        list_audio_devices()
        return

    # Initialize transcriber
    try:
        transcriber = WhisperStreamTranscriber(
            model_name=args.model,
            language=args.language,
            force_cpu=args.force_cpu
        )
    except Exception as e:
        # The capture stream is already running; release it before bailing out.
        capturer.close()
        print(f"\n❌ Model Error: {e}")
        print("Make sure you installed Whisper correctly")
        return

    # Initialize LLM analyzer (optional)
    llm_analyzer = None
    if args.enable_llm:
        try:
            llm_analyzer = LocalLLMAnalyzer(model=args.llm_model)
        except RuntimeError as e:
            print(f"\n❌ LLM Error: {e}")
            print("Continuing without LLM analysis...")
            llm_analyzer = None

    # Main processing loop
    print(f"\n✅ Started transcription. Press Ctrl+C to stop.\n{'=' * 50}")
    last_process_time = time.time()
    total_duration = 0
    segment_count = 0
    try:
        try:
            while True:
                # Collect audio
                chunk = capturer.read_chunk()
                if chunk is not None:
                    transcriber.add_audio(chunk)
                    total_duration += len(chunk) / sample_rate

                # Process at intervals
                current_time = time.time()
                if current_time - last_process_time >= args.interval:
                    text = transcriber.transcribe_chunk()
                    if text:
                        segment_count += 1
                        _process_segment(text, llm_analyzer, args.output,
                                         f"Segment {segment_count}")
                    last_process_time = current_time
        except KeyboardInterrupt:
            print(f"\n{'=' * 50}\n🛑 Stopping transcription...")
    finally:
        # Cleanup: runs for Ctrl+C and for any unexpected error alike.
        capturer.close()

    # Blocks that were captured but not yet consumed (e.g. while a
    # transcription was running) still belong to the final segment.
    while True:
        chunk = capturer.read_chunk()
        if chunk is None:
            break
        transcriber.add_audio(chunk)
        total_duration += len(chunk) / sample_rate

    # Process remaining audio
    print("\nProcessing remaining audio...")
    final_text = transcriber.transcribe_chunk(min_duration=0)
    if final_text:
        # Count the final segment so the summary matches what was emitted.
        segment_count += 1
        _process_segment(final_text, llm_analyzer, args.output, "Final segment")

    # Summary
    print(f"\n✅ Complete! Processed {total_duration:.1f}s of audio")
    print(f" Generated {segment_count} transcript segments")
    if args.output and os.path.exists(args.output):
        abs_path = os.path.abspath(args.output)
        print(f"💾 Transcript saved to: {abs_path}")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()