first commit

2025-12-17 13:07:01 +01:00
commit c0ca907b01
19 changed files with 3838 additions and 0 deletions
--- a/transcribe_dual_linux.py
+++ b/transcribe_dual_linux.py
@@ -0,0 +1,347 @@
+#!/usr/bin/env python3
+"""
+Real-time transcription with dual audio capture (microphone + speaker monitor).
+Linux/PipeWire optimized with Ollama LLM fact-checking.
+"""
+
+import sounddevice as sd
+import numpy as np
+import threading
+import queue
+import time
+import argparse
+from datetime import datetime
+from faster_whisper import WhisperModel
+
+try:
+    import ollama
+    OLLAMA_AVAILABLE = True
+except ImportError:
+    OLLAMA_AVAILABLE = False
+
+
+class DualAudioCapture:
+    """Capture microphone and, optionally, speaker-monitor audio simultaneously.
+
+    One ``sounddevice.InputStream`` is opened per source.  Each stream callback
+    pushes a ``(source, samples)`` tuple onto a shared queue, where ``source``
+    is ``'mic'`` or ``'monitor'`` and ``samples`` is a copy of the int16 block
+    handed to the callback.  Consumers poll the queue with :meth:`read_chunk`.
+    """
+
+    def __init__(self, mic_device=None, monitor_device=None, sample_rate=16000, chunk_size=2048):
+        """Resolve the devices, then open and start the input stream(s).
+
+        Args:
+            mic_device: Name substring of the microphone; ``None`` selects the
+                default input device.
+            monitor_device: Name substring of the monitor/loopback device; a
+                falsy value means microphone-only capture.
+            sample_rate: Sampling rate in Hz requested from both streams.
+            chunk_size: Number of frames per callback block.
+
+        Raises:
+            RuntimeError: If a named device is not found or there is no
+                default input device.
+            Exception: Anything ``sounddevice`` raises while opening or
+                starting a stream; streams opened so far are closed first.
+        """
+        self.sample_rate = sample_rate
+        self.chunk_size = chunk_size
+        self.audio_queue = queue.Queue()
+        # Set up front so close() is safe even if stream creation fails below.
+        self.mic_stream = None
+        self.monitor_stream = None
+
+        # Find devices
+        devices = sd.query_devices()
+
+        # Microphone (default input or specified)
+        if mic_device is None:
+            self.mic_device = sd.default.device[0]  # Default input
+            # A negative index means "no device"; using it to index the device
+            # list would silently report (and open) the wrong entry.
+            if isinstance(self.mic_device, int) and self.mic_device < 0:
+                raise RuntimeError("No default input device available")
+        else:
+            self.mic_device = self._find_device(mic_device, input_required=True)
+
+        # Monitor/Loopback (for speaker output)
+        if monitor_device:
+            self.monitor_device = self._find_device(monitor_device, input_required=True)
+        else:
+            self.monitor_device = None
+
+        # Device indices start at 0, so compare against None rather than
+        # relying on truthiness (index 0 is a perfectly valid device).
+        has_monitor = self.monitor_device is not None
+
+        print(f"✓ Microphone: {devices[self.mic_device]['name']} (index {self.mic_device})")
+        if has_monitor:
+            print(f"✓ Monitor: {devices[self.monitor_device]['name']} (index {self.monitor_device})")
+        else:
+            print("⚠ No monitor device - capturing microphone only")
+
+        # Open and start the streams; release whatever was opened on failure.
+        try:
+            self.mic_stream = self._open_stream(self.mic_device, self._mic_callback)
+            if has_monitor:
+                self.monitor_stream = self._open_stream(self.monitor_device, self._monitor_callback)
+
+            self.mic_stream.start()
+            if self.monitor_stream is not None:
+                self.monitor_stream.start()
+        except Exception:
+            self.close()
+            raise
+
+        print("✓ Audio capture started")
+
+    def _open_stream(self, device, callback):
+        """Create (but do not start) a mono int16 input stream for ``device``."""
+        return sd.InputStream(
+            device=device,
+            channels=1,
+            samplerate=self.sample_rate,
+            blocksize=self.chunk_size,
+            dtype='int16',
+            callback=callback
+        )
+
+    def _find_device(self, device_name, input_required=True):
+        """Return the index of the first device whose name contains ``device_name``.
+
+        The match is a case-insensitive substring test.  When
+        ``input_required`` is true, devices without input channels are skipped.
+
+        Raises:
+            RuntimeError: If no device matches.
+        """
+        wanted = device_name.lower()
+        for i, dev in enumerate(sd.query_devices()):
+            if wanted in dev['name'].lower():
+                if not input_required or dev['max_input_channels'] > 0:
+                    return i
+        raise RuntimeError(f"Device '{device_name}' not found")
+
+    def _mic_callback(self, indata, frames, time_info, status):
+        """Stream callback for the microphone: report status, enqueue a copy of the block."""
+        if status:
+            print(f"⚠ Mic status: {status}")
+        # Copy because the callback's buffer is only borrowed for the call.
+        self.audio_queue.put(('mic', indata.copy()))
+
+    def _monitor_callback(self, indata, frames, time_info, status):
+        """Stream callback for the monitor: report status, enqueue a copy of the block."""
+        if status:
+            print(f"⚠ Monitor status: {status}")
+        self.audio_queue.put(('monitor', indata.copy()))
+
+    def read_chunk(self):
+        """Return the next ``(source, samples)`` tuple, or ``None`` if none arrives within 50 ms."""
+        try:
+            return self.audio_queue.get(timeout=0.05)
+        except queue.Empty:
+            return None
+
+    def close(self):
+        """Stop and close every stream that was opened; safe on a partially built instance."""
+        for stream in (self.mic_stream, self.monitor_stream):
+            if stream is not None:
+                stream.stop()
+                stream.close()
+
+
+class WhisperTranscriber:
+    """Accumulate audio per source and transcribe it with faster-whisper.
+
+    Audio is kept in two float32 buffers (``mic_buffer`` and
+    ``monitor_buffer``) guarded by ``lock``.  :meth:`add_audio` appends to a
+    buffer; :meth:`transcribe_chunk` drains every buffer that has reached the
+    requested duration and returns the recognised text per source.
+    """
+
+    # Sample rate (Hz) used to convert buffer lengths into durations.
+    SAMPLE_RATE = 16000
+
+    def __init__(self, model_name="base", language="en", force_cpu=False):
+        """Load the Whisper model and create empty buffers.
+
+        Args:
+            model_name: Model size/name passed to ``WhisperModel``.
+            language: Language code passed to every transcription call.
+            force_cpu: When true, never attempt to use CUDA.
+        """
+        print(f"Loading Whisper model '{model_name}'...")
+
+        device, compute_type = self._select_device(force_cpu)
+
+        if device == "cpu":
+            print("✓ Using CPU")
+
+        model_kwargs = {"device": device, "compute_type": compute_type}
+        if device == "cpu":
+            model_kwargs["cpu_threads"] = 4
+
+        self.model = WhisperModel(model_name, **model_kwargs)
+        self.language = language
+        self.mic_buffer = np.array([], dtype=np.float32)
+        self.monitor_buffer = np.array([], dtype=np.float32)
+        self.lock = threading.Lock()
+
+    @staticmethod
+    def _select_device(force_cpu):
+        """Return ``(device, compute_type)``: ``("cuda", "float16")`` or ``("cpu", "int8")``.
+
+        torch is optional here: it is only consulted for CUDA availability and
+        the GPU name.  When it is installed and reports no CUDA, the CPU is
+        used; when it is missing, the CTranslate2 device count alone decides.
+        """
+        cpu_choice = ("cpu", "int8")
+        if force_cpu:
+            return cpu_choice
+
+        try:
+            import torch
+        except ImportError:
+            torch = None
+
+        if torch is not None and not torch.cuda.is_available():
+            return cpu_choice
+
+        try:
+            import ctranslate2
+            if ctranslate2.get_cuda_device_count() > 0:
+                gpu_name = torch.cuda.get_device_name(0) if torch is not None else "CUDA device 0"
+                print(f"✓ Using GPU: {gpu_name}")
+                return "cuda", "float16"
+        except Exception as e:
+            # Best effort: any failure while probing CUDA falls back to CPU.
+            print(f"⚠ CUDA unavailable: {e}")
+
+        return cpu_choice
+
+    def add_audio(self, source, audio_chunk):
+        """Append an int16 audio block to the buffer for ``source``.
+
+        Args:
+            source: ``'mic'`` selects the microphone buffer; any other value
+                selects the monitor buffer.
+            audio_chunk: NumPy array of int16 samples (any shape; it is
+                flattened).  Samples are scaled to float32 by 1/32768.
+        """
+        # Convert before taking the lock so the critical section stays short.
+        audio_float = audio_chunk.flatten().astype(np.float32) / 32768.0
+        with self.lock:
+            if source == 'mic':
+                self.mic_buffer = np.concatenate([self.mic_buffer, audio_float])
+            else:
+                self.monitor_buffer = np.concatenate([self.monitor_buffer, audio_float])
+
+    def transcribe_chunk(self, min_duration=3.0):
+        """Transcribe every buffer holding at least ``min_duration`` seconds of audio.
+
+        Buffers that qualify are emptied; shorter ones keep accumulating.
+
+        Returns:
+            A dict mapping ``'mic'`` / ``'monitor'`` to the recognised text
+            (``None`` when nothing was recognised), or ``None`` when no buffer
+            was long enough.
+        """
+        pending = {}
+
+        # Only swap the buffers out under the lock; inference can take seconds
+        # and must not block add_audio() on another thread.
+        with self.lock:
+            if self._is_ready(self.mic_buffer, min_duration):
+                pending['mic'] = self.mic_buffer
+                self.mic_buffer = np.array([], dtype=np.float32)
+
+            if self._is_ready(self.monitor_buffer, min_duration):
+                pending['monitor'] = self.monitor_buffer
+                self.monitor_buffer = np.array([], dtype=np.float32)
+
+        results = {source: self._transcribe(audio) for source, audio in pending.items()}
+        return results if results else None
+
+    def _is_ready(self, buffer, min_duration):
+        """Return True if ``buffer`` is non-empty and at least ``min_duration`` seconds long."""
+        # The emptiness check keeps min_duration <= 0 from feeding empty audio to the model.
+        return len(buffer) > 0 and len(buffer) / self.SAMPLE_RATE >= min_duration
+
+    def _transcribe(self, audio):
+        """Run Whisper on ``audio`` and return the joined text, or ``None``.
+
+        Errors are reported on stdout and turned into ``None`` so a single
+        failed chunk does not stop the caller.
+        """
+        try:
+            segments, _ = self.model.transcribe(
+                audio,
+                language=self.language,
+                beam_size=3,  # Faster than default 5
+                vad_filter=True,
+                vad_parameters=dict(min_silence_duration_ms=500)
+            )
+            text = " ".join(seg.text for seg in segments).strip()
+            return text if text else None
+        except Exception as e:
+            print(f"❌ Transcription error: {e}")
+            return None
+
+
+class LLMFactChecker:
+    """Fact-check short statements with a model served by Ollama."""
+
+    def __init__(self, model="qwen2.5:3b"):
+        """Verify that the Ollama client is importable and the server answers.
+
+        Args:
+            model: Name of the Ollama model used by :meth:`fact_check`.
+
+        Raises:
+            RuntimeError: If the ``ollama`` package is missing or the
+                ``ollama.list()`` probe fails.
+        """
+        if not OLLAMA_AVAILABLE:
+            raise RuntimeError("Ollama not installed: pip install ollama")
+
+        self.model = model
+        try:
+            ollama.list()
+        except Exception as e:
+            raise RuntimeError(f"Ollama not running: {e}") from e
+        print(f"✓ Ollama connected: {self.model}")
+
+    def fact_check(self, text):
+        """Ask the model for a verdict on ``text``.
+
+        Returns:
+            A dict with keys ``'verdict'`` (lower-cased word, ``'unknown'`` if
+            absent), ``'confidence'`` (float in [0, 1], ``0.5`` if absent) and
+            ``'reason'``.  Any failure is reported as
+            ``{'verdict': 'error', 'confidence': 0.0, 'reason': <message>}``
+            instead of raising.
+        """
+        prompt = f"""Fact-check this statement. Reply ONLY with:
+VERDICT: factual/dubious/false
+CONFIDENCE: 0.0-1.0
+REASON: one sentence
+
+Statement: "{text}" """
+
+        try:
+            response = ollama.generate(
+                model=self.model,
+                prompt=prompt,
+                options={"temperature": 0.1, "num_predict": 80}
+            )
+            return self._parse_response(response['response'])
+        except Exception as e:
+            # Deliberate best effort: the caller prints the reason and carries on.
+            return {'verdict': 'error', 'confidence': 0.0, 'reason': str(e)}
+
+    @staticmethod
+    def _parse_response(raw):
+        """Extract verdict, confidence and reason from the model's reply text.
+
+        Missing fields fall back to ``'unknown'``, ``0.5`` and the first 150
+        characters of ``raw`` respectively.
+        """
+        import re
+
+        verdict = re.search(r'VERDICT:\s*(\w+)', raw, re.I)
+        # Match one well-formed number only, so trailing punctuation such as
+        # "0.9." cannot make float() fail and discard the whole reply.
+        confidence = re.search(r'CONFIDENCE:\s*(\d+(?:\.\d+)?|\.\d+)', raw, re.I)
+        reason = re.search(r'REASON:\s*(.+?)(?:\n|$)', raw, re.I | re.DOTALL)
+
+        if confidence:
+            # Clamp to the range the prompt asks for (e.g. "85" becomes 1.0).
+            score = min(1.0, max(0.0, float(confidence.group(1))))
+        else:
+            score = 0.5
+
+        return {
+            'verdict': verdict.group(1).lower() if verdict else 'unknown',
+            'confidence': score,
+            'reason': reason.group(1).strip() if reason else raw[:150]
+        }
+
+
+def main():
+    """Parse CLI arguments and run the capture, transcribe, fact-check loop.
+
+    With ``--list-devices`` the input devices are printed and the function
+    returns.  Otherwise audio capture, the Whisper model and (optionally) the
+    fact checker are initialised, and queued audio is transcribed every
+    ``--interval`` seconds until Ctrl+C.  The capture streams are closed on
+    every exit path once they have been opened.
+    """
+    parser = argparse.ArgumentParser(description="Dual audio transcription with fact-checking")
+    parser.add_argument("--model", default="tiny", choices=["tiny", "base", "small", "medium"],
+                        help="Whisper model (default: tiny for speed)")
+    parser.add_argument("--language", default="en", help="Language code")
+    parser.add_argument("--mic", help="Microphone device name (partial match)")
+    parser.add_argument("--monitor", help="Monitor device name for speaker capture")
+    parser.add_argument("--interval", type=float, default=5.0, help="Processing interval (seconds)")
+    parser.add_argument("--min-duration", type=float, default=2.0, help="Min audio duration")
+    parser.add_argument("--enable-llm", action="store_true", help="Enable fact-checking")
+    parser.add_argument("--llm-model", default="qwen2.5:3b", help="Ollama model")
+    parser.add_argument("--list-devices", action="store_true", help="List audio devices")
+    parser.add_argument("--force-cpu", action="store_true", help="Force CPU")
+
+    args = parser.parse_args()
+
+    if args.list_devices:
+        print("\nAvailable audio devices:")
+        for i, dev in enumerate(sd.query_devices()):
+            in_ch = dev['max_input_channels']
+            out_ch = dev['max_output_channels']
+            # Only devices that can be recorded from are relevant here.
+            if in_ch > 0:
+                print(f"  [{i:2d}] {dev['name']:<50} IN:{in_ch} OUT:{out_ch}")
+        return
+
+    print("=== Dual Audio Transcription with Fact-Checking ===")
+    print(f"Model: {args.model} | Language: {args.language} | Interval: {args.interval}s")
+
+    # Initialize capture
+    try:
+        capturer = DualAudioCapture(
+            mic_device=args.mic,
+            monitor_device=args.monitor,
+            sample_rate=16000,
+            chunk_size=2048
+        )
+    except Exception as e:
+        print(f"\n❌ Audio Error: {e}")
+        print("\nTip: Use --list-devices to see available devices")
+        print("     Use --mic and --monitor to specify devices")
+        return
+
+    # The streams are live from here on; the finally block releases them on
+    # every exit path (early return, Ctrl+C, or an unexpected exception).
+    try:
+        # Initialize transcriber
+        try:
+            transcriber = WhisperTranscriber(
+                model_name=args.model,
+                language=args.language,
+                force_cpu=args.force_cpu
+            )
+        except Exception as e:
+            print(f"\n❌ Whisper Error: {e}")
+            return
+
+        # Initialize fact checker
+        fact_checker = None
+        if args.enable_llm:
+            try:
+                fact_checker = LLMFactChecker(model=args.llm_model)
+            except Exception as e:
+                print(f"\n⚠ LLM Error: {e}")
+                print("Continuing without fact-checking...")
+
+        # Main loop
+        print(f"\n✅ Started. Press Ctrl+C to stop.\n{'='*60}")
+        # Monotonic clock: interval timing must not be affected by wall-clock jumps.
+        last_process = time.monotonic()
+
+        try:
+            while True:
+                # Collect audio
+                chunk = capturer.read_chunk()
+                if chunk:
+                    source, audio = chunk
+                    transcriber.add_audio(source, audio)
+
+                # Process at intervals
+                if time.monotonic() - last_process >= args.interval:
+                    results = transcriber.transcribe_chunk(min_duration=args.min_duration)
+
+                    if results:
+                        timestamp = datetime.now().strftime("%H:%M:%S")
+
+                        for source, text in results.items():
+                            if text:
+                                source_emoji = "🎤" if source == 'mic' else "🔊"
+                                print(f"\n{source_emoji} [{timestamp}] {text}")
+
+                                if fact_checker:
+                                    fc = fact_checker.fact_check(text)
+                                    verdict_emoji = {'factual': '✅', 'dubious': '⚠️', 'false': '❌'}.get(fc['verdict'], '❓')
+                                    print(f"   {verdict_emoji} {fc['verdict'].upper()} ({fc['confidence']:.2f}): {fc['reason']}")
+
+                    # Restart the interval after processing so slow transcription
+                    # does not trigger back-to-back runs.
+                    last_process = time.monotonic()
+
+        except KeyboardInterrupt:
+            print(f"\n{'='*60}\n🛑 Stopping...")
+    finally:
+        capturer.close()
+
+    print("\n✅ Done!")
+
+
+# Script entry point: run the CLI only when executed directly, not on import.
+if __name__ == "__main__":
+    main()