From ae818f0b4bda099b7afc64b55f44e7522db287c8 Mon Sep 17 00:00:00 2001 From: mike Date: Wed, 17 Dec 2025 16:33:19 +0100 Subject: [PATCH] init --- .gitignore | 307 +++++++++++++++ README.md | 222 +++++++++++ enriched.txt | 226 +++++++++++ requirements.txt | 21 ++ run.sh | 4 + run_transcribe.sh | 14 + sentence_extractor.py | 260 +++++++++++++ transcribe.iml | 13 + transcribe_dual_linux.py | 347 +++++++++++++++++ transcribe_speakers.py | 792 +++++++++++++++++++++++++++++++++++++++ 10 files changed, 2206 insertions(+) create mode 100755 .gitignore create mode 100644 README.md create mode 100755 enriched.txt create mode 100755 requirements.txt create mode 100755 run.sh create mode 100755 run_transcribe.sh create mode 100644 sentence_extractor.py create mode 100644 transcribe.iml create mode 100755 transcribe_dual_linux.py create mode 100755 transcribe_speakers.py diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..566d245 --- /dev/null +++ b/.gitignore @@ -0,0 +1,307 @@ +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +lerna-debug.log* +.pnpm-debug.log* + +# Diagnostic reports (https://nodejs.org/api/report.html) +report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json + +# Runtime data +pids +*.pid +*.seed +*.pid.lock + +# Directory for instrumented libs generated by jscoverage/JSCover +lib-cov + +# Coverage directory used by tools like istanbul +coverage +*.lcov + +# nyc test coverage +.nyc_output + +# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) +.grunt + +# Bower dependency directory (https://bower.io/) +bower_components + +# node-waf configuration +.lock-wscript + +# Compiled binary addons (https://nodejs.org/api/addons.html) +build/Release + +# Dependency directories +free-cluely/node_modules/ +jspm_packages/ +dist-electron/ +dist/ + +# Snowpack dependency directory (https://snowpack.dev/) +web_modules/ + +# TypeScript cache +*.tsbuildinfo + +# Optional npm cache directory +.npm + +# Optional 
eslint cache +.eslintcache + +# Optional stylelint cache +.stylelintcache + +# Microbundle cache +.rpt2_cache/ +.rts2_cache_cjs/ +.rts2_cache_es/ +.rts2_cache_umd/ + +# Optional REPL history +.node_repl_history + +# Output of 'npm pack' +*.tgz + +# Yarn Integrity file +.yarn-integrity + +# dotenv environment variable files +.env +.env.development.local +.env.test.local +.env.production.local +.env.local + +# parcel-bundler cache (https://parceljs.org/) +.cache +.parcel-cache + +# Next.js build output +.next +out + +# Nuxt.js build / generate output +.nuxt +dist + +# Gatsby files +.cache/ +# Comment in the public line in if your project uses Gatsby and not Next.js +# https://nextjs.org/blog/next-9-1#public-directory-support +# public + +# vuepress build output +.vuepress/dist + +# vuepress v2.x temp and cache directory +.temp +.cache + +# vitepress build output +**/.vitepress/dist + +# vitepress cache directory +**/.vitepress/cache + +# Docusaurus cache and generated files +.docusaurus + +# Serverless directories +.serverless/ + +# FuseBox cache +.fusebox/ + +# DynamoDB Local files +.dynamodb/ + +# TernJS port file +.tern-port + +# Stores VSCode versions used for testing VSCode extensions +.vscode-test + +# yarn v2 +.yarn/cache +.yarn/unplugged +.yarn/build-state.yml +.yarn/install-state.gz +.pnp.* + + + +##### +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ +.idea/ +.voice/ \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..6d1c5ef --- /dev/null +++ b/README.md @@ -0,0 +1,222 @@ +# Verbatim Dicta + +Real-time audio transcription using Whisper AI with optional LLM-powered analysis. Captures system audio via loopback and transcribes it with configurable models and processing options. 
+ +## Features + +- Real-time transcription of system audio (Windows/Linux) +- Multiple Whisper model sizes (tiny to large) +- Multi-language support +- **Sentence extraction mode** - Stitches audio chunks into complete sentences +- Optional LLM analysis for fact-checking and question generation (via Ollama) +- GPU acceleration support +- Flexible audio device configuration + +## Quick Start + +```bash +# Install dependencies +pip install -r requirements.txt + +# Basic transcription (no LLM) +python transcribe_speakers.py + +# With LLM analysis (optional) +python transcribe_speakers.py --enable-llm + +# With sentence extraction +python transcribe_speakers.py --sentence-mode + +# List audio devices +python transcribe_speakers.py --list-devices +``` + +## Requirements + +- **OS**: Windows 10/11 or Linux +- **Python**: 3.8+ +- **Audio**: Loopback device (Stereo Mix/VB-Cable on Windows, PulseAudio on Linux) +- **Optional**: CUDA-capable GPU, Ollama for LLM features + +## Installation + +### 1. Install Dependencies + +```bash +pip install -r requirements.txt +``` + +### 2. GPU Support (Optional) + +For CUDA 11.8: +```bash +pip install torch==2.8.0+cu118 --index-url https://download.pytorch.org/whl/cu118 +``` + +For CUDA 12.1: +```bash +pip install torch==2.8.0+cu121 --index-url https://download.pytorch.org/whl/cu121 +``` + +### 3. Audio Loopback Setup + +**Windows - Option A (Stereo Mix):** +1. Right-click speaker icon → Sounds → Recording tab +2. Right-click → Show Disabled Devices +3. Enable and set Stereo Mix as default + +**Windows - Option B (VB-Cable, recommended):** +1. Download from [vb-audio.com](https://vb-audio.com/Cable/) +2. Install and restart +3. Use `--device "CABLE Output"` + +**Linux:** +Configure PulseAudio loopback or use `transcribe_dual_linux.py` + +### 4. 
LLM Features (Optional) + +```bash +# Install Ollama from ollama.ai +ollama pull llama3.2 +``` + +## Usage + +### Available Scripts + +- `transcribe_speakers.py` - Main script with all features (LLM optional via `--enable-llm`) +- `transcribe_dual_linux.py` - Linux-specific with dual audio support + +### Common Commands + +```bash +# Specify device and model +python transcribe_speakers.py --device "CABLE Output" --model medium + +# Save to file with language +python transcribe_speakers.py --language es --output transcript.txt + +# Fast mode (low latency) +python transcribe_speakers.py --fast-mode --model tiny --interval 3 + +# Extract complete sentences from chunks +python transcribe_speakers.py --sentence-mode --output sentences.txt + +# Maximum accuracy with LLM and sentence extraction +python transcribe_speakers.py --model large --enable-llm --sentence-mode --output enriched.txt + +# Force CPU (avoid GPU issues) +python transcribe_speakers.py --force-cpu +``` + +### Key Options + +| Option | Description | Default | +|--------|-------------|---------| +| `--model` | Model size: tiny/base/small/medium/large | base | +| `--language` | Language code (en/es/fr/de/ja/etc.) 
| en | +| `--device` | Audio device name (partial match) | Auto | +| `--interval` | Processing interval (seconds) | 8.0 | +| `--min-duration` | Minimum audio duration | 3.0 | +| `--fast-mode` | Fast mode (3-5x faster, lower accuracy) | False | +| `--enable-llm` | Enable fact-checking and questions | False | +| `--llm-model` | Ollama model to use | llama3.2 | +| `--output` | Save to file | None | +| `--force-cpu` | Disable GPU | False | +| `--gpu-index` | GPU device index | 0 | +| `--sentence-mode` | Extract complete sentences from chunks | False | + +## Model Performance + +| Model | Size | Speed | Quality | Best For | +|-------|------|-------|---------|----------| +| tiny | ~75 MB | Fastest | Basic | Quick tests, low-latency | +| base | ~145 MB | Fast | Good | General real-time use | +| small | ~485 MB | Moderate | Better | Balanced accuracy/speed | +| medium | ~1.5 GB | Slow | Great | High accuracy needs | +| large | ~3 GB | Slowest | Best | Maximum accuracy | + +## Optimization Presets + +**Low Latency (Real-Time):** +```bash +python transcribe_speakers.py --model tiny --fast-mode --interval 2 --min-duration 1.5 +``` + +**Balanced:** +```bash +python transcribe_speakers.py --model base --interval 5 +``` + +**High Accuracy:** +```bash +python transcribe_speakers.py --model large --interval 10 --enable-llm +``` + +## Troubleshooting + +**No loopback device:** +- Windows: Enable Stereo Mix or install VB-Cable +- Linux: Configure PulseAudio loopback + +**CUDA errors:** +```bash +python transcribe_speakers.py --force-cpu +``` + +**No audio captured:** +- Verify audio is playing +- Check device: `--list-devices` +- Increase system volume + +**Poor quality:** +- Use larger model: `--model medium` +- Increase interval: `--interval 10` +- Specify language: `--language <code>` (e.g. `--language es`) + +**Ollama errors:** +- Ensure Ollama is running +- Pull model: `ollama pull llama3.2` + +## Output Format + +**Standard:** +``` +[14:23:15] Transcribed audio segment. 
+[14:23:23] Another segment with timestamp. +``` + +**With LLM (--enable-llm):** +``` +====================================================================== +[14:23:15] The Earth revolves around the Sun in 365 days. + +📊 Fact Check: FACTUAL (confidence: 0.98) +💡 Scientifically accurate. Earth's orbital period is 365.25 days. + +❓ Questions: +1. Why do we need leap years? +2. How does Earth's orbit affect seasons? +====================================================================== +``` + +## Technical Stack + +- **Audio**: sounddevice, soundfile (16kHz mono, 16-bit PCM) +- **Transcription**: faster-whisper (optimized Whisper) +- **LLM**: Ollama (local inference) +- **Capture**: WASAPI loopback (Windows), PulseAudio (Linux) + +## Future Work + +- Real-time streaming transcription with reduced buffering +- Speaker diarization improvements +- Web interface for remote monitoring +- Multi-device simultaneous transcription +- Cloud LLM integration options +- Custom vocabulary and domain adaptation +- Noise reduction preprocessing + +## License + +Uses [Whisper](https://github.com/openai/whisper) (OpenAI), [faster-whisper](https://github.com/SYSTRAN/faster-whisper) (SYSTRAN), and [Ollama](https://ollama.ai). diff --git a/enriched.txt b/enriched.txt new file mode 100755 index 0000000..eb532c3 --- /dev/null +++ b/enriched.txt @@ -0,0 +1,226 @@ +[23:31:46] So it helps us get back into a grounded information terrain and then also it requires us. + +📊 Fact Check: NOT_FACTUAL (confidence: 0.90) +💡 The statement is a vague, non‑specific claim that cannot be verified against any factual evidence. + +❓ Questions: +1. What specific processes or actions help us return to a grounded information terrain? +2. In what ways does this approach require us to change our current practices or mindset? +3. How does re-establishing a grounded information terrain impact the overall effectiveness of the project? 
+ + +====================================================================== +[23:31:54] to take the time to pay attention to information, really absorb it properly, and then to make decisions based on that. So we need to bring people into the process of + +πŸ“Š Fact Check: NOT_FACTUAL (confidence: 0.99) +πŸ’‘ The statement is an incomplete, non‑factual description of a process, not a verifiable claim. + +❓ Questions: +1. What are the key points here? +2. What evidence supports this? +3. What are the implications? + + +====================================================================== +[23:32:02] decision making and at the same time as part of that bring them into a terrain of really + +πŸ“Š Fact Check: NOT_FACTUAL (confidence: 1.00) +πŸ’‘ The statement is a nonsensical fragment and does not convey a verifiable factual claim. + +❓ Questions: +1. What does the phrase "bring them into a terrain of really" refer to in the context of decision making? +2. How does the process of decision making simultaneously involve "bringing them into a terrain" as mentioned? +3. Can you explain how the concept of "terrain"? + + +====================================================================== +[23:32:10] curing and discerning information properly and then engage in the decision-making process. That's the only way we're actually going to turn this around. It's not going to be good enough to... + +πŸ“Š Fact Check: NOT_FACTUAL (confidence: 0.90) +πŸ’‘ The fragment is an incomplete quote with no verifiable factual claim. + +❓ Questions: +1. What does "curing and discerning information properly" entail in the context of this statement? +2. How does engaging in the decision‑making process contribute to turning the situation around? +3. Why is simply having information or a plan not sufficient according to the speaker? 
+ + +====================================================================== +[23:32:18] to elect new politicians because the underlying problem of the way we absorb, process and deal with information now remains. And the only way we can do that is actually to do it. + +πŸ“Š Fact Check: NOT_FACTUAL (confidence: 0.95) +πŸ’‘ The statement is an opinion about politics and information processing, not a verifiable factual claim. + +❓ Questions: +1. What specific aspects of the way we absorb, process, and deal with information are identified as the underlying problem in the statement? +2. How does the statement justify the election of new politicians as a solution to the information-related issue it describes? +3. What practical steps or strategies does the statement imply we should take to "actually do it" in addressing the information problem? + + +====================================================================== +[23:32:26] actually by bringing people in on a mass basis, having huge numbers of citizens, juries around the country on a regular basis for decisions we're making involving the + +πŸ“Š Fact Check: NOT_FACTUAL (confidence: 0.90) +πŸ’‘ The statement is incomplete and lacks context, making it impossible to verify its factual accuracy. + +❓ Questions: +1. What are the key points here? +2. What evidence supports this? +3. What are the implications? + + +====================================================================== +[23:32:35] public that's the only way we're going to be able to turn this around and not just think that okay let's just wait for another Kamala Harris or somebody like that to come along and win an election then + +πŸ“Š Fact Check: NOT_FACTUAL (confidence: 0.95) +πŸ’‘ The quoted phrase + +❓ Questions: +1. What specific actions does the speaker believe are necessary? +2. Q1: What specific actions does the speaker believe are necessary? +3. What are the implications? 
+ + +====================================================================== +[23:32:42] we'll all be right and we'll be able to turn the clock back. It won't work like that. The problem is far too deep seated than that. So, yes, we are becoming + +πŸ“Š Fact Check: NOT_FACTUAL (confidence: 0.90) +πŸ’‘ The statement is a vague, incomplete fragment with no verifiable factual claim. + +❓ Questions: +1. What specific problem is the speaker implying is "far too deep seated" to be solved by simply "turning the clock back"? +2. How does the speaker's claim that "we'll all be right" relate to the broader context or argument being presented? +3. In what ways might the statement "So, yes, we are becoming" reflect a shift in perspective or identity for the speaker or the audience? + + +====================================================================== +[23:32:50] That's basically what's going on at the moment. But that doesn't mean that we can lose hope, because there are mechanisms in which we can actually turn that around. + +πŸ“Š Fact Check: NOT_FACTUAL (confidence: 0.90) +πŸ’‘ The statement is a general, non‑specific claim that cannot be verified as true or false. + +❓ Questions: +1. What specific situation or issue is being described as "what's going on at the moment"? +2. What mechanisms are being referred to that could help "turn that around"? +3. How does the speaker justify maintaining hope despite the current challenges? + + +====================================================================== +[23:32:58] by actually engaging in the political process ourselves, which would force us to then utilise information in a different way. + +πŸ“Š Fact Check: DUBIOUS (confidence: 0.70) +πŸ’‘ The claim is a speculative assertion about how political engagement might change information use, and it cannot be verified as a factual statement. + +❓ Questions: +1. What are the key points here? +2. What evidence supports this? +3. What are the implications? 
+ + +====================================================================== +[23:33:06] hopefully in the end come to different conclusions, but be part of that decision-making process too. So it's an important realization. What's happening to us is + +πŸ“Š Fact Check: NOT_FACTUAL (confidence: 1.00) +πŸ’‘ The statement is a subjective expression of hope and realization, not a verifiable factual claim. + +❓ Questions: +1. What does the speaker mean by "hopefully in the end come to different conclusions" and how does that relate to the decision-making process mentioned? +2. In what ways might being part of the decision-making process influence the outcomes described in the statement? +3. What specific "important realization" is referenced, and how does it connect to "what's happening to us"? + + +====================================================================== +[23:33:25] species in terms of our intelligence but it more importantly gives us a very important call to action. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. We need to think. + +πŸ“Š Fact Check: NOT_FACTUAL (confidence: 1.00) +πŸ’‘ The statement is a nonsensical fragment that does not present any verifiable factual claim. + +❓ Questions: +1. What is the main message conveyed by? +2. Q1: What is the main message conveyed by? +3. What are the implications? 
+ + +====================================================================== +[23:33:40] differently about how we govern ourselves going forward if we are to reverse this genuine decline. I hope you like my video. As a psychiatrist who loves politics and economics and philosophy I love to make videos like this and you can really help promote this video to other people and get it on your feed more by liking and commenting and subscribing to the video. + +πŸ“Š Fact Check: NOT_FACTUAL (confidence: 1.00) +πŸ’‘ The statement is a personal comment and request for promotion, not a claim that can be verified as true or false. + +❓ Questions: +1. What specific strategies does the speaker propose for reversing the "genuine decline" in governance mentioned in the statement? +2. How does the speaker’s background as a psychiatrist influence their perspective on politics, economics, and philosophy? +3. In what ways does the speaker suggest viewers can effectively promote the video to reach a wider audience? + + +====================================================================== +[23:33:46] well. We have a wonderful community of people here who comment and support each other through this very traumatic period of world history. + +πŸ“Š Fact Check: DUBIOUS (confidence: 0.60) +πŸ’‘ The claim is a subjective, unverified assertion about a community’s nature and cannot be confirmed or refuted with available evidence. + +❓ Questions: +1. Who are the members of the community mentioned in the statement? +2. Which specific traumatic period of world history is being referred to? +3. In what ways do the community members comment and support each other during this period? + + +====================================================================== +[23:33:54] that we're going through right now. I also hope you consider becoming a subscriber to the channel and also subscribing to my. 
+ +πŸ“Š Fact Check: NOT_FACTUAL (confidence: 1.00) +πŸ’‘ The sentence is a fragment expressing a hope, not a verifiable factual claim. + +❓ Questions: +1. What are the key points here? +2. What evidence supports this? +3. What are the implications? + + +====================================================================== +[23:34:02] E-newsletter, there's a link in the description, and that way we can stay in touch outside the channel so you can stay, keep a + +πŸ“Š Fact Check: NOT_FACTUAL (confidence: 0.90) +πŸ’‘ [one sentence]" + +❓ Questions: +1. What are the key points here? +2. What evidence supports this? +3. What are the implications? + + +====================================================================== +[23:34:10] rest of all of the content that I'm making on an ongoing basis. The latest of which actually is my latest book called We the People. + +πŸ“Š Fact Check: DUBIOUS (confidence: 0.50) +πŸ’‘ The statement is a fragment with no verifiable context or evidence that the speaker’s latest book is titled *We the People*. + +❓ Questions: +1. What are the key points here? +2. What evidence supports this? +3. What are the implications? + + +====================================================================== +[23:34:19] very very proud of this book. It's actually a novel, a fiction book, written by myself and the famous award-winning author T.J. McGregor. + +πŸ“Š Fact Check: NOT_FACTUAL (confidence: 0.90) +πŸ’‘ No verifiable record exists of a novel co‑authored by the user and an award‑winning author + +❓ Questions: +1. How did you and T.J. McGregor collaborate on the novel? +2. What inspired you to co-write a fiction book with an award‑winning author? +3. What genre and themes does the novel? + + +====================================================================== +[23:34:27] Together we wrote a book about what the future might look like. Bit of a dystopian novel, but what might happen if autocracy goes to its next stage? 
+ +πŸ“Š Fact Check: NOT_FACTUAL (confidence: 0.90) +πŸ’‘ There is no verifiable evidence that the speaker and the other person co‑authored a book on future dystopias. + +❓ Questions: +1. What core themes and motifs did the book explore to envision the next stage of autocracy? +2. How does the narrative structure of the novel reflect the progression of authoritarian power in a dystopian future? +3. What real-world events or historical patterns inspired the authors to imagine a future where autocracy has evolved beyond its current form? + diff --git a/requirements.txt b/requirements.txt new file mode 100755 index 0000000..60be6fd --- /dev/null +++ b/requirements.txt @@ -0,0 +1,21 @@ +# Core dependencies for Windows Real-Time Audio Transcription +# Audio Processing +sounddevice==0.5.3 +soundfile==0.13.1 +numpy==2.2.2 + +# Whisper (faster-whisper backend) +faster-whisper==1.2.1 +ctranslate2==4.6.1 + +# PyTorch (CPU version - see README for GPU installation) +torch==2.8.0 + +# NOTE: For GPU support, uninstall torch first, then run ONE of these commands: +# pip install torch==2.8.0+cu118 --index-url https://download.pytorch.org/whl/cu118 +# pip install torch==2.8.0+cu121 --index-url https://download.pytorch.org/whl/cu121 + +# LLM Analysis (optional - for fact-checking and question generation) +# Requires Ollama to be installed and running +# Install from: https://ollama.ai +ollama==0.6.1 diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..6765e29 --- /dev/null +++ b/run.sh @@ -0,0 +1,4 @@ +# Install all dependencies with correct NumPy version +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 +pip install "numpy<2.0" +pip install faster-whisper pyannote.audio==3.1.1 ollama torchaudio sounddevice pydub \ No newline at end of file diff --git a/run_transcribe.sh b/run_transcribe.sh new file mode 100755 index 0000000..36f4e52 --- /dev/null +++ b/run_transcribe.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# Run transcription with 
proper cuDNN library paths + +cd "$(dirname "$0")" +source .venv/bin/activate + +# Set cuDNN library path +CUDNN_PATH=".venv/lib/python3.13/site-packages/nvidia/cudnn/lib" +CUBLAS_PATH=".venv/lib/python3.13/site-packages/nvidia/cublas/lib" + +export LD_LIBRARY_PATH="${CUDNN_PATH}:${CUBLAS_PATH}:${LD_LIBRARY_PATH}" + +# Run the transcription script with all arguments +python3 transcribe_dual_linux.py "$@" diff --git a/sentence_extractor.py b/sentence_extractor.py new file mode 100644 index 0000000..06fcb85 --- /dev/null +++ b/sentence_extractor.py @@ -0,0 +1,260 @@ +""" +Sentence extraction from chunked transcriptions. +Stitches partial chunks together and extracts complete sentences. +""" + +import re +from typing import List, Tuple, Optional +from collections import deque + + +class SentenceExtractor: + """ + Buffers transcription chunks and extracts complete sentences. + Handles sentence boundaries that span across audio chunks. + """ + + def __init__(self, max_buffer_words=200): + """ + Initialize the sentence extractor. + + Args: + max_buffer_words: Maximum words to keep in buffer before forcing extraction + """ + self.buffer = "" + self.max_buffer_words = max_buffer_words + self.completed_sentences = deque() + + # Sentence boundary patterns + self.sentence_end_pattern = re.compile(r'([.!?]+)\s+') + self.sentence_boundaries = re.compile(r'(?<=[.!?])\s+(?=[A-Z])') + + def add_chunk(self, text: str) -> List[str]: + """ + Add a new transcription chunk and extract any complete sentences. 
+ + Args: + text: New transcription text chunk + + Returns: + List of complete sentences extracted + """ + if not text or not text.strip(): + return [] + + # Add to buffer + if self.buffer: + # Smart joining: check if we need a space + if not self.buffer[-1].isspace() and not text[0].isspace(): + self.buffer += " " + self.buffer += text.strip() + + # Extract complete sentences + sentences = self._extract_sentences() + + # Check if buffer is too large + word_count = len(self.buffer.split()) + if word_count > self.max_buffer_words: + # Force extraction of what we have + forced = self._force_extract() + if forced: + sentences.extend(forced) + + return sentences + + def _extract_sentences(self) -> List[str]: + """ + Extract complete sentences from buffer. + Keeps incomplete sentence in buffer. + + Returns: + List of complete sentences + """ + sentences = [] + + # Find sentence boundaries + # Pattern: sentence ending punctuation followed by space and capital letter + # or sentence ending at punctuation before end of buffer + parts = self.sentence_boundaries.split(self.buffer) + + if len(parts) > 1: + # We have complete sentences + # Keep the last part (incomplete sentence) in buffer + sentences = [s.strip() for s in parts[:-1] if s.strip()] + self.buffer = parts[-1].strip() + + return sentences + + def _force_extract(self) -> List[str]: + """ + Force extraction when buffer is too large. + Tries to break at reasonable points. + + Returns: + List of extracted text segments + """ + # Try to find the last sentence-like boundary + last_period = max( + self.buffer.rfind('. '), + self.buffer.rfind('! '), + self.buffer.rfind('? 
class SentenceCleaner:
    """
    Normalizes extracted sentences and filters out fragments and repeats.

    Keeps a set of every sentence already returned so that an exact repeat
    is suppressed until ``reset()`` is called.
    """

    def __init__(self):
        # Cleaned sentences already handed back to the caller.
        self.seen_sentences = set()
        # Stored for callers; not read by any method of this class.
        self.similarity_threshold = 0.85

    def clean(self, sentence: str) -> Optional[str]:
        """
        Normalize one sentence.

        Whitespace runs are collapsed to a single space, whitespace before
        ``.!?,;:`` is removed, the first character is upper-cased and a
        trailing period is added when the text does not already end in
        ``.``, ``!`` or ``?``.

        Args:
            sentence: Raw sentence text

        Returns:
            The normalized sentence, or None when the input is blank, has
            fewer than three words, or was already returned before.
        """
        if not (sentence and sentence.strip()):
            return None

        text = re.sub(r'\s+', ' ', sentence.strip())
        text = re.sub(r'\s+([.!?,;:])', r'\1', text)

        if text:
            head, tail = text[0], text[1:]
            if not head.isupper():
                text = head.upper() + tail
            if text[-1] not in '.!?':
                text = text + '.'

        # Fragments of one or two words are treated as noise.
        too_short = len(text.split()) < 3
        if too_short or text in self.seen_sentences:
            return None

        self.seen_sentences.add(text)
        return text

    def reset(self):
        """Forget every sentence seen so far."""
        self.seen_sentences.clear()
+ ] + + print("=== Sentence Extraction Demo ===\n") + print("Input chunks:") + for i, chunk in enumerate(chunks, 1): + print(f" Chunk {i}: '{chunk}'") + + print("\n" + "="*50) + print("Extracted sentences:\n") + + for i, chunk in enumerate(chunks, 1): + sentences = extractor.add_chunk(chunk) + for sent in sentences: + cleaned = cleaner.clean(sent) + if cleaned: + print(f" [{i}] {cleaned}") + + # Flush remaining buffer + print("\nFlushing buffer...") + final_sentences = extractor.flush() + for sent in final_sentences: + cleaned = cleaner.clean(sent) + if cleaned: + print(f" [final] {cleaned}") + + print("\n" + "="*50) + print("Buffer status:") + print(extractor.get_buffer_status()) + + +if __name__ == "__main__": + demo() diff --git a/transcribe.iml b/transcribe.iml new file mode 100644 index 0000000..50c95d3 --- /dev/null +++ b/transcribe.iml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/transcribe_dual_linux.py b/transcribe_dual_linux.py new file mode 100755 index 0000000..039a16c --- /dev/null +++ b/transcribe_dual_linux.py @@ -0,0 +1,347 @@ +#!/usr/bin/env python3 +""" +Real-time transcription with dual audio capture (microphone + speaker monitor). +Linux/PipeWire optimized with Ollama LLM fact-checking. 
class DualAudioCapture:
    """Capture the microphone and, optionally, a monitor/loopback input together.

    Each stream is opened through ``sounddevice`` as a mono ``int16`` input.
    The stream callbacks push ``(source, samples)`` tuples onto
    ``audio_queue``, where ``source`` is ``'mic'`` or ``'monitor'``.
    """

    def __init__(self, mic_device=None, monitor_device=None, sample_rate=16000, chunk_size=2048):
        """Resolve the devices and start the input stream(s).

        Args:
            mic_device: Name substring of the microphone; ``None`` uses
                ``sd.default.device[0]``.
            monitor_device: Name substring of the monitor input; a falsy
                value disables monitor capture.
            sample_rate: Sample rate passed to both streams.
            chunk_size: Block size passed to both streams.

        Raises:
            RuntimeError: If a named device cannot be found.
        """
        self.sample_rate = sample_rate
        self.chunk_size = chunk_size
        self.audio_queue = queue.Queue()

        # Defined up front so close() is safe even if opening a stream fails.
        self.mic_stream = None
        self.monitor_stream = None

        devices = sd.query_devices()

        # Microphone (default input or specified)
        if mic_device is None:
            self.mic_device = sd.default.device[0]
        else:
            self.mic_device = self._find_device(mic_device, input_required=True)

        # Monitor/Loopback (for speaker output)
        if monitor_device:
            self.monitor_device = self._find_device(monitor_device, input_required=True)
        else:
            self.monitor_device = None

        # Device index 0 is valid, so test against None rather than truthiness.
        has_monitor = self.monitor_device is not None

        print(f"✓ Microphone: {devices[self.mic_device]['name']} (index {self.mic_device})")
        if has_monitor:
            print(f"✓ Monitor: {devices[self.monitor_device]['name']} (index {self.monitor_device})")
        else:
            print("⚠ No monitor device - capturing microphone only")

        try:
            self.mic_stream = self._open_stream(self.mic_device, self._mic_callback)
            if has_monitor:
                self.monitor_stream = self._open_stream(self.monitor_device, self._monitor_callback)

            self.mic_stream.start()
            if self.monitor_stream is not None:
                self.monitor_stream.start()
        except Exception:
            # Do not leak an already-opened stream when the other one fails.
            self.close()
            raise

        print("✓ Audio capture started")

    def _open_stream(self, device, callback):
        """Create (but do not start) a mono int16 input stream for *device*."""
        return sd.InputStream(
            device=device,
            channels=1,
            samplerate=self.sample_rate,
            blocksize=self.chunk_size,
            dtype='int16',
            callback=callback
        )

    def _find_device(self, device_name, input_required=True):
        """Return the index of the first device whose name contains *device_name*.

        The match is case-insensitive. When *input_required* is true, devices
        without input channels are skipped.

        Raises:
            RuntimeError: If no device matches.
        """
        wanted = device_name.lower()
        for i, dev in enumerate(sd.query_devices()):
            if wanted in dev['name'].lower():
                if not input_required or dev['max_input_channels'] > 0:
                    return i
        raise RuntimeError(f"Device '{device_name}' not found")

    def _mic_callback(self, indata, frames, time_info, status):
        """Microphone audio callback: queue a copy of the samples."""
        if status:
            print(f"⚠ Mic status: {status}")
        self.audio_queue.put(('mic', indata.copy()))

    def _monitor_callback(self, indata, frames, time_info, status):
        """Monitor/speaker audio callback: queue a copy of the samples."""
        if status:
            print(f"⚠ Monitor status: {status}")
        self.audio_queue.put(('monitor', indata.copy()))

    def read_chunk(self):
        """Return the next ``(source, samples)`` tuple, or None after 50 ms without data."""
        try:
            return self.audio_queue.get(timeout=0.05)
        except queue.Empty:
            return None

    def close(self):
        """Stop and close whichever streams were opened."""
        for stream in (self.mic_stream, self.monitor_stream):
            if stream is not None:
                stream.stop()
                stream.close()
class LLMFactChecker:
    """Fast fact-checking with Ollama"""

    def __init__(self, model="qwen2.5:3b"):
        """Store the model name and verify that the Ollama service answers.

        Raises:
            RuntimeError: If the ``ollama`` package is missing or
                ``ollama.list()`` fails.
        """
        if not OLLAMA_AVAILABLE:
            raise RuntimeError("Ollama not installed: pip install ollama")

        self.model = model
        try:
            ollama.list()
        except Exception as e:
            # Keep the original failure attached for debugging.
            raise RuntimeError(f"Ollama not running: {e}") from e
        print(f"✓ Ollama connected: {self.model}")

    @staticmethod
    def _parse_response(response_text):
        """Turn the model's VERDICT/CONFIDENCE/REASON reply into a result dict.

        Missing fields fall back to ``'unknown'``, ``0.5`` and the first 150
        characters of the reply. Confidence is clamped to ``[0.0, 1.0]``.
        """
        import re

        verdict = re.search(r'VERDICT:\s*(\w+)', response_text, re.I)
        # Match a single number so a trailing sentence period ("0.9.") does
        # not end up inside the captured value and break float().
        confidence = re.search(r'CONFIDENCE:\s*(\d*\.?\d+)', response_text, re.I)
        reason = re.search(r'REASON:\s*(.+?)(?:\n|$)', response_text, re.I | re.DOTALL)

        score = 0.5
        if confidence:
            try:
                score = max(0.0, min(1.0, float(confidence.group(1))))
            except ValueError:
                score = 0.5

        return {
            'verdict': verdict.group(1).lower() if verdict else 'unknown',
            'confidence': score,
            'reason': reason.group(1).strip() if reason else response_text[:150]
        }

    def fact_check(self, text):
        """Ask the model to fact-check *text*.

        Returns:
            Dict with ``verdict``, ``confidence`` and ``reason``. Any failure
            while calling the model or reading its reply yields
            ``verdict == 'error'`` with the exception text as the reason.
        """
        prompt = f"""Fact-check this statement. Reply ONLY with:
VERDICT: factual/dubious/false
CONFIDENCE: 0.0-1.0
REASON: one sentence

Statement: "{text}" """

        try:
            response = ollama.generate(
                model=self.model,
                prompt=prompt,
                options={"temperature": 0.1, "num_predict": 80}
            )
            return self._parse_response(response['response'])
        except Exception as e:
            return {'verdict': 'error', 'confidence': 0.0, 'reason': str(e)}
dev['max_output_channels'] + if in_ch > 0: + print(f" [{i:2d}] {dev['name']:<50} IN:{in_ch} OUT:{out_ch}") + return + + print("=== Dual Audio Transcription with Fact-Checking ===") + print(f"Model: {args.model} | Language: {args.language} | Interval: {args.interval}s") + + # Initialize capture + try: + capturer = DualAudioCapture( + mic_device=args.mic, + monitor_device=args.monitor, + sample_rate=16000, + chunk_size=2048 + ) + except Exception as e: + print(f"\n❌ Audio Error: {e}") + print("\nTip: Use --list-devices to see available devices") + print(" Use --mic and --monitor to specify devices") + return + + # Initialize transcriber + try: + transcriber = WhisperTranscriber( + model_name=args.model, + language=args.language, + force_cpu=args.force_cpu + ) + except Exception as e: + print(f"\n❌ Whisper Error: {e}") + return + + # Initialize fact checker + fact_checker = None + if args.enable_llm: + try: + fact_checker = LLMFactChecker(model=args.llm_model) + except Exception as e: + print(f"\n⚠ LLM Error: {e}") + print("Continuing without fact-checking...") + + # Main loop + print(f"\nβœ… Started. 
Press Ctrl+C to stop.\n{'='*60}") + last_process = time.time() + + try: + while True: + # Collect audio + chunk = capturer.read_chunk() + if chunk: + source, audio = chunk + transcriber.add_audio(source, audio) + + # Process at intervals + if time.time() - last_process >= args.interval: + results = transcriber.transcribe_chunk(min_duration=args.min_duration) + + if results: + timestamp = datetime.now().strftime("%H:%M:%S") + + for source, text in results.items(): + if text: + source_emoji = "🎀" if source == 'mic' else "πŸ”Š" + print(f"\n{source_emoji} [{timestamp}] {text}") + + if fact_checker: + fc = fact_checker.fact_check(text) + verdict_emoji = {'factual': 'βœ…', 'dubious': '⚠️', 'false': '❌'}.get(fc['verdict'], '❓') + print(f" {verdict_emoji} {fc['verdict'].upper()} ({fc['confidence']:.2f}): {fc['reason']}") + + last_process = time.time() + + except KeyboardInterrupt: + print(f"\n{'='*60}\nπŸ›‘ Stopping...") + + capturer.close() + print("\nβœ… Done!") + + +if __name__ == "__main__": + main() diff --git a/transcribe_speakers.py b/transcribe_speakers.py new file mode 100755 index 0000000..1157130 --- /dev/null +++ b/transcribe_speakers.py @@ -0,0 +1,792 @@ +#!/usr/bin/env python3 +""" +Real-time transcription of Windows speaker output using loopback capture. +Captures system audio and transcribes with Whisper in near real-time. 
+""" + +import sounddevice as sd +import numpy as np +import threading +import queue +import time +import os +import argparse +import json +from datetime import datetime +from concurrent.futures import ThreadPoolExecutor, as_completed + +# Whisper transcription (using faster-whisper for optimal performance) +from faster_whisper import WhisperModel + +# Sentence extraction for stitching chunks +from sentence_extractor import SentenceExtractor, SentenceCleaner + +# LLM integration (optional) +try: + import ollama + OLLAMA_AVAILABLE = True +except ImportError: + OLLAMA_AVAILABLE = False + + +class WindowsLoopbackAudioCapture: + """Capture Windows speaker output using WASAPI loopback""" + + def __init__(self, device_name=None, sample_rate=16000, chunk_size=2048): + self.sample_rate = sample_rate + self.chunk_size = chunk_size + + # Find loopback device + self.device_info = self._find_loopback_device(device_name) + if not self.device_info: + raise RuntimeError( + "No loopback device found.\n" + "1. Ensure your speakers/headphones are connected\n" + "2. Enable 'Stereo Mix' in Sound settings\n" + "3. 
def _find_loopback_device(self, device_name):
    """Find the capture device to use for speaker loopback.

    Args:
        device_name: Optional name substring (case-insensitive). When no
            input-capable device matches it, auto-detection is used instead.

    Returns:
        A device dict that always carries an ``'index'`` key, or None when
        nothing suitable exists.
    """
    return self._select_loopback_device(sd.query_devices(), device_name)

def _select_loopback_device(self, devices, device_name):
    """Pick a loopback-capable entry from *devices* (pure selection logic)."""

    def with_index(position, dev):
        # The caller reads device_info['index']; guarantee the key exists
        # even if the device dict did not provide one.
        info = dict(dev)
        info.setdefault('index', position)
        return info

    # Only devices that can actually be opened as an input are candidates.
    candidates = [
        (i, dev) for i, dev in enumerate(devices)
        if dev['max_input_channels'] > 0
    ]

    # 1. Requested name (partial, case-insensitive match)
    if device_name:
        wanted = device_name.lower()
        for i, dev in candidates:
            if wanted in dev['name'].lower():
                return with_index(i, dev)

    # 2. Auto-detect: speaker/headphone style names
    for i, dev in candidates:
        if any(tag in dev['name'] for tag in ('Speakers', 'Headphones', 'Output')):
            return with_index(i, dev)

    # 3. Fallback: Stereo Mix; it must have input channels too, because the
    #    stream is opened with channels=1.
    for i, dev in candidates:
        if 'Stereo Mix' in dev['name']:
            return with_index(i, dev)

    return None
def add_audio(self, audio_chunk):
    """Append a captured chunk to the pending audio buffer.

    The samples are converted to float32 and divided by 32768.0 before being
    concatenated onto ``self.audio_buffer``; the buffer update happens while
    holding ``self.lock``.
    """
    # Scaling only touches the caller's chunk, so it does not need the lock.
    scaled = audio_chunk.astype(np.float32) / 32768.0
    with self.lock:
        self.audio_buffer = np.concatenate((self.audio_buffer, scaled))
Optimize parameters for speed vs accuracy + if fast_mode: + # Fast mode: lower beam size, no VAD + segments, _ = self.model.transcribe( + audio_to_process, + language=self.language, + beam_size=1, # Greedy decoding (fastest) + best_of=1, + temperature=0.0, + vad_filter=False, + word_timestamps=False + ) + else: + # Balanced mode: moderate beam size with VAD + segments, _ = self.model.transcribe( + audio_to_process, + language=self.language, + beam_size=3, # Reduced from 5 + vad_filter=True, + vad_parameters=dict(min_silence_duration_ms=500), + word_timestamps=False + ) + text = " ".join([segment.text for segment in segments]).strip() + return text if text else None + except Exception as e: + print(f"❌ Transcription error: {e}") + return None + + +class LocalLLMAnalyzer: + """Local LLM for fact-checking and question generation using Ollama""" + + def __init__(self, model="llama3.2", debug=False): + if not OLLAMA_AVAILABLE: + raise RuntimeError( + "Ollama package not installed.\n" + "Install with: pip install ollama" + ) + + self.model = model + self.debug = debug + self._test_connection() + + def _test_connection(self): + """Test connection to Ollama service""" + try: + ollama.list() + print(f"βœ“ Ollama connected using model: {self.model}") + except Exception as e: + raise RuntimeError( + f"Cannot connect to Ollama. 
def _extract_json(self, text):
    """Return the JSON-looking part of *text*.

    Tried in order: an object inside a markdown code fence (optionally tagged
    ``json``), then the span from the first ``{`` to the last ``}``. When
    neither is present the text is returned unchanged.
    """
    import re

    # (pattern, group to return), most specific first.
    candidates = (
        (r'```(?:json)?\s*(\{.*?\})\s*```', 1),
        (r'\{.*\}', 0),
    )
    for pattern, group in candidates:
        found = re.search(pattern, text, re.DOTALL)
        if found is not None:
            return found.group(group)

    return text
def generate_augmenting_questions(self, text, context=""):
    """Generate three follow-up questions about *text*.

    Args:
        text: Statement to ask questions about.
        context: Accepted for interface compatibility; not used in the prompt.

    Returns:
        Dict with ``"questions"`` (always exactly three strings) and
        ``"topics"`` (always an empty list). If the model call fails, three
        generic questions are returned.
    """
    prompt = f"""Generate 3 questions about this. Reply in this exact format:

Q1: [question]
Q2: [question]
Q3: [question]

Statement: "{text}"
"""

    try:
        response = ollama.generate(
            model=self.model,
            prompt=prompt,
            options={"temperature": 0.7, "num_predict": 250}
        )

        response_text = response['response'].strip()

        if self.debug:
            print(f"\n[DEBUG] Questions response:\n{response_text}\n")

        return {
            "questions": self._parse_questions(response_text),
            "topics": []
        }

    except Exception as e:
        if self.debug:
            print(f"[DEBUG] Questions error: {e}")
        return {
            "questions": [
                "What are the key points?",
                "What supports this claim?",
                "What are the implications?"
            ],
            "topics": []
        }

def _parse_questions(self, response_text):
    """Extract exactly three questions from the model's reply text."""
    import re

    questions = []
    parsed_numbers = set()

    # Primary path: the Q1:/Q2:/Q3: format requested in the prompt.
    for i in range(1, 4):
        q_match = re.search(rf'Q{i}:\s*(.+?)(?:\n|$)', response_text, re.IGNORECASE)
        if not q_match:
            continue
        question = q_match.group(1).strip()
        if not question:
            continue
        if not question.endswith('?'):
            # Cut back to the last complete question, else just add '?'.
            last_q = question.rfind('?')
            question = question[:last_q + 1] if last_q > 10 else question + "?"
        questions.append(question)
        parsed_numbers.add(i)

    # Fallback: use other non-empty lines, but never re-add a Qn line that
    # was already parsed above (that produced duplicated questions).
    if len(questions) < 3:
        for raw_line in response_text.split('\n'):
            if len(questions) >= 3:
                break
            line = raw_line.strip()
            if not line:
                continue
            marker = re.match(r'^Q(\d+):?\s*(.*)$', line, re.IGNORECASE)
            if marker:
                if int(marker.group(1)) in parsed_numbers:
                    continue
                line = marker.group(2).strip()
                if not line:
                    # Bare marker such as "Q2:" with no text.
                    continue
            if not line.endswith('?'):
                line = line + "?"
            if line not in questions:
                questions.append(line)

    # Pad with generic questions so callers always get three.
    default_questions = [
        "What are the key points here?",
        "What evidence supports this?",
        "What are the implications?"
    ]
    while len(questions) < 3:
        questions.append(default_questions[len(questions)])

    return questions[:3]
f.write(f"πŸ“Š Fact Check: {fc.get('verdict', 'N/A').upper()} " + f"(confidence: {fc.get('confidence', 0):.2f})\n") + f.write(f"πŸ’‘ {fc.get('explanation', 'N/A')}\n") + if fc.get('corrections'): + f.write(f"✏️ Correction: {fc['corrections']}\n") + f.write("\n") + + if 'questions' in data and data['questions'].get('questions'): + f.write("❓ Questions:\n") + for i, q in enumerate(data['questions']['questions'], 1): + f.write(f"{i}. {q}\n") + f.write("\n") + + +def display_enriched_output(text, timestamp, fact_check=None, questions=None): + """Display transcript with LLM analysis""" + print(f"\n[{timestamp}] {text}") + + if fact_check: + verdict_emoji = { + 'factual': 'βœ…', + 'dubious': '⚠️', + 'not_factual': '❌', + 'error': '⚠️' + } + emoji = verdict_emoji.get(fact_check.get('verdict', 'error'), '❓') + + print(f"\n{emoji} Fact Check: {fact_check.get('verdict', 'N/A').upper()} " + f"(confidence: {fact_check.get('confidence', 0):.2f})") + print(f"πŸ’‘ {fact_check.get('explanation', 'N/A')}") + + if fact_check.get('corrections'): + print(f"✏️ Correction: {fact_check['corrections']}") + + if questions and questions.get('questions'): + print(f"\n❓ Questions:") + for i, q in enumerate(questions['questions'], 1): + print(f" {i}. 
{q}") + + +def main(): + parser = argparse.ArgumentParser( + description="Real-time transcription of Windows speaker output", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python transcribe_speakers.py + python transcribe_speakers.py --model small --language es --interval 5 + python transcribe_speakers.py --device "Speakers" --output "meeting.txt" + python transcribe_speakers.py --model medium --interval 10 --output transcripts/live.txt + """ + ) + + parser.add_argument("--model", default="base", + choices=["tiny", "base", "small", "medium", "large"], + help="Whisper model size (default: base)") + parser.add_argument("--language", default="en", + help="Language code (default: en)") + parser.add_argument("--device", metavar="NAME", + help="Audio device name (partial match). If not specified, auto-detects") + parser.add_argument("--interval", type=float, default=8.0, + help="Processing interval in seconds (default: 8.0)") + parser.add_argument("--min-duration", type=float, default=3.0, + help="Minimum audio duration before transcription (default: 3.0)") + parser.add_argument("--fast-mode", action="store_true", + help="Enable fast mode (lower accuracy, faster transcription)") + parser.add_argument("--output", "-o", metavar="FILE", + help="Save transcript to file (e.g., transcript.txt)") + parser.add_argument("--list-devices", action="store_true", + help="List all available audio devices and exit") + parser.add_argument("--force-cpu", action="store_true", + help="Force CPU processing (disable GPU acceleration)") + parser.add_argument("--gpu-index", type=int, default=0, + help="GPU device index to use (default: 0)") + parser.add_argument("--enable-llm", action="store_true", + help="Enable LLM analysis (fact-checking and questions)") + parser.add_argument("--llm-model", default="gpt-oss:20b", + help="Ollama model to use for LLM analysis (default: gpt-oss:20b)") + parser.add_argument("--llm-debug", action="store_true", + help="Show LLM 
raw responses for debugging") + parser.add_argument("--sentence-mode", action="store_true", + help="Extract complete sentences by stitching chunks together") + + args = parser.parse_args() + + if args.list_devices: + list_audio_devices() + return + + print("=== Windows Real-Time Audio Transcription ===") + print(f"Model: {args.model} | Language: {args.language} | Interval: {args.interval}s") + if args.output: + print(f"Output: {args.output}") + if args.enable_llm: + print(f"LLM Analysis: Enabled ({args.llm_model})") + if args.sentence_mode: + print(f"Sentence Mode: Enabled (stitching chunks into complete sentences)") + + # Initialize audio capture + try: + capturer = WindowsLoopbackAudioCapture( + device_name=args.device, + sample_rate=16000, + chunk_size=2048 + ) + except RuntimeError as e: + print(f"\n❌ Audio Error: {e}") + print("\nTo fix this:") + print("1. Right-click speaker icon β†’ Sounds β†’ Recording tab") + print("2. Right-click in empty area β†’ Show Disabled Devices") + print("3. 
Enable 'Stereo Mix' β†’ Set as Default Device") + print("\nAlternative: Install VB-Cable (free) from vb-audio.com") + print(" Then use: --device 'CABLE Output'") + list_audio_devices() + return + + # Initialize transcriber + try: + transcriber = WhisperStreamTranscriber( + model_name=args.model, + language=args.language, + force_cpu=args.force_cpu, + device_index=args.gpu_index + ) + except Exception as e: + print(f"\n❌ Model Error: {e}") + print("Make sure you installed Whisper correctly") + return + + # Initialize LLM analyzer (optional) + llm_analyzer = None + if args.enable_llm: + try: + llm_analyzer = LocalLLMAnalyzer(model=args.llm_model, debug=args.llm_debug) + except RuntimeError as e: + print(f"\n❌ LLM Error: {e}") + print("Continuing without LLM analysis...") + llm_analyzer = None + + # Initialize sentence extractor (optional) + sentence_extractor = None + sentence_cleaner = None + if args.sentence_mode: + sentence_extractor = SentenceExtractor(max_buffer_words=150) + sentence_cleaner = SentenceCleaner() + print("βœ“ Sentence extraction initialized") + + # Main processing loop + print(f"\nβœ… Started transcription. 
Press Ctrl+C to stop.\n{'=' * 50}") + last_process_time = time.time() + total_duration = 0 + segment_count = 0 + + # Thread pool for concurrent LLM processing + llm_executor = ThreadPoolExecutor(max_workers=2) if llm_analyzer else None + pending_llm_tasks = {} # Maps segment_count -> future + + try: + while True: + # Collect audio + chunk = capturer.read_chunk() + if chunk is not None: + transcriber.add_audio(chunk) + total_duration += len(chunk) / 16000 + + # Process at intervals + current_time = time.time() + if current_time - last_process_time >= args.interval: + text = transcriber.transcribe_chunk( + min_duration=args.min_duration, + fast_mode=args.fast_mode + ) + if text: + segment_count += 1 + timestamp = datetime.now().strftime("%H:%M:%S") + + # Sentence extraction mode + if sentence_extractor: + # Add chunk to extractor and get complete sentences + sentences = sentence_extractor.add_chunk(text) + + for sentence in sentences: + # Clean the sentence + cleaned = sentence_cleaner.clean(sentence) if sentence_cleaner else sentence + if cleaned: + print(f"[{timestamp}] πŸ“ {cleaned}") + + # Save individual sentences + if args.output and not llm_analyzer: + save_transcript(cleaned, timestamp, args.output) + + # LLM analysis on complete sentences + if llm_analyzer: + context = f"Sentence from segment {segment_count}" + + def run_llm_analysis(txt, ctx, ts, seg_num): + fc = llm_analyzer.fact_check(txt, ctx) + qs = llm_analyzer.generate_augmenting_questions(txt, ctx) + return { + 'timestamp': ts, + 'text': txt, + 'segment_count': seg_num, + 'fact_check': fc, + 'questions': qs + } + + future = llm_executor.submit(run_llm_analysis, cleaned, context, timestamp, segment_count) + pending_llm_tasks[segment_count] = future + else: + # Standard mode: display chunks as-is + # Display transcription immediately (don't wait for LLM) + print(f"[{timestamp}] {text}") + + # LLM Analysis (run concurrently in background) - only in non-sentence mode + if llm_analyzer and not 
sentence_extractor: + context = f"Segment {segment_count}" + + # Submit LLM tasks to thread pool + def run_llm_analysis(txt, ctx, ts, seg_num): + fc = llm_analyzer.fact_check(txt, ctx) + qs = llm_analyzer.generate_augmenting_questions(txt, ctx) + return { + 'timestamp': ts, + 'text': txt, + 'segment_count': seg_num, + 'fact_check': fc, + 'questions': qs + } + + future = llm_executor.submit(run_llm_analysis, text, context, timestamp, segment_count) + pending_llm_tasks[segment_count] = future + else: + # Save transcript immediately without LLM + if args.output: + save_transcript(text, timestamp, args.output) + + last_process_time = current_time + + # Check for completed LLM tasks (non-blocking) + if llm_analyzer: + completed_segments = [] + for seg_num, future in pending_llm_tasks.items(): + if future.done(): + try: + result = future.result() + # Display enriched output + display_enriched_output( + result['text'], + result['timestamp'], + result['fact_check'], + result['questions'] + ) + # Save enriched output + if args.output: + save_enriched_transcript(result, args.output) + completed_segments.append(seg_num) + except Exception as e: + print(f"⚠️ LLM processing error for segment {seg_num}: {e}") + completed_segments.append(seg_num) + + # Remove completed tasks + for seg_num in completed_segments: + del pending_llm_tasks[seg_num] + + except KeyboardInterrupt: + print(f"\n{'=' * 50}\nπŸ›‘ Stopping transcription...") + + # Wait for pending LLM tasks to complete + if llm_analyzer and pending_llm_tasks: + print(f"\n⏳ Waiting for {len(pending_llm_tasks)} pending LLM tasks to complete...") + for seg_num, future in pending_llm_tasks.items(): + try: + result = future.result(timeout=30) + display_enriched_output( + result['text'], + result['timestamp'], + result['fact_check'], + result['questions'] + ) + if args.output: + save_enriched_transcript(result, args.output) + except Exception as e: + print(f"⚠️ LLM task {seg_num} failed: {e}") + + # Shutdown executor + if 
llm_executor: + llm_executor.shutdown(wait=True) + + # Cleanup + capturer.close() + + # Flush sentence buffer if in sentence mode + if sentence_extractor: + print("\nπŸ“ Flushing sentence buffer...") + final_sentences = sentence_extractor.flush() + for sentence in final_sentences: + cleaned = sentence_cleaner.clean(sentence) if sentence_cleaner else sentence + if cleaned: + timestamp = datetime.now().strftime("%H:%M:%S") + print(f"[{timestamp}] πŸ“ {cleaned}") + + if args.output and not llm_analyzer: + save_transcript(cleaned, timestamp, args.output) + + # LLM analysis for flushed sentences + if llm_analyzer: + fact_check = llm_analyzer.fact_check(cleaned, "Final sentence") + questions = llm_analyzer.generate_augmenting_questions(cleaned) + display_enriched_output(cleaned, timestamp, fact_check, questions) + + if args.output: + data = { + 'timestamp': timestamp, + 'text': cleaned, + 'fact_check': fact_check, + 'questions': questions + } + save_enriched_transcript(data, args.output) + + # Process remaining audio + print("\nProcessing remaining audio...") + final_text = transcriber.transcribe_chunk(min_duration=0) + if final_text: + timestamp = datetime.now().strftime("%H:%M:%S") + print(f"[{timestamp}] {final_text}") + + # LLM Analysis for final segment (synchronous since we're shutting down) + if llm_analyzer: + fact_check = llm_analyzer.fact_check(final_text, "Final segment") + questions = llm_analyzer.generate_augmenting_questions(final_text) + + display_enriched_output(final_text, timestamp, fact_check, questions) + + if args.output: + data = { + 'timestamp': timestamp, + 'text': final_text, + 'fact_check': fact_check, + 'questions': questions + } + save_enriched_transcript(data, args.output) + else: + if args.output: + save_transcript(final_text, timestamp, args.output) + + # Summary + print(f"\nβœ… Complete! 
Processed {total_duration:.1f}s of audio") + print(f" Generated {segment_count} transcript segments") + if args.output and os.path.exists(args.output): + abs_path = os.path.abspath(args.output) + print(f"πŸ’Ύ Transcript saved to: {abs_path}") + + +if __name__ == "__main__": + main() \ No newline at end of file