Whisper dictation: F2 to transcribe, F3 to transcribe + cleanup with Ollama (Qwen 2.5). ~160 lines, GPU-accelerated.
#!/usr/bin/env python3
"""Whisper dictation: F2 to transcribe, F3 to transcribe + cleanup with Ollama."""
import json
import re
import subprocess
import threading
import time
import urllib.request

import numpy as np
import sounddevice as sd
import whisper
from pynput import keyboard

MODEL = "turbo"
SAMPLE_RATE = 16000
MIN_DURATION = 0.5  # Ignore recordings shorter than this (seconds)
PROMPT = None  # Optional context for transcription

# Ollama configuration for text cleanup (F3)
OLLAMA_HOST = "http://localhost:11434"
OLLAMA_MODEL = "qwen2.5:3b"
CLEANUP_PROMPT = """Clean up the following transcribed speech:
- Add proper punctuation and capitalization
- Remove spoken disfluencies (um, uh, er, like, you know, repeated words, false starts)
- Fix minor grammar issues
- Keep the original language, do NOT translate
- Preserve the original wording, meaning, and tone; only make minimal edits for readability
Output ONLY the cleaned text, nothing else.
Text: {text}
Cleaned:"""

# Post-processing replacements (case-insensitive)
REPLACEMENTS = {
    "claude.md": "CLAUDE.md",
}


def postprocess(text):
    for wrong, right in REPLACEMENTS.items():
        text = re.sub(re.escape(wrong), right, text, flags=re.IGNORECASE)
    return text


def ensure_ollama_running():
    """Start Ollama server if not already running."""
    try:
        req = urllib.request.Request(f"{OLLAMA_HOST}/api/tags")
        urllib.request.urlopen(req, timeout=2)
        return True
    except Exception:
        print("Starting Ollama server...", end="", flush=True)
        subprocess.Popen(
            ["ollama", "serve"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        for _ in range(30):
            time.sleep(0.1)
            try:
                urllib.request.urlopen(req, timeout=1)
                print(" ready.")
                return True
            except Exception:
                pass
        print(" failed to start.")
        return False


def cleanup_text(text):
    """Use Ollama + Qwen to fix punctuation and grammar."""
    try:
        payload = json.dumps({
            "model": OLLAMA_MODEL,
            "prompt": CLEANUP_PROMPT.format(text=text),
            "stream": False,
        }).encode()
        req = urllib.request.Request(
            f"{OLLAMA_HOST}/api/generate",
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(req, timeout=30) as resp:
            result = json.loads(resp.read().decode())
        cleaned = result.get("response", "").strip()
        if cleaned and len(cleaned) < len(text) * 3:
            return cleaned
        print(" (cleanup produced invalid output, using raw)", end="")
        return text
    except Exception as e:
        print(f" (cleanup failed: {e}, using raw)", end="")
        return text


print(f"Loading {MODEL} model...")
model = whisper.load_model(MODEL)
ensure_ollama_running()
print("Ready. F2=transcribe, F3=transcribe+cleanup")

recording = False
audio_chunks = []
pending_cleanup = False
lock = threading.Lock()


def on_press(key):
    global recording, audio_chunks, pending_cleanup
    if key in (keyboard.Key.f2, keyboard.Key.f3) and not recording:
        with lock:
            recording = True
            audio_chunks = []
            pending_cleanup = (key == keyboard.Key.f3)
        mode = "Recording+cleanup" if pending_cleanup else "Recording"
        print(f"{mode}...", end="", flush=True)


def on_release(key):
    global recording, audio_chunks, pending_cleanup
    if key in (keyboard.Key.f2, keyboard.Key.f3) and recording:
        with lock:
            recording = False
            chunks = audio_chunks.copy()
            audio_chunks = []
            should_cleanup = pending_cleanup
            pending_cleanup = False
        if chunks:
            threading.Thread(target=transcribe_and_paste, args=(chunks, should_cleanup), daemon=True).start()
        else:
            print(" (no audio)")


def audio_callback(indata, frames, time, status):
    if recording:
        with lock:
            if recording:
                audio_chunks.append(indata.copy())


def transcribe_and_paste(chunks, cleanup=False):
    print(" transcribing...", end="", flush=True)
    audio = np.concatenate(chunks).flatten().astype(np.float32)
    duration = len(audio) / SAMPLE_RATE
    if duration < MIN_DURATION:
        print(f" (too short: {duration:.1f}s)")
        return
    result = model.transcribe(audio, fp16=True, task="transcribe", initial_prompt=PROMPT)
    text = result["text"].strip()
    if cleanup and text:
        print(" cleaning...", end="", flush=True)
        text = cleanup_text(text)
    text = postprocess(text)
    if text:
        subprocess.run(["xdotool", "type", "--clearmodifiers", "--", text], check=True)
        print(f" [{text}]")
    else:
        print(" (empty)")


stream = sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype=np.float32, callback=audio_callback)
stream.start()

with keyboard.Listener(on_press=on_press, on_release=on_release) as listener:
    listener.join()
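
Rough setup sketch (not part of the gist itself): this assumes a Linux/X11 desktop where xdotool can type into the focused window, the Ollama CLI on PATH, and the PyPI package names openai-whisper, sounddevice, pynput, and numpy; the filename dictate.py is a placeholder for wherever you save the script.

pip install openai-whisper sounddevice pynput numpy
sudo apt install xdotool        # or your distro's equivalent
ollama pull qwen2.5:3b          # model used for F3 cleanup
python3 dictate.py              # then hold F2 (raw) or F3 (cleaned) to dictate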