Whisper dictation: F2 to transcribe, F3 to transcribe + cleanup with Ollama (Qwen 2.5). ~160 lines, GPU-accelerated.
#!/usr/bin/env python3
"""Whisper dictation: F2 to transcribe, F3 to transcribe + cleanup with Ollama."""
import json
import re
import subprocess
import threading
import time
import urllib.request

import numpy as np
import sounddevice as sd
import whisper
from pynput import keyboard

MODEL = "turbo"
SAMPLE_RATE = 16000
MIN_DURATION = 0.5  # Ignore recordings shorter than this (seconds)
PROMPT = None  # Optional context for transcription

# Ollama configuration for text cleanup (F3)
OLLAMA_HOST = "http://localhost:11434"
OLLAMA_MODEL = "qwen2.5:3b"
CLEANUP_PROMPT = """Clean up the following transcribed speech:
- Add proper punctuation and capitalization
- Remove spoken disfluencies (um, uh, er, like, you know, repeated words, false starts)
- Fix minor grammar issues
- Keep the original language, do NOT translate
- Preserve the original wording, meaning, and tone; only make minimal edits for readability
Output ONLY the cleaned text, nothing else.
Text: {text}
Cleaned:"""

# Post-processing replacements (case-insensitive)
REPLACEMENTS = {
    "claude.md": "CLAUDE.md",
}


def postprocess(text):
    for wrong, right in REPLACEMENTS.items():
        text = re.sub(re.escape(wrong), right, text, flags=re.IGNORECASE)
    return text


def ensure_ollama_running():
    """Start Ollama server if not already running."""
    try:
        req = urllib.request.Request(f"{OLLAMA_HOST}/api/tags")
        urllib.request.urlopen(req, timeout=2)
        return True
    except Exception:
        print("Starting Ollama server...", end="", flush=True)
        subprocess.Popen(
            ["ollama", "serve"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        for _ in range(30):
            time.sleep(0.1)
            try:
                urllib.request.urlopen(req, timeout=1)
                print(" ready.")
                return True
            except Exception:
                pass
        print(" failed to start.")
        return False


def cleanup_text(text):
    """Use Ollama + Qwen to fix punctuation and grammar."""
    try:
        payload = json.dumps({
            "model": OLLAMA_MODEL,
            "prompt": CLEANUP_PROMPT.format(text=text),
            "stream": False,
        }).encode()
        req = urllib.request.Request(
            f"{OLLAMA_HOST}/api/generate",
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(req, timeout=30) as resp:
            result = json.loads(resp.read().decode())
        cleaned = result.get("response", "").strip()
        if cleaned and len(cleaned) < len(text) * 3:
            return cleaned
        print(" (cleanup produced invalid output, using raw)", end="")
        return text
    except Exception as e:
        print(f" (cleanup failed: {e}, using raw)", end="")
        return text


print(f"Loading {MODEL} model...")
model = whisper.load_model(MODEL)
ensure_ollama_running()
print("Ready. F2=transcribe, F3=transcribe+cleanup")

recording = False
audio_chunks = []
pending_cleanup = False
lock = threading.Lock()


def on_press(key):
    global recording, audio_chunks, pending_cleanup
    if key in (keyboard.Key.f2, keyboard.Key.f3) and not recording:
        with lock:
            recording = True
            audio_chunks = []
            pending_cleanup = (key == keyboard.Key.f3)
        mode = "Recording+cleanup" if pending_cleanup else "Recording"
        print(f"{mode}...", end="", flush=True)


def on_release(key):
    global recording, audio_chunks, pending_cleanup
    if key in (keyboard.Key.f2, keyboard.Key.f3) and recording:
        with lock:
            recording = False
            chunks = audio_chunks.copy()
            audio_chunks = []
            should_cleanup = pending_cleanup
            pending_cleanup = False
        if chunks:
            threading.Thread(target=transcribe_and_paste, args=(chunks, should_cleanup), daemon=True).start()
        else:
            print(" (no audio)")


def audio_callback(indata, frames, time, status):
    if recording:
        with lock:
            if recording:
                audio_chunks.append(indata.copy())


def transcribe_and_paste(chunks, cleanup=False):
    print(" transcribing...", end="", flush=True)
    audio = np.concatenate(chunks).flatten().astype(np.float32)
    duration = len(audio) / SAMPLE_RATE
    if duration < MIN_DURATION:
        print(f" (too short: {duration:.1f}s)")
        return
    result = model.transcribe(audio, fp16=True, task="transcribe", initial_prompt=PROMPT)
    text = result["text"].strip()
    if cleanup and text:
        print(" cleaning...", end="", flush=True)
        text = cleanup_text(text)
    text = postprocess(text)
    if text:
        subprocess.run(["xdotool", "type", "--clearmodifiers", "--", text], check=True)
        print(f" [{text}]")
    else:
        print(" (empty)")


stream = sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype=np.float32, callback=audio_callback)
stream.start()

with keyboard.Listener(on_press=on_press, on_release=on_release) as listener:
    listener.join()
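
Rough setup sketch (not part of the gist itself): this assumes a Linux/X11 desktop where xdotool can type into the focused window, the Ollama CLI on PATH, and the PyPI package names openai-whisper, sounddevice, pynput, and numpy; the filename dictate.py is a placeholder for wherever you save the script.

pip install openai-whisper sounddevice pynput numpy
sudo apt install xdotool        # or your distro's equivalent
ollama pull qwen2.5:3b          # model used for F3 cleanup
python3 dictate.py              # then hold F2 (raw) or F3 (cleaned) to dictate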