InTEGr8or/GlobalTTS.ahk

## README.md

      
    Raw
  

              README.md
            
          
    Global Windows AI TTS Reader

Select any text in Windows and have it read aloud by a modern AI voice. This setup uses AutoHotkey for shortcuts and uv/Python for high-quality, low-latency streaming TTS.
🚀 Key Features


Modern AI Voices: Uses Microsoft Edge's Neural TTS (en-US-AndrewNeural).
Low-Latency Streaming: Audio starts playing while it is still being downloaded, so there is no wait for the full file.
Local Caching: Text that has been read before plays instantly from the local audio_cache folder.
Atomic Dependencies: Uses uv script metadata to handle all Python requirements automatically.
Intelligent Logging: debug.log is cleared automatically once it exceeds 1 MB, to prevent bloat.

⌨️ Shortcuts


AppsKey (Menu Key): Read selected text aloud.
Ctrl + AppsKey: Force Read. Sends 'y' followed by Ctrl+C before reading (useful for Neovim and terminals).
Shift + AppsKey: Stop reading immediately (kills the player process).

🛠️ Requirements


AutoHotkey v2: scoop install autohotkey
Python 3.12+: scoop install python
uv: scoop install uv
ffmpeg: scoop install ffmpeg (Required for ffplay)

📦 Setup Instructions


Save tts_reader_edge.py and GlobalTTS.ahk to your preferred script folder.
Update the uvPath and scriptPath in GlobalTTS.ahk to match your local file system.
Add GlobalTTS.ahk to your Windows Startup folder (shell:startup) for global availability.

🎙️ Voice Customization

To change the voice, edit tts_reader_edge.py and change the VOICE variable. Popular options:

en-US-AndrewNeural (Male, Very Natural)
en-US-AvaNeural (Female, Very Natural)
en-GB-SoniaNeural (Female, British)


## GlobalTTS.ahk
#Requires AutoHotkey v2.0

; Global TTS Reader
; Uses explicit hotkey definitions to ensure native functions are suppressed.

; Hotkey table: each AppsKey combination maps straight onto one handler.

; AppsKey alone: read the current selection aloud.
$AppsKey::HandleTTS(false)

; Ctrl + AppsKey: force read (sends "y" before copying).
$^AppsKey::HandleTTS(true)

; Shift + AppsKey: stop playback.
$+AppsKey::StopTTS()

; Copies the current selection, writes it to a temp file and launches the
; Python TTS reader on that file.
;   force - true:  send "y" and then Ctrl+C (the Force Read hotkey).
;           false: send Ctrl+C only.
; The previous clipboard contents are restored whether or not the copy worked,
; and the function always waits for AppsKey to be released before returning.
HandleTTS(force)
{
    try {
        oldClipboard := ClipboardAll()
        A_Clipboard := ""    ; emptied so ClipWait can detect the fresh copy

        if (force) {
            Send("y")
            Sleep(50)
            Send("^c")
        } else {
            SendEvent("^c")
        }

        copied := ClipWait(1)
        selectedText := copied ? A_Clipboard : ""
        A_Clipboard := oldClipboard
        if !copied
            return

        tempFile := A_Temp . "\tts_input.txt"
        try {
            if FileExist(tempFile)
                FileDelete(tempFile)
            ; UTF-8-RAW writes no byte order mark, so the reader does not
            ; receive a stray U+FEFF as the first character of the text.
            FileAppend(selectedText, tempFile, "UTF-8-RAW")
        } catch Error as e {
            return
        }

        ToolTip("Reading AI voice...")
        SetTimer () => ToolTip(), -2000

        uvPath := "C:\Users\xgenx\.local\bin\uv.exe"
        ; Backslash is not an escape character in AutoHotkey, so the UNC path
        ; is written with single separators after the leading "\\".
        scriptPath := "\\wsl$\Ubuntu\home\mstouffer\.config\windows-tts\tts_reader_edge.py"

        ; Every path is quoted individually so spaces in any of them are safe;
        ; the outermost pair of quotes is the one cmd /c strips.
        try {
            Run(A_ComSpec . ' /c ""' . uvPath . '" run --python 3.12 "' . scriptPath . '" "' . tempFile . '""', , "Hide")
        }
    } finally {
        ; Wait for key release to prevent menu "leakage" - also on the
        ; early-return paths above.
        KeyWait("AppsKey")
    }
}

; Stops playback by force-killing every ffplay.exe process, shows a short
; confirmation tooltip, then waits for AppsKey to be released.
StopTTS()
{
    killCommand := A_ComSpec . " /c taskkill /F /IM ffplay.exe"
    try {
        Run(killCommand, , "Hide")
        ToolTip("Stopped TTS")
        SetTimer(() => ToolTip(), -1000)    ; clear the tooltip after one second
    }
    KeyWait("AppsKey")
}

## tts_reader_edge.py
# /// script
# dependencies = [
#   "edge-tts",
# ]
# ///

import os
import sys
import asyncio
import hashlib
import subprocess
import edge_tts
from datetime import datetime

# Runtime artefacts (debug log and audio cache) live next to this script.
BASE_DIR = os.path.dirname(__file__)
CACHE_DIR, LOG_FILE = (
    os.path.join(BASE_DIR, leaf) for leaf in ("audio_cache", "debug.log")
)

# Create the cache folder up front so later writes can assume it exists.
os.makedirs(CACHE_DIR, exist_ok=True)

def log(message):
    """Append a timestamped *message* to the debug log.

    No backup copies are kept: once the log has grown past 1 MB it is
    truncated and restarted with a marker line before the message is written.
    """
    size_limit = 1024 * 1024
    oversized = os.path.exists(LOG_FILE) and os.path.getsize(LOG_FILE) > size_limit

    # "w" discards the old contents, "a" keeps them.
    with open(LOG_FILE, "w" if oversized else "a", encoding="utf-8") as handle:
        if oversized:
            handle.write(f"--- Log Rotated at {datetime.now()} ---\n")
        handle.write(f"{datetime.now()}: {message}\n")

async def amain(text) -> None:
    """Read *text* aloud through ffplay, caching the synthesized audio.

    On a cache hit the stored MP3 is played directly. On a miss the audio is
    streamed from Edge TTS into ffplay's stdin while also being written to a
    temporary file; that file is promoted into the cache only when the whole
    stream was received, so an interrupted read never leaves truncated audio
    behind to be replayed later.

    Args:
        text: The text to speak.
    """
    VOICE = "en-US-AndrewNeural"

    # The voice is part of the cache key: otherwise audio cached with one
    # voice would keep playing after VOICE has been changed.
    text_hash = hashlib.md5(f"{VOICE}\n{text}".encode("utf-8")).hexdigest()
    cache_path = os.path.join(CACHE_DIR, f"{text_hash}.mp3")

    # 1. Check if it's already in the cache
    if os.path.exists(cache_path):
        log(f"Cache hit: {text_hash}")
        # ffplay's console output is never used, so discard it (same as the
        # streaming branch) instead of buffering it in memory.
        subprocess.run(
            ["ffplay", "-nodisp", "-autoexit", cache_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        return

    log(f"Cache miss. Streaming TTS for hash: {text_hash}")

    # Per-process temp name so two simultaneous reads of the same text do not
    # write into each other's file.
    partial_path = f"{cache_path}.{os.getpid()}.part"
    player = None
    try:
        communicate = edge_tts.Communicate(text, VOICE)

        # 2. Start ffplay reading from stdin so playback begins while streaming.
        # stdout/stderr go to DEVNULL to avoid potential deadlocks.
        player = subprocess.Popen(
            ["ffplay", "-nodisp", "-autoexit", "-"],
            stdin=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
        )

        interrupted = False
        with open(partial_path, "wb") as cache_file:
            async for chunk in communicate.stream():
                if chunk["type"] != "audio":
                    continue
                data = chunk["data"]
                try:
                    player.stdin.write(data)
                    # Flush so the audio reaches ffplay right away and a
                    # closed pipe is noticed here rather than at close().
                    player.stdin.flush()
                except OSError:
                    # BrokenPipeError on POSIX; on Windows a closed pipe
                    # surfaces as a plain OSError (EINVAL).
                    log("Broken pipe: ffplay likely closed.")
                    interrupted = True
                    break
                cache_file.write(data)

        # 3. Only a fully received stream becomes a cache entry.
        if not interrupted:
            os.replace(partial_path, cache_path)
        log("Streaming loop finished.")

    except Exception as e:
        log(f"Exception in amain: {str(e)}")
    finally:
        if player is not None:
            try:
                player.stdin.close()
            except OSError:
                # The pipe is already gone (player was stopped); nothing to do.
                pass
            player.wait()
        # Whatever is still at the temp path is incomplete: clean it up.
        if os.path.exists(partial_path):
            os.unlink(partial_path)

def read_input_text(input_arg):
    """Return the text to speak for the command-line argument *input_arg*.

    If *input_arg* names an existing file, its contents are returned;
    otherwise the argument itself is treated as the text. The file is decoded
    as "utf-8-sig" so that a leading byte order mark (written by AutoHotkey's
    FileAppend with the "UTF-8" encoding) is dropped instead of becoming part
    of the spoken and hashed text; files without a BOM decode unchanged.

    On a read or decode failure the error is logged and the process exits
    with status 1.
    """
    if not os.path.isfile(input_arg):
        return input_arg

    try:
        with open(input_arg, "r", encoding="utf-8-sig") as f:
            return f.read()
    except (OSError, UnicodeError) as e:
        log(f"Error reading file {input_arg}: {e}")
        sys.exit(1)


if __name__ == "__main__":
    # Expect exactly one payload argument: a file path or the literal text.
    if len(sys.argv) < 2:
        sys.exit(1)

    text_to_read = read_input_text(sys.argv[1])

    # Nothing to say for empty / whitespace-only input.
    if text_to_read.strip():
        asyncio.run(amain(text_to_read))
	#Requires AutoHotkey v2.0

	; Global TTS Reader
	; Uses explicit hotkey definitions to ensure native functions are suppressed.

	; Hotkey table: each AppsKey combination maps straight onto one handler.

	; AppsKey alone: read the current selection aloud.
	$AppsKey::HandleTTS(false)

	; Ctrl + AppsKey: force read (sends "y" before copying).
	$^AppsKey::HandleTTS(true)

	; Shift + AppsKey: stop playback.
	$+AppsKey::StopTTS()

	; Copies the current selection, writes it to a temp file and launches the
	; Python TTS reader on that file.
	;   force - true:  send "y" and then Ctrl+C (the Force Read hotkey).
	;           false: send Ctrl+C only.
	; The previous clipboard contents are restored whether or not the copy worked,
	; and the function always waits for AppsKey to be released before returning.
	HandleTTS(force)
	{
	    try {
	        oldClipboard := ClipboardAll()
	        A_Clipboard := ""    ; emptied so ClipWait can detect the fresh copy

	        if (force) {
	            Send("y")
	            Sleep(50)
	            Send("^c")
	        } else {
	            SendEvent("^c")
	        }

	        copied := ClipWait(1)
	        selectedText := copied ? A_Clipboard : ""
	        A_Clipboard := oldClipboard
	        if !copied
	            return

	        tempFile := A_Temp . "\tts_input.txt"
	        try {
	            if FileExist(tempFile)
	                FileDelete(tempFile)
	            ; UTF-8-RAW writes no byte order mark, so the reader does not
	            ; receive a stray U+FEFF as the first character of the text.
	            FileAppend(selectedText, tempFile, "UTF-8-RAW")
	        } catch Error as e {
	            return
	        }

	        ToolTip("Reading AI voice...")
	        SetTimer () => ToolTip(), -2000

	        uvPath := "C:\Users\xgenx\.local\bin\uv.exe"
	        ; Backslash is not an escape character in AutoHotkey, so the UNC path
	        ; is written with single separators after the leading "\\".
	        scriptPath := "\\wsl$\Ubuntu\home\mstouffer\.config\windows-tts\tts_reader_edge.py"

	        ; Every path is quoted individually so spaces in any of them are safe;
	        ; the outermost pair of quotes is the one cmd /c strips.
	        try {
	            Run(A_ComSpec . ' /c ""' . uvPath . '" run --python 3.12 "' . scriptPath . '" "' . tempFile . '""', , "Hide")
	        }
	    } finally {
	        ; Wait for key release to prevent menu "leakage" - also on the
	        ; early-return paths above.
	        KeyWait("AppsKey")
	    }
	}

	; Stops playback by force-killing every ffplay.exe process, shows a short
	; confirmation tooltip, then waits for AppsKey to be released.
	StopTTS()
	{
	    killCommand := A_ComSpec . " /c taskkill /F /IM ffplay.exe"
	    try {
	        Run(killCommand, , "Hide")
	        ToolTip("Stopped TTS")
	        SetTimer(() => ToolTip(), -1000)    ; clear the tooltip after one second
	    }
	    KeyWait("AppsKey")
	}
	# /// script
	# dependencies = [
	# "edge-tts",
	# ]
	# ///

	import os
	import sys
	import asyncio
	import hashlib
	import subprocess
	import edge_tts
	from datetime import datetime

	# Runtime artefacts (debug log and audio cache) live next to this script.
	BASE_DIR = os.path.dirname(__file__)
	CACHE_DIR, LOG_FILE = (
	    os.path.join(BASE_DIR, leaf) for leaf in ("audio_cache", "debug.log")
	)

	# Create the cache folder up front so later writes can assume it exists.
	os.makedirs(CACHE_DIR, exist_ok=True)

	def log(message):
	    """Append a timestamped *message* to the debug log.

	    No backup copies are kept: once the log has grown past 1 MB it is
	    truncated and restarted with a marker line before the message is written.
	    """
	    size_limit = 1024 * 1024
	    oversized = os.path.exists(LOG_FILE) and os.path.getsize(LOG_FILE) > size_limit

	    # "w" discards the old contents, "a" keeps them.
	    with open(LOG_FILE, "w" if oversized else "a", encoding="utf-8") as handle:
	        if oversized:
	            handle.write(f"--- Log Rotated at {datetime.now()} ---\n")
	        handle.write(f"{datetime.now()}: {message}\n")

	async def amain(text) -> None:
	    """Read *text* aloud through ffplay, caching the synthesized audio.

	    On a cache hit the stored MP3 is played directly. On a miss the audio is
	    streamed from Edge TTS into ffplay's stdin while also being written to a
	    temporary file; that file is promoted into the cache only when the whole
	    stream was received, so an interrupted read never leaves truncated audio
	    behind to be replayed later.

	    Args:
	        text: The text to speak.
	    """
	    VOICE = "en-US-AndrewNeural"

	    # The voice is part of the cache key: otherwise audio cached with one
	    # voice would keep playing after VOICE has been changed.
	    text_hash = hashlib.md5(f"{VOICE}\n{text}".encode("utf-8")).hexdigest()
	    cache_path = os.path.join(CACHE_DIR, f"{text_hash}.mp3")

	    # 1. Check if it's already in the cache
	    if os.path.exists(cache_path):
	        log(f"Cache hit: {text_hash}")
	        # ffplay's console output is never used, so discard it (same as the
	        # streaming branch) instead of buffering it in memory.
	        subprocess.run(
	            ["ffplay", "-nodisp", "-autoexit", cache_path],
	            stdout=subprocess.DEVNULL,
	            stderr=subprocess.DEVNULL,
	        )
	        return

	    log(f"Cache miss. Streaming TTS for hash: {text_hash}")

	    # Per-process temp name so two simultaneous reads of the same text do not
	    # write into each other's file.
	    partial_path = f"{cache_path}.{os.getpid()}.part"
	    player = None
	    try:
	        communicate = edge_tts.Communicate(text, VOICE)

	        # 2. Start ffplay reading from stdin so playback begins while streaming.
	        # stdout/stderr go to DEVNULL to avoid potential deadlocks.
	        player = subprocess.Popen(
	            ["ffplay", "-nodisp", "-autoexit", "-"],
	            stdin=subprocess.PIPE,
	            stderr=subprocess.DEVNULL,
	            stdout=subprocess.DEVNULL,
	        )

	        interrupted = False
	        with open(partial_path, "wb") as cache_file:
	            async for chunk in communicate.stream():
	                if chunk["type"] != "audio":
	                    continue
	                data = chunk["data"]
	                try:
	                    player.stdin.write(data)
	                    # Flush so the audio reaches ffplay right away and a
	                    # closed pipe is noticed here rather than at close().
	                    player.stdin.flush()
	                except OSError:
	                    # BrokenPipeError on POSIX; on Windows a closed pipe
	                    # surfaces as a plain OSError (EINVAL).
	                    log("Broken pipe: ffplay likely closed.")
	                    interrupted = True
	                    break
	                cache_file.write(data)

	        # 3. Only a fully received stream becomes a cache entry.
	        if not interrupted:
	            os.replace(partial_path, cache_path)
	        log("Streaming loop finished.")

	    except Exception as e:
	        log(f"Exception in amain: {str(e)}")
	    finally:
	        if player is not None:
	            try:
	                player.stdin.close()
	            except OSError:
	                # The pipe is already gone (player was stopped); nothing to do.
	                pass
	            player.wait()
	        # Whatever is still at the temp path is incomplete: clean it up.
	        if os.path.exists(partial_path):
	            os.unlink(partial_path)

	def read_input_text(input_arg):
	    """Return the text to speak for the command-line argument *input_arg*.

	    If *input_arg* names an existing file, its contents are returned;
	    otherwise the argument itself is treated as the text. The file is decoded
	    as "utf-8-sig" so that a leading byte order mark (written by AutoHotkey's
	    FileAppend with the "UTF-8" encoding) is dropped instead of becoming part
	    of the spoken and hashed text; files without a BOM decode unchanged.

	    On a read or decode failure the error is logged and the process exits
	    with status 1.
	    """
	    if not os.path.isfile(input_arg):
	        return input_arg

	    try:
	        with open(input_arg, "r", encoding="utf-8-sig") as f:
	            return f.read()
	    except (OSError, UnicodeError) as e:
	        log(f"Error reading file {input_arg}: {e}")
	        sys.exit(1)


	if __name__ == "__main__":
	    # Expect exactly one payload argument: a file path or the literal text.
	    if len(sys.argv) < 2:
	        sys.exit(1)

	    text_to_read = read_input_text(sys.argv[1])

	    # Nothing to say for empty / whitespace-only input.
	    if text_to_read.strip():
	        asyncio.run(amain(text_to_read))