Skip to content

Instantly share code, notes, and snippets.

@mseri
Created January 23, 2026 16:12
Show Gist options
  • Select an option

  • Save mseri/38d1c0a4e90bbb5bcdbf2eaa294ec2d4 to your computer and use it in GitHub Desktop.

Select an option

Save mseri/38d1c0a4e90bbb5bcdbf2eaa294ec2d4 to your computer and use it in GitHub Desktop.
qwen-3 tts player
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "transformers>=5.0.0rc1",
# "mlx-audio==0.3.0rc1",
# "click",
# "numpy",
# "soundfile",
# "sounddevice",
# ]
# ///
import sys
from pathlib import Path
import click
import numpy as np
import soundfile as sf
import sounddevice as sd
from mlx_audio.tts.utils import load_model
def get_unique_filename(base_path: Path) -> Path:
    """Return a path that does not exist yet, derived from *base_path*.

    If *base_path* itself is free it is returned unchanged. Otherwise a
    numeric suffix is inserted before the extension (``name-2.ext``,
    ``name-3.ext``, ...) and the first candidate that does not exist is
    returned.

    The existence check and the later file creation are separate steps, so
    another process could still claim the returned name in between.
    """
    if not base_path.exists():
        return base_path

    stem, suffix, parent = base_path.stem, base_path.suffix, base_path.parent

    # Numbering starts at 2: the un-suffixed file is implicitly "number 1".
    counter = 2
    while (candidate := parent / f"{stem}-{counter}{suffix}").exists():
        counter += 1
    return candidate
class CustomHelpCommand(click.Command):
    """Click command whose ``--help`` output ends with an "Examples" section."""

    def format_help(self, ctx, formatter):
        """Write the standard help text, then a list of example invocations.

        Args:
            ctx: The active click context; ``ctx.info_name`` supplies the
                program name shown in each example.
            formatter: The click help formatter to write into.
        """
        super().format_help(ctx, formatter)

        prog = ctx.info_name
        examples = (
            f'{prog} "say this text out loud"',
            f'{prog} -o saved.wav "hello world"',
            f'{prog} -p "hello world" # play without saving',
            f'{prog} -p -o saved.wav "hello" # play and save',
            f'{prog} -l Chinese "你好世界"',
            f'{prog} -i "deep low voice" "hello"',
            f'echo "piped text" | {prog}',
        )

        with formatter.section("Examples"):
            formatter.write_paragraph()
            # One write_text call per example keeps each on its own line.
            for example in examples:
                formatter.write_text(example)
# CLI entry point. Flow: resolve the input text (argument or stdin), decide
# where (and whether) to save, load the model, synthesize, save, then play.
#
# NOTE: the function docstring below is what click prints as the command's
# help text, so it is kept short; maintainer notes live in comments instead.
@click.command(cls=CustomHelpCommand)
@click.argument("text", required=False)
@click.option("-o", "--output", default=None, help="Output filename (default: output.wav if not playing)")
@click.option("-p", "--play", is_flag=True, help="Play audio instead of (or in addition to) saving")
@click.option("-l", "--language", default="English", help="Language for TTS (default: English)")
@click.option("-i", "--instruct", default=None, help="Voice instruction (e.g., 'deep low voice')")
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose output")
def main(text: str | None, output: str | None, play: bool, language: str, instruct: str | None, verbose: bool):
    """Generate audio using Qwen3-TTS and MLX Audio."""
    # Handle piped input: stdin is consulted only when no TEXT argument is given.
    if text is None:
        if sys.stdin.isatty():
            raise click.UsageError("No text provided. Pass text as an argument or pipe it via stdin.")
        text = sys.stdin.read().strip()
    if not text:
        raise click.UsageError("Text cannot be empty.")

    # Decide where to save.
    #   * explicit -o NAME  -> write exactly there (overwriting, as requested)
    #   * no -o and no -p   -> default "output.wav", made unique so repeated
    #                          runs never clobber an earlier result
    #   * -p without -o     -> play only, nothing is written
    # The default case is detected by the option being unset rather than by
    # comparing against the string "output.wav", so an explicit
    # `-o output.wav` is honoured like any other explicit name.
    output_path: Path | None = None
    if output:
        output_path = Path(output)
    elif not play:
        output_path = get_unique_filename(Path("output.wav"))

    if verbose:
        click.echo("Loading model...")
    model = load_model("Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign")

    if verbose:
        click.echo(f"Generating audio for: {text[:50]}{'...' if len(text) > 50 else ''}")

    # Build generation kwargs; the model call always receives a string for
    # "instruct", so a missing instruction becomes "".
    gen_kwargs = {
        "text": text,
        "language": language,
        "verbose": verbose,
        "instruct": instruct or "",
    }

    # Generate with voice description. The generator may yield more than one
    # result (NOTE(review): presumably one per segment/chunk - confirm against
    # mlx-audio), so keep all of them instead of only the first.
    results = list(model.generate_voice_design(**gen_kwargs))
    if not results:
        raise click.ClickException("The model did not produce any audio.")
    segments = [np.array(result.audio) for result in results]
    audio = segments[0] if len(segments) == 1 else np.concatenate(segments, axis=0)

    # Save before playing so that a playback failure or Ctrl-C during a long
    # clip cannot discard audio that was already generated.
    if output_path:
        sf.write(str(output_path), audio, model.sample_rate)
        if verbose:
            click.echo(f"Audio saved to: {output_path}")

    # Play audio if requested; sd.wait() blocks until playback has finished.
    if play:
        if verbose:
            click.echo("Playing audio...")
        sd.play(audio, model.sample_rate)
        sd.wait()
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment