Created
January 23, 2026 16:12
-
-
Save mseri/38d1c0a4e90bbb5bcdbf2eaa294ec2d4 to your computer and use it in GitHub Desktop.
qwen-3 tts player
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # /// script | |
| # requires-python = ">=3.12" | |
| # dependencies = [ | |
| # "transformers>=5.0.0rc1", | |
| # "mlx-audio==0.3.0rc1", | |
| # "click", | |
| # "numpy", | |
| # "soundfile", | |
| # "sounddevice", | |
| # ] | |
| # /// | |
| import sys | |
| from pathlib import Path | |
| import click | |
| import numpy as np | |
| import soundfile as sf | |
| import sounddevice as sd | |
| from mlx_audio.tts.utils import load_model | |
def get_unique_filename(base_path: Path) -> Path:
    """Return ``base_path`` if it is free, otherwise the first free numbered variant.

    Variants are named ``<stem>-2<suffix>``, ``<stem>-3<suffix>``, ... in the
    same directory as ``base_path``; the lowest number whose path does not
    exist yet is returned. Nothing is created on disk.
    """
    candidate = base_path
    index = 1
    # Walk output.wav -> output-2.wav -> output-3.wav ... until a path is free.
    while candidate.exists():
        index += 1
        candidate = base_path.parent / f"{base_path.stem}-{index}{base_path.suffix}"
    return candidate
class CustomHelpCommand(click.Command):
    """Click command whose help output ends with an "Examples" section."""

    def format_help(self, ctx, formatter):
        """Write the standard help via the parent class, then append usage examples.

        The program name shown in each example is taken from ``ctx.info_name``.
        """
        super().format_help(ctx, formatter)
        name = ctx.info_name
        examples = (
            f'{name} "say this text out loud"',
            f'{name} -o saved.wav "hello world"',
            f'{name} -p "hello world" # play without saving',
            f'{name} -p -o saved.wav "hello" # play and save',
            f'{name} -l Chinese "你好世界"',
            f'{name} -i "deep low voice" "hello"',
            f'echo "piped text" | {name}',
        )
        with formatter.section("Examples"):
            formatter.write_paragraph()
            for example in examples:
                formatter.write_text(example)
# CLI entry point: turn text into speech with the Qwen3-TTS VoiceDesign model.
#
# Parameters (filled in by click from the command line):
#   text     - text to speak; when omitted it is read from piped stdin.
#   output   - file to write; defaults to "output.wav" only when not playing.
#   play     - play the audio through the default sound device.
#   language - language name forwarded to the model.
#   instruct - free-form voice description forwarded to the model ("" if unset).
#   verbose  - print progress messages and forward the flag to the model.
#
# Raises click.UsageError when no (or only blank) text is supplied, and
# click.ClickException when the model yields no audio at all.
#
# NOTE: the function docstring is shown by click as the command's help text,
# so it is intentionally kept short; the details live in these comments.
@click.command(cls=CustomHelpCommand)
@click.argument("text", required=False)
@click.option("-o", "--output", default=None, help="Output filename (default: output.wav if not playing)")
@click.option("-p", "--play", is_flag=True, help="Play audio instead of (or in addition to) saving")
@click.option("-l", "--language", default="English", help="Language for TTS (default: English)")
@click.option("-i", "--instruct", default=None, help="Voice instruction (e.g., 'deep low voice')")
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose output")
def main(text: str | None, output: str | None, play: bool, language: str, instruct: str | None, verbose: bool):
    """Generate audio using Qwen3-TTS and MLX Audio."""
    # Handle piped input
    if text is None:
        if sys.stdin.isatty():
            raise click.UsageError("No text provided. Pass text as an argument or pipe it via stdin.")
        text = sys.stdin.read().strip()

    # Reject blank input before paying for the model load; this also catches
    # whitespace-only text passed as an argument.
    if not text.strip():
        raise click.UsageError("Text cannot be empty.")

    # Determine behavior: play, save, or both
    if not play and output is None:
        output = "output.wav"

    # Determine output path if saving
    output_path = None
    if output:
        output_path = Path(output)
        # Only the default name is de-duplicated (output-2.wav, output-3.wav, ...).
        # NOTE: an explicit `-o output.wav` is indistinguishable from the default
        # here and is de-duplicated as well; other explicit names are overwritten.
        if output == "output.wav":
            output_path = get_unique_filename(output_path)

    if verbose:
        click.echo("Loading model...")
    model = load_model("Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign")

    if verbose:
        click.echo(f"Generating audio for: {text[:50]}{'...' if len(text) > 50 else ''}")

    # Build generation kwargs
    gen_kwargs = {
        "text": text,
        "language": language,
        "verbose": verbose,
        "instruct": instruct or "",
    }

    # Generate with voice description. The generator may yield more than one
    # result, so collect every segment instead of keeping only the first one.
    segments = [np.array(result.audio) for result in model.generate_voice_design(**gen_kwargs)]
    if not segments:
        raise click.ClickException("The model did not produce any audio.")
    # assumes each segment's first axis is time (samples) — TODO confirm for
    # multi-channel output. A single segment is used unchanged.
    audio = segments[0] if len(segments) == 1 else np.concatenate(segments, axis=0)

    # Play audio if requested
    if play:
        if verbose:
            click.echo("Playing audio...")
        sd.play(audio, model.sample_rate)
        sd.wait()  # block until playback has finished

    # Save to file if requested
    if output_path:
        sf.write(str(output_path), audio, model.sample_rate)
        if verbose:
            click.echo(f"Audio saved to: {output_path}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment