# 1. https://docs.beam.cloud/v2/getting-started/quickstart#installation
# 2. beam deploy kokoro_beam.py:generate_speech
from beam import endpoint, env, Image, Output
if env.is_remote():
from kokoro import KPipeline
import subprocess
import uuid
def load_model():
    """Build the Kokoro TTS pipeline on the first CUDA device and preload
    every voice this endpoint serves, so requests never pay load latency."""
    # "a" selects the American-English model variant.
    tts = KPipeline("a", device="cuda:0")
    for voice_id in ("af_alloy", "am_onyx", "af_heart"):
        tts.load_single_voice(voice_id)
    return tts
# Container image for the endpoint: Python 3.11 with the kokoro package,
# plus espeak-ng (phonemizer backend) and ffmpeg (MP3 encoding) installed
# via apt.
kokoro_image = (
    Image(python_version="python3.11")
    .add_python_packages(["kokoro"])
    .add_commands(["apt update && apt install espeak-ng ffmpeg -y"])
)
@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    #####################################################################
    # 1 CPU, 1 worker, 4GB RAM for minimum resource required for kokoro
    #####################################################################
    # cpu=1,
    # workers=1,
    # memory="4Gi",
    #####################################################################
    cpu=10,
    workers=10,
    memory="24Gi",
    #####################################################################
    gpu=["RTX4090", "A10G", "A100-40"],
    gpu_count=1,
    image=kokoro_image,
)
def generate_speech(context, **inputs):
    """Synthesize speech for the given text and return a public MP3 URL.

    Args (via **inputs):
        text (str): Required. Text to convert to speech.
        voice (str): Optional Kokoro voice id; defaults to "af_alloy".

    Returns:
        dict: {"output_url": <signed URL, valid 1 hour>} on success,
              {"error": <message>} when text is missing or encoding fails.
    """
    pipeline = context.on_start_value  # KPipeline created once in load_model()
    text = inputs.pop("text", None)
    voice = inputs.pop("voice", "af_alloy")
    if not text:
        return {"error": "Please provide text to generate speech"}

    generator = pipeline(text, voice=voice)
    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"

    # Stream raw PCM straight into ffmpeg over a pipe (no intermediate WAV files).
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite if exists
        "-f",
        "s16le",  # Raw PCM 16-bit little-endian
        "-ar",
        "24000",  # Sample rate
        "-ac",
        "1",  # Mono audio
        "-i",
        "pipe:0",  # Read from stdin (raw audio)
        "-codec:a",
        "libmp3lame",  # MP3 codec
        "-b:a",
        "48k",  # Bitrate
        "-write_xing",  # Xing header carries duration info...
        "0",  # ...disable it (stream length is unknown while piping)
        "-write_id3v2",
        "1",  # Add ID3v2 header for file recognition
        mp3_file,
    ]
    try:
        with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
            for result in generator:
                # Scale float audio to 16-bit PCM and stream it to ffmpeg.
                audio_bytes = (
                    (result.audio.cpu().numpy() * 32767)
                    .clip(-32768, 32767)
                    .astype("int16")
                    .tobytes()
                )
                ffmpeg_proc.stdin.write(audio_bytes)
            ffmpeg_proc.stdin.close()
            ffmpeg_proc.wait()  # Ensure ffmpeg finishes encoding
    except OSError:
        # ffmpeg binary missing, or its stdin pipe broke mid-stream.
        return {"error": "Failed to convert audio to MP3"}
    # BUG FIX: subprocess.Popen never raises CalledProcessError, so the
    # original `except subprocess.CalledProcessError` was dead code and
    # ffmpeg failures were silently ignored. Check the exit status instead.
    if ffmpeg_proc.returncode != 0:
        return {"error": "Failed to convert audio to MP3"}

    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)
    return {"output_url": public_url}
Last active
March 20, 2025 15:22
-
-
Save efemaer/cedbc629d2cf5bfe1eb35276e5531c4b to your computer and use it in GitHub Desktop.
Author
My only question is: is it working as intended? Thanks to the previous guy for fixing it, of course :)
Author
Yes, tested just now works as expected
Fixed a bug with MP3 metadata encoding, and set the Beam config for the maximum concurrency possible on a 24GB GPU. This config costs $3/hr. Reduce to 1 CPU, 1 worker, and 4GB RAM for the minimum resources required by kokoro; this minimum config costs $0.8/hr.
from beam import endpoint, env, Image, Output
if env.is_remote():
from kokoro import KPipeline
import subprocess
import uuid
import os
def load_model():
    """Create the Kokoro pipeline on GPU and warm up the supported voices.

    Returned value becomes `context.on_start_value` for each request.
    """
    # "a" = American-English model; pin to the first CUDA device.
    tts_pipeline = KPipeline("a", device="cuda:0")
    voices = ["af_alloy", "am_onyx", "af_heart"]
    for name in voices:
        tts_pipeline.load_single_voice(name)
    return tts_pipeline
# Container image: Python 3.11 with the kokoro package, plus espeak-ng
# (phonemizer backend) and ffmpeg (MP3 encoding) installed via apt.
# NOTE(review): the name is misspelled ("kokor" vs "kokoro") but is
# referenced by the @endpoint decorator below, so it is left unchanged here.
kokor_image = (
    Image(python_version="python3.11")
    .add_python_packages(["kokoro"])
    .add_commands(["apt update && apt install espeak-ng ffmpeg -y"])
)
@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    cpu=10,
    workers=10,
    memory="24Gi",
    gpu=["RTX4090", "A10G", "A100-40"],
    gpu_count=1,
    image=kokor_image,
)
def generate_speech(context, **inputs):
    """Synthesize speech for the given text and return a public MP3 URL.

    Args (via **inputs):
        text (str): Required. Text to convert to speech.
        voice (str): Optional Kokoro voice id; defaults to "af_alloy".

    Returns:
        dict: {"output_url": <signed URL, valid 1 hour>} on success,
              {"error": <message>} when text is missing or encoding fails.
    """
    pipeline = context.on_start_value  # KPipeline created once in load_model()
    text = inputs.pop("text", None)
    voice = inputs.pop("voice", "af_alloy")
    if not text:
        return {"error": "Please provide text to generate speech"}

    generator = pipeline(text, voice=voice)
    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"

    # Stream raw PCM straight into ffmpeg over a pipe (no intermediate WAV files).
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite if exists
        "-f",
        "s16le",  # Raw PCM 16-bit little-endian
        "-ar",
        "24000",  # Sample rate
        "-ac",
        "1",  # Mono audio
        "-i",
        "pipe:0",  # Read from stdin (raw audio)
        "-codec:a",
        "libmp3lame",  # MP3 codec
        "-b:a",
        "48k",  # Bitrate
        "-write_xing",  # Xing header carries duration info...
        "0",  # ...disable it (stream length is unknown while piping)
        "-write_id3v2",
        "1",  # Add ID3v2 header for file recognition
        mp3_file,
    ]
    try:
        with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
            for result in generator:
                # Scale float audio to 16-bit PCM and stream it to ffmpeg.
                audio_bytes = (
                    (result.audio.cpu().numpy() * 32767)
                    .clip(-32768, 32767)
                    .astype("int16")
                    .tobytes()
                )
                ffmpeg_proc.stdin.write(audio_bytes)
            ffmpeg_proc.stdin.close()
            ffmpeg_proc.wait()  # Ensure ffmpeg finishes encoding
    except OSError:
        # ffmpeg binary missing, or its stdin pipe broke mid-stream.
        return {"error": "Failed to convert audio to MP3"}
    # BUG FIX: subprocess.Popen never raises CalledProcessError, so the
    # original `except subprocess.CalledProcessError` was dead code and
    # ffmpeg failures were silently ignored. Check the exit status instead.
    if ffmpeg_proc.returncode != 0:
        return {"error": "Failed to convert audio to MP3"}

    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)
    return {"output_url": public_url}
Author
Thanks for the input @chandradeepc ! Updated the gist to reflect that as an option
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you @chandradeepc @AcTePuKc @etrotta for the input! Updated the code.