- Clone repo (https://github.com/hexgrad/kokoro)
- Install brew dependencies
brew install python@3.9 espeak libsndfile portaudio
- Create a Python 3.9 virtual environment + install Python dependencies
/opt/homebrew/opt/python@3.9/bin/python3.9 -m venv fresh_venv && \
source fresh_venv/bin/activate && \
pip install "kokoro>=0.9.4" soundfile "misaki[en]"
- Add the test script:
demo/test_m4.py
- Activate venv, set the environment variable to enable MPS fallback + run the script
source fresh_venv/bin/activate && PYTORCH_ENABLE_MPS_FALLBACK=1 python demo/english.py

/demo/portuguese.py
import soundfile as sf
from kokoro import KPipeline
# Demo: synthesize one Brazilian Portuguese sentence with every listed voice
# and write each generated segment to its own WAV file in the current
# working directory.

# Initialize the pipeline for Brazilian Portuguese
pipeline = KPipeline(lang_code='p')

# Test text in Brazilian Portuguese
text = 'A doença de Parkinson é um distúrbio neurodegenerativo crônico e progressivo que afeta principalmente o sistema nervoso central, causado pela perda de neurônios produtores de dopamina na substância negra do cérebro.'

# Available Brazilian Portuguese voices:
# - pf_dora (Female)
# - pm_alex (Male)
# - pm_santa (Male)
voices = ['pf_dora', 'pm_alex', 'pm_santa']

for voice in voices:
    # Banner so each voice's log output is easy to tell apart.
    print(f"\n{'='*60}")
    print(f"Testing voice: {voice}")
    print(f"{'='*60}\n")

    # Generate audio; the pipeline call returns an iterable of segments.
    generator = pipeline(text, voice=voice)

    # Process and save the audio, one file per yielded segment.
    for i, (gs, ps, audio) in enumerate(generator):
        print(f"Generated segment {i}:")
        print(f"Text: {gs}")
        print(f"Phonemes: {ps}")

        # Voice name and segment index both go into the filename so that
        # files from different voices do not overwrite each other.
        output_filename = f'test_output_{voice}_{i}.wav'
        # 24000 is passed as the sample rate (Hz) — presumably Kokoro's
        # output rate; confirm against the library docs.
        sf.write(output_filename, audio, 24000)
        print(f"Saved audio file: {output_filename}\n")
print("\nAll Brazilian Portuguese voices tested successfully!")

/demo/english.py
import soundfile as sf
from kokoro import KPipeline
# Demo: synthesize one American English paragraph with the 'af_heart' voice
# and write each generated segment to its own WAV file in the current
# working directory.

# Initialize the pipeline for American English
pipeline = KPipeline(lang_code='a')

# Test text
text = "Though these conversations have invigorated the podcast business — an exciting new buyer! — people familiar with the conversations also tell me that Netflix’s budget is low, particularly for the medium’s biggest shows. The streamer is offering up to high seven-figure terms, but the shows being offered such flashy numbers tend to be already making more than that on an annual basis. "

# Generate audio; the pipeline call returns an iterable of segments.
generator = pipeline(text, voice='af_heart')

# Process and save the audio, one file per yielded segment.
for i, (gs, ps, audio) in enumerate(generator):
    print(f"Generated segment {i}:")
    print(f"Text: {gs}")
    print(f"Phonemes: {ps}")

    # Build the filename once so the file written and the path logged
    # cannot drift apart.
    output_filename = f'test_output_{i}.wav'
    # 24000 is passed as the sample rate (Hz) — presumably Kokoro's
    # output rate; confirm against the library docs.
    sf.write(output_filename, audio, 24000)
    print(f"Saved audio file: {output_filename}\n")