@Yugsolanki
Created November 22, 2025 14:17
For running a .nemo model with live transcription.
import gradio as gr
import nemo.collections.asr as nemo_asr
import numpy as np
import soundfile as sf
import tempfile
import os
import librosa  # NeMo installs this by default; we use it for resampling
# --- 1. Load and Configure Your Model ---
print("Loading model...")
asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from("finetuned_model2.nemo")
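# The two calls below are model-specific; the values are assumed to match how the
# checkpoint was finetuned (a cache-aware hybrid RNNT/CTC model):
# - set_default_att_context_size([left, right]) limits the encoder's attention window;
#   a small right context keeps lookahead (and thus latency) low for streaming use.
# - decoder_type='rnnt' selects the RNNT branch of the hybrid model for decoding.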
asr_model.encoder.set_default_att_context_size([70, 13])
asr_model.change_decoding_strategy(decoder_type='rnnt')
print("Model loaded. Ready to transcribe.")
# Constants
TARGET_SR = 16000
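# NeMo ASR checkpoints are typically trained on 16 kHz mono audio; keep this in sync with the model.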
# --- 2. Define the Streaming Function ---
def transcribe_stream(stream, new_chunk):
"""
stream: The accumulated audio data so far (16kHz numpy array)
new_chunk: The new incoming audio data (Variable SR)
"""
sr, y = new_chunk
# --- A. Force Mono ---
if len(y.shape) > 1:
y = np.mean(y, axis=1)
# --- B. Normalize to Float32 ---
y = y.astype(np.float32)
if np.abs(y).max() > 1.0:
y = y / 32768.0
# --- C. Resample to 16000 Hz ---
# We explicitly convert whatever the mic gives us (e.g., 44100, 48000) to 16000
if sr != TARGET_SR:
# librosa.resample expects (channels, samples) or just (samples)
y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
# --- D. Accumulate Stream ---
if stream is not None:
stream = np.concatenate([stream, y])
else:
stream = y
# --- E. Write Temp File ---
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
sf.write(tmp.name, stream, TARGET_SR)
tmp_path = tmp.name
try:
output = asr_model.transcribe([tmp_path], verbose=False)
transcription = output[0].text
except Exception as e:
transcription = f"Error: {str(e)}"
finally:
if os.path.exists(tmp_path):
os.remove(tmp_path)
return stream, transcription
# --- 3. Build the GUI ---
with gr.Blocks(title="NeMo 16k ASR") as demo:
    gr.Markdown("## 🎙️ NeMo Real-Time Transcription")

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone"],
            type="numpy",
            streaming=True,
            label="Speak Here"
            # Removed 'sample_rate' to fix the TypeError
        )
        text_output = gr.Textbox(label="Transcription", lines=4)

    state = gr.State()

    audio_input.stream(
        fn=transcribe_stream,
        inputs=[state, audio_input],
        outputs=[state, text_output]
    )
if __name__ == "__main__":
    demo.launch()
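
If you want to sanity-check the checkpoint without the microphone UI, a minimal offline sketch (the path sample.wav is a placeholder; it assumes a 16 kHz mono WAV, matching TARGET_SR above):

import nemo.collections.asr as nemo_asr

# Load the same finetuned checkpoint and pick the RNNT decoder, as in the app above
asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from("finetuned_model2.nemo")
asr_model.change_decoding_strategy(decoder_type='rnnt')

# Transcribe a prerecorded file and print the text of the first (and only) result
output = asr_model.transcribe(["sample.wav"], verbose=False)
print(output[0].text)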