Created
November 22, 2025 14:17
-
-
Save Yugsolanki/fb3e46b5b770a5d637806bff0f58e844 to your computer and use it in GitHub Desktop.
Script for running a fine-tuned .nemo ASR model with live microphone transcription via a Gradio web UI.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gradio as gr
import nemo.collections.asr as nemo_asr
import numpy as np
import soundfile as sf
import tempfile
import os
import librosa  # NeMo installs this by default, we use it for resampling

# --- 1. Load and Configure Your Model ---
# Restore a fine-tuned hybrid RNNT/CTC BPE model from a local .nemo checkpoint.
print("Loading model...")
asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from("finetuned_model2.nemo")
# Limit the encoder's attention context; NOTE(review): [70, 13] is presumably
# the left/right context tuned for chunked/streaming inference — confirm.
asr_model.encoder.set_default_att_context_size([70, 13])
# Decode with the RNNT head of the hybrid model (rather than the CTC head).
asr_model.change_decoding_strategy(decoder_type='rnnt')
print("Model loaded. Ready to transcribe.")

# Constants
TARGET_SR = 16000  # sample rate (Hz) fed to the model; all mic audio is resampled to this
| # --- 2. Define the Streaming Function --- | |
def transcribe_stream(stream, new_chunk):
    """Gradio streaming callback: accumulate mic audio and re-transcribe all of it.

    Args:
        stream: Previously accumulated audio as a float32 numpy array at
            TARGET_SR, or None on the first chunk.
        new_chunk: Tuple ``(sample_rate, samples)`` from ``gr.Audio``; samples
            may be integer PCM or float, mono or multi-channel.

    Returns:
        Tuple of (updated_stream, transcription): the grown audio buffer and
        the transcript of the *entire* buffer so far (or an error string).
    """
    sr, y = new_chunk

    # --- A. Force Mono ---
    if len(y.shape) > 1:
        y = np.mean(y, axis=1)

    # --- B. Normalize to Float32 ---
    # Scale integer PCM by its own dtype's full-scale value instead of the old
    # "peak > 1.0 => divide by 32768" heuristic, which left quiet int16 chunks
    # unscaled and mis-scaled int32 input (and crashed on an empty chunk).
    if np.issubdtype(y.dtype, np.integer):
        y = y.astype(np.float32) / float(np.iinfo(y.dtype).max + 1)
    else:
        y = y.astype(np.float32)

    # --- C. Resample to 16000 Hz ---
    # Convert whatever the mic delivers (e.g., 44100, 48000) to TARGET_SR.
    # Skip resampling for an empty chunk to avoid needless work.
    if y.size and sr != TARGET_SR:
        y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)

    # --- D. Accumulate Stream ---
    stream = y if stream is None else np.concatenate([stream, y])

    # --- E. Write Temp File and Transcribe ---
    # NOTE: re-transcribing the whole buffer on every chunk is quadratic over
    # a session; acceptable for short recordings.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        sf.write(tmp.name, stream, TARGET_SR)
        tmp_path = tmp.name
    try:
        output = asr_model.transcribe([tmp_path], verbose=False)
        transcription = output[0].text
    except Exception as e:
        # Surface the failure in the UI instead of crashing the stream loop.
        transcription = f"Error: {str(e)}"
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return stream, transcription
| # --- 3. Build the GUI --- | |
# --- 3. Build the GUI ---
with gr.Blocks(title="NeMo 16k ASR") as demo:
    gr.Markdown(f"## 🎙️ NeMo Real-Time Transcription")
    with gr.Row():
        # Streaming mic input: Gradio delivers (sample_rate, numpy array) chunks.
        audio_input = gr.Audio(
            sources=["microphone"],
            type="numpy",
            streaming=True,
            label="Speak Here"
            # Removed 'sample_rate' to fix the TypeError
        )
        text_output = gr.Textbox(label="Transcription", lines=4)
    # Persists the accumulated 16 kHz audio buffer across stream callbacks.
    state = gr.State()
    # Each incoming mic chunk calls transcribe_stream(state, chunk) and
    # updates both the buffer state and the transcript textbox.
    audio_input.stream(
        fn=transcribe_stream,
        inputs=[state, audio_input],
        outputs=[state, text_output]
    )

if __name__ == "__main__":
    demo.launch()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment