Created
November 22, 2025 14:17
-
-
Save Yugsolanki/fb3e46b5b770a5d637806bff0f58e844 to your computer and use it in GitHub Desktop.
Script for running a fine-tuned .nemo ASR model with live microphone transcription via a Gradio web UI.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gradio as gr
import nemo.collections.asr as nemo_asr
import numpy as np
import soundfile as sf
import tempfile
import os
import librosa  # NeMo installs this by default, we use it for resampling

# --- 1. Load and Configure Your Model ---
# Restore a fine-tuned hybrid RNNT/CTC BPE model from a local .nemo checkpoint.
print("Loading model...")
asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from("finetuned_model2.nemo")
# Limit the encoder's attention context; NOTE(review): [70, 13] is presumably
# the left/right context tuned for chunked/streaming inference — confirm.
asr_model.encoder.set_default_att_context_size([70, 13])
# Decode with the RNNT head of the hybrid model (rather than the CTC head).
asr_model.change_decoding_strategy(decoder_type='rnnt')
print("Model loaded. Ready to transcribe.")

# Constants
TARGET_SR = 16000  # sample rate (Hz) fed to the model; all mic audio is resampled to this
| # --- 2. Define the Streaming Function --- | |
def transcribe_stream(stream, new_chunk):
    """Gradio streaming callback: accumulate mic audio and re-transcribe all of it.

    Args:
        stream: Previously accumulated audio as a float32 numpy array at
            TARGET_SR, or None on the first chunk.
        new_chunk: Tuple ``(sample_rate, samples)`` from ``gr.Audio``; samples
            may be integer PCM or float, mono or multi-channel.

    Returns:
        Tuple of (updated_stream, transcription): the grown audio buffer and
        the transcript of the *entire* buffer so far (or an error string).
    """
    sr, y = new_chunk

    # --- A. Force Mono ---
    if len(y.shape) > 1:
        y = np.mean(y, axis=1)

    # --- B. Normalize to Float32 ---
    # Scale integer PCM by its own dtype's full-scale value instead of the old
    # "peak > 1.0 => divide by 32768" heuristic, which left quiet int16 chunks
    # unscaled and mis-scaled int32 input (and crashed on an empty chunk).
    if np.issubdtype(y.dtype, np.integer):
        y = y.astype(np.float32) / float(np.iinfo(y.dtype).max + 1)
    else:
        y = y.astype(np.float32)

    # --- C. Resample to 16000 Hz ---
    # Convert whatever the mic delivers (e.g., 44100, 48000) to TARGET_SR.
    # Skip resampling for an empty chunk to avoid needless work.
    if y.size and sr != TARGET_SR:
        y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)

    # --- D. Accumulate Stream ---
    stream = y if stream is None else np.concatenate([stream, y])

    # --- E. Write Temp File and Transcribe ---
    # NOTE: re-transcribing the whole buffer on every chunk is quadratic over
    # a session; acceptable for short recordings.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        sf.write(tmp.name, stream, TARGET_SR)
        tmp_path = tmp.name
    try:
        output = asr_model.transcribe([tmp_path], verbose=False)
        transcription = output[0].text
    except Exception as e:
        # Surface the failure in the UI instead of crashing the stream loop.
        transcription = f"Error: {str(e)}"
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return stream, transcription
| # --- 3. Build the GUI --- | |
# --- 3. Build the GUI ---
with gr.Blocks(title="NeMo 16k ASR") as demo:
    gr.Markdown(f"## 🎙️ NeMo Real-Time Transcription")
    with gr.Row():
        # Streaming mic input: Gradio delivers (sample_rate, numpy array) chunks.
        audio_input = gr.Audio(
            sources=["microphone"],
            type="numpy",
            streaming=True,
            label="Speak Here"
            # Removed 'sample_rate' to fix the TypeError
        )
        text_output = gr.Textbox(label="Transcription", lines=4)
    # Persists the accumulated 16 kHz audio buffer across stream callbacks.
    state = gr.State()
    # Each incoming mic chunk calls transcribe_stream(state, chunk) and
    # updates both the buffer state and the transcript textbox.
    audio_input.stream(
        fn=transcribe_stream,
        inputs=[state, audio_input],
        outputs=[state, text_output]
    )

if __name__ == "__main__":
    demo.launch()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment