Created
February 24, 2026 05:45
-
-
Save ducnh1022/51848345be0d4defcb65b40ddbb76e1c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import collections | |
| import sys | |
| import wave | |
| import time | |
| import webrtcvad | |
| import pyaudio | |
# --- Capture configuration ---------------------------------------------------
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
FRAME_DURATION = 30  # ms per frame; only 10, 20 or 30 are allowed
FRAME_SIZE = RATE * FRAME_DURATION // 1000  # frame length derived from rate and duration
PADDING_DURATION = 300  # ms of recent frames kept in the sliding window
VAD_MODE = 2  # 0-3, where 3 is the most aggressive

# --- Detector and input stream -----------------------------------------------
vad = webrtcvad.Vad(VAD_MODE)
audio = pyaudio.PyAudio()
stream = audio.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    frames_per_buffer=FRAME_SIZE,
)

# --- Detection state ---------------------------------------------------------
# The window holds (frame, is_speech) pairs covering PADDING_DURATION ms.
ring_buffer = collections.deque(maxlen=PADDING_DURATION // FRAME_DURATION)
voiced_frames = []  # frames collected for the utterance in progress
triggered = False  # True while an utterance is being recorded

print("π Listening...")
def save_wave(frames):
    """Write the captured audio frames to a timestamped WAV file.

    The file is named ``speech_<unix-seconds>.wav`` (a relative path, so it
    lands in the current working directory) and uses the module-level
    CHANNELS, FORMAT and RATE settings for its header.

    Args:
        frames: Iterable of bytes objects holding the recorded audio; they
            are concatenated in order and written as the file's sample data.
    """
    filename = f"speech_{int(time.time())}.wav"
    # Use the context manager so the file is closed even if one of the
    # setters or the write raises, instead of leaking the handle.
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(audio.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
    print(f"πΎ Saved: {filename}")
# Main capture loop.
#
# Each iteration reads one frame from the input stream, asks the VAD whether
# it contains speech, and pushes the (frame, decision) pair into the sliding
# window. While idle, recording starts once more than 80% of the window is
# voiced; while recording, it stops (and the utterance is saved) once more
# than 80% of the window is unvoiced.
try:
    # Loop-invariant: number of frames that must agree before switching state.
    threshold = 0.8 * ring_buffer.maxlen

    while True:
        # exception_on_overflow=False: do not raise on input buffer overflow.
        frame = stream.read(FRAME_SIZE, exception_on_overflow=False)
        is_speech = vad.is_speech(frame, RATE)

        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = sum(1 for _, speech in ring_buffer if speech)
            if num_voiced > threshold:
                triggered = True
                print("π£ Speech started")
                # Keep the buffered lead-in so the start of the utterance
                # is not cut off.
                voiced_frames.extend(f for f, _ in ring_buffer)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = sum(1 for _, speech in ring_buffer if not speech)
            if num_unvoiced > threshold:
                print("π Speech ended")
                save_wave(voiced_frames)
                triggered = False
                ring_buffer.clear()
                voiced_frames = []
except KeyboardInterrupt:
    print("Stopping...")
finally:
    # Always release the audio device, not only on Ctrl-C: any other error
    # raised inside the loop previously skipped this cleanup.
    try:
        stream.stop_stream()
        stream.close()
    finally:
        audio.terminate()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment