Created
February 24, 2026 06:44
-
-
Save ducnh1022/06355de13df1975f8decbc57951f1b9e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import webrtcvad | |
| import collections | |
| import sys | |
| import wave | |
| import contextlib | |
| from pydub import AudioSegment | |
def read_wave(path):
    """Load a mono, 16-bit PCM WAV file.

    Args:
        path: Location of the WAV file to read.

    Returns:
        A ``(pcm_bytes, sample_rate)`` tuple holding every frame of the
        file as raw bytes and the file's frame rate in Hz.

    Raises:
        AssertionError: If the file is not mono, is not 16-bit, or uses a
            sample rate other than 8000, 16000, 32000 or 48000 Hz.
    """
    with wave.open(path, 'rb') as reader:
        # The checks run in a fixed order: channels, sample width, rate.
        assert reader.getnchannels() == 1, "Audio must be mono"
        assert reader.getsampwidth() == 2, "Audio must be 16-bit PCM"
        rate = reader.getframerate()
        assert rate in (8000, 16000, 32000, 48000)
        frames = reader.readframes(reader.getnframes())
    return frames, rate
def write_wave(path, audio, sample_rate):
    """Write raw PCM bytes to ``path`` as a mono, 16-bit WAV file.

    Args:
        path: Destination file; opened in ``'wb'`` mode.
        audio: Raw PCM bytes to store as the file's frames.
        sample_rate: Frame rate recorded in the WAV header, in Hz.
    """
    writer = wave.open(path, 'wb')
    try:
        # Header fields are independent of each other; all must be set
        # before the first frame is written.
        writer.setframerate(sample_rate)
        writer.setsampwidth(2)
        writer.setnchannels(1)
        writer.writeframes(audio)
    finally:
        writer.close()
def frame_generator(frame_duration_ms, audio, sample_rate):
    """Yield consecutive fixed-duration frames from 16-bit PCM audio.

    Args:
        frame_duration_ms: Length of each frame in milliseconds.
        audio: Raw PCM bytes, two bytes per sample.
        sample_rate: Sample rate of ``audio`` in Hz.

    Yields:
        Byte slices of ``audio``, each exactly one frame long. A trailing
        partial frame (fewer bytes than a full frame) is discarded.

    Raises:
        ValueError: If the requested duration yields a frame of zero (or
            negative) samples, which would otherwise loop forever.
    """
    # Compute whole samples first so the byte count is always a multiple
    # of the 2-byte sample width.
    samples_per_frame = int(sample_rate * frame_duration_ms / 1000.0)
    if samples_per_frame <= 0:
        raise ValueError("frame_duration_ms and sample_rate must give a "
                         "frame of at least one sample")
    n = samples_per_frame * 2
    offset = 0
    # ``<=`` so the final frame is kept when the audio length is an exact
    # multiple of the frame size.
    while offset + n <= len(audio):
        yield audio[offset:offset + n]
        offset += n
def remove_long_silences(input_path, output_path, silence_threshold_sec=3,
                         vad_aggressiveness=2, frame_duration_ms=30):
    """Copy an audio file, dropping every silence that reaches a threshold.

    The input is converted to 16 kHz mono 16-bit PCM, split into fixed
    frames, and each frame is classified by WebRTC VAD. A run of non-speech
    frames shorter than ``silence_threshold_sec`` that is followed by speech
    is kept; a run that reaches the threshold is removed entirely. Non-speech
    frames after the last speech frame are never written, whatever their
    length.

    Args:
        input_path: Audio file to read (any format ``AudioSegment.from_file``
            accepts).
        output_path: Destination WAV file.
        silence_threshold_sec: Silence length, in seconds, at or above which
            a silent run is removed.
        vad_aggressiveness: Mode passed to ``webrtcvad.Vad``.
        frame_duration_ms: Frame length handed to the VAD, in milliseconds.
    """
    import os
    import tempfile

    # Normalise to the format the VAD and read_wave expect.
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_channels(1).set_frame_rate(16000).set_sample_width(2)

    # Use a private temporary directory rather than a fixed "temp.wav" in
    # the working directory, so nothing is clobbered or left behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_wav = os.path.join(tmp_dir, "converted.wav")
        exported = audio.export(tmp_wav, format="wav")
        # export hands back the output file handle; close it so the file
        # can be reopened and the directory removed on every platform.
        if exported is not None:
            exported.close()
        pcm_data, sample_rate = read_wave(tmp_wav)

    vad = webrtcvad.Vad(vad_aggressiveness)
    threshold_ms = silence_threshold_sec * 1000

    kept_frames = []
    pending_silence = []
    # Count silent frames as an integer; summing 0.03 repeatedly drifts and
    # can shift the threshold crossing by a frame.
    silent_frames = 0

    for frame in frame_generator(frame_duration_ms, pcm_data, sample_rate):
        if vad.is_speech(frame, sample_rate):
            # A preceding silence below the threshold is kept as-is.
            if silent_frames * frame_duration_ms < threshold_ms:
                kept_frames.extend(pending_silence)
            pending_silence = []
            silent_frames = 0
            kept_frames.append(frame)
        else:
            silent_frames += 1
            if silent_frames * frame_duration_ms >= threshold_ms:
                # Silence reached the threshold: drop the whole run.
                pending_silence = []
            else:
                pending_silence.append(frame)

    # Join the surviving frames and write the result.
    write_wave(output_path, b''.join(kept_frames), sample_rate)
    print("Done. Saved to:", output_path)
if __name__ == "__main__":
    # Script entry point: trim silences of 3 seconds or longer from
    # input.wav and save the result as output_trimmed.wav.
    remove_long_silences("input.wav", "output_trimmed.wav", 3)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment