Skip to content

Instantly share code, notes, and snippets.

@ducnh1022
Created February 24, 2026 06:44
Show Gist options
  • Select an option

  • Save ducnh1022/06355de13df1975f8decbc57951f1b9e to your computer and use it in GitHub Desktop.

Select an option

Save ducnh1022/06355de13df1975f8decbc57951f1b9e to your computer and use it in GitHub Desktop.
import webrtcvad
import collections
import sys
import wave
import contextlib
from pydub import AudioSegment
class _WaveFormatError(ValueError, AssertionError):
    """Raised by read_wave when a WAV file has an unsupported format.

    Also derives from AssertionError because the checks used to be plain
    ``assert`` statements; callers that caught those keep working.
    """


def read_wave(path):
    """Read a mono, 16-bit PCM WAV file.

    Args:
        path: Path of the WAV file to read.

    Returns:
        A ``(pcm_data, sample_rate)`` tuple: the raw bytes of every frame in
        the file and the sample rate in Hz.

    Raises:
        ValueError: If the file is not mono, is not 16-bit, or its sample
            rate is not 8000, 16000, 32000 or 48000 Hz. The exception is
            also an AssertionError for backward compatibility.
    """
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        # Explicit raises instead of asserts: asserts are removed when Python
        # runs with -O, which would let unsupported audio through silently.
        num_channels = wf.getnchannels()
        if num_channels != 1:
            raise _WaveFormatError(
                "Audio must be mono (got {} channels)".format(num_channels))
        sample_width = wf.getsampwidth()
        if sample_width != 2:
            raise _WaveFormatError(
                "Audio must be 16-bit PCM (got {}-byte samples)".format(
                    sample_width))
        sample_rate = wf.getframerate()
        if sample_rate not in (8000, 16000, 32000, 48000):
            raise _WaveFormatError(
                "Sample rate must be 8000, 16000, 32000 or 48000 Hz "
                "(got {})".format(sample_rate))
        pcm_data = wf.readframes(wf.getnframes())
        return pcm_data, sample_rate
def write_wave(path, audio, sample_rate):
    """Write raw PCM bytes to a mono, 16-bit WAV file.

    Args:
        path: Destination file path; the file is opened in 'wb' mode.
        audio: Bytes object holding the PCM frames to write.
        sample_rate: Frame rate stored in the WAV header, in Hz.
    """
    with wave.open(path, 'wb') as out:
        # Fields: (nchannels, sampwidth, framerate, nframes, comptype,
        # compname). nframes=0 and 'NONE' are the writer's defaults, so this
        # sets the same three values as individual setter calls would.
        out.setparams((1, 2, sample_rate, 0, 'NONE', 'not compressed'))
        out.writeframes(audio)
def frame_generator(frame_duration_ms, audio, sample_rate):
    """Yield consecutive fixed-duration frames sliced from PCM audio bytes.

    The frame size is computed for 2 bytes per sample (16-bit mono PCM).
    Only complete frames are yielded; a trailing partial frame is dropped.

    Args:
        frame_duration_ms: Duration of each frame in milliseconds.
        audio: Bytes-like object holding the PCM data.
        sample_rate: Sample rate of ``audio`` in Hz.

    Yields:
        Slices of ``audio``, each exactly one frame long.

    Raises:
        ValueError: If the duration and sample rate give a frame size that is
            not positive (raised when the generator is first advanced).
    """
    # Multiply before dividing so integer inputs stay exact, and count whole
    # samples first so a frame never splits a 2-byte sample.
    samples_per_frame = int(sample_rate * frame_duration_ms / 1000)
    frame_bytes = samples_per_frame * 2
    if frame_bytes <= 0:
        # A zero-length frame would otherwise make the loop spin forever.
        raise ValueError(
            "frame_duration_ms and sample_rate must give a positive frame "
            "size (got {} bytes)".format(frame_bytes))
    # The "+ 1" keeps the final frame when the audio ends exactly on a frame
    # boundary; a strict "<" comparison would drop it.
    for offset in range(0, len(audio) - frame_bytes + 1, frame_bytes):
        yield audio[offset:offset + frame_bytes]
def _drop_long_silences(frames, is_speech, frame_duration_ms, threshold_ms):
    """Return the frames to keep after removing over-long non-speech runs.

    A run of consecutive non-speech frames is kept in full when its total
    duration is below ``threshold_ms`` and removed in full otherwise.

    Args:
        frames: Iterable of audio frames, each ``frame_duration_ms`` long.
        is_speech: Callable taking a frame and returning a truthy value when
            the frame contains speech.
        frame_duration_ms: Duration of every frame in milliseconds.
        threshold_ms: Silence duration, in milliseconds, at which a run is
            removed.

    Returns:
        List of the kept frames in their original order.
    """
    kept = []
    pending = []        # silent frames seen since the last speech frame
    silent_frames = 0   # length of the current silent run, in frames
    for frame in frames:
        if is_speech(frame):
            # A short preceding silence is kept; a long one has already been
            # discarded, so ``pending`` is empty in that case.
            kept.extend(pending)
            kept.append(frame)
            pending = []
            silent_frames = 0
            continue
        silent_frames += 1
        # Counting frames avoids the drift of repeatedly adding 0.03 s.
        if silent_frames * frame_duration_ms >= threshold_ms:
            pending = []
        else:
            pending.append(frame)
    # A short silence at the very end is kept, like any other short silence.
    kept.extend(pending)
    return kept


def remove_long_silences(input_path, output_path, silence_threshold_sec=3,
                         *, aggressiveness=2, frame_duration_ms=30):
    """Copy an audio file to a WAV file with long silences removed.

    The input is converted to mono, 16 kHz, 16-bit PCM and split into frames.
    Each frame is classified with WebRTC VAD. Runs of non-speech lasting
    ``silence_threshold_sec`` or longer are removed entirely; shorter runs
    are kept. The result is written to ``output_path`` and a confirmation
    line is printed.

    Args:
        input_path: Path of the audio file to load with pydub.
        output_path: Path of the WAV file to write.
        silence_threshold_sec: Silence duration, in seconds, at which a run
            of non-speech frames is removed.
        aggressiveness: Mode passed to ``webrtcvad.Vad``.
        frame_duration_ms: Length of each analysed frame in milliseconds.
            NOTE(review): webrtcvad documents 10, 20 and 30 ms as the
            accepted frame lengths - confirm before using other values.
    """
    # Convert to mono / 16 kHz / 16-bit so the frames match what the VAD and
    # write_wave expect.
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_channels(1).set_frame_rate(16000).set_sample_width(2)
    # Take the PCM bytes straight from the segment. Exporting to a fixed
    # "temp.wav" left a stray file behind and collided between runs.
    pcm_data = audio.raw_data
    sample_rate = audio.frame_rate

    vad = webrtcvad.Vad(aggressiveness)
    kept_frames = _drop_long_silences(
        frame_generator(frame_duration_ms, pcm_data, sample_rate),
        lambda frame: vad.is_speech(frame, sample_rate),
        frame_duration_ms,
        silence_threshold_sec * 1000,
    )

    # Join the kept frames back into one PCM stream.
    output_audio = b''.join(kept_frames)
    write_wave(output_path, output_audio, sample_rate)
    print("Done. Saved to:", output_path)
if __name__ == "__main__":
    # Script entry point: trim silences of 3 seconds or more from input.wav
    # and write the result to output_trimmed.wav.
    remove_long_silences("input.wav", "output_trimmed.wav", silence_threshold_sec=3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment