ducnh1022/gist:06355de13df1975f8decbc57951f1b9e

## gistfile1.txt
import webrtcvad
import collections
import sys
import wave
import contextlib
from pydub import AudioSegment


def read_wave(path):
    """Load a mono, 16-bit PCM WAV file.

    Args:
        path: Location of the WAV file; handed to ``wave.open``.

    Returns:
        A ``(pcm_data, sample_rate)`` tuple: every frame of the file as raw
        bytes, and the frame rate stored in its header.

    Raises:
        AssertionError: If the file is not single-channel, does not use
            2 bytes per sample, or has a frame rate other than 8000, 16000,
            32000 or 48000.
    """
    reader = wave.open(path, 'rb')
    with contextlib.closing(reader):
        # Check the header fields before pulling any sample data.
        assert reader.getnchannels() == 1, "Audio must be mono"
        assert reader.getsampwidth() == 2, "Audio must be 16-bit PCM"
        rate = reader.getframerate()
        assert rate in (8000, 16000, 32000, 48000)
        frame_count = reader.getnframes()
        return reader.readframes(frame_count), rate


def write_wave(path, audio, sample_rate):
    """Save raw PCM bytes as a mono, 16-bit WAV file.

    Args:
        path: Destination file; handed to ``wave.open`` in write mode.
        audio: Raw sample bytes to store as the file's frames.
        sample_rate: Frame rate to record in the WAV header.
    """
    mono, bytes_per_sample = 1, 2
    writer = wave.open(path, 'wb')
    with contextlib.closing(writer):
        writer.setnchannels(mono)
        writer.setsampwidth(bytes_per_sample)
        writer.setframerate(sample_rate)
        writer.writeframes(audio)


def frame_generator(frame_duration_ms, audio, sample_rate):
    """Yield consecutive, equally sized frames cut from 16-bit PCM bytes.

    Args:
        frame_duration_ms: Length of each frame in milliseconds.
        audio: Raw PCM bytes, 2 bytes per sample.
        sample_rate: Samples per second of ``audio``.

    Yields:
        Byte slices of ``audio``, each exactly one frame long. Leftover bytes
        at the end that do not fill a whole frame are not yielded.

    Raises:
        ValueError: If the computed frame size is not positive (raised when
            the generator is first advanced).
    """
    # Two bytes per sample, hence the factor of 2.
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    if n <= 0:
        # A zero or negative step would never advance through the buffer.
        raise ValueError("frame size must be positive")
    # The stop value of len(audio) - n + 1 keeps the final frame when the
    # buffer length is an exact multiple of the frame size.
    for offset in range(0, len(audio) - n + 1, n):
        yield audio[offset:offset + n]


def remove_long_silences(input_path, output_path, silence_threshold_sec=3):
    """Write a copy of an audio file with long non-speech stretches removed.

    The input is converted to 16 kHz mono 16-bit PCM, cut into 30 ms frames
    and each frame is classified with ``webrtcvad.Vad(2)``. A run of
    non-speech frames shorter than ``silence_threshold_sec`` is kept as is;
    a run that reaches the threshold is dropped entirely. The result is
    written as a WAV file and a confirmation line is printed.

    Args:
        input_path: Audio file to read; loaded with
            ``AudioSegment.from_file``.
        output_path: Where the trimmed WAV file is written.
        silence_threshold_sec: Non-speech runs lasting at least this many
            seconds are removed.
    """
    import os
    import tempfile

    frame_ms = 30

    # Convert to 16 kHz mono 16-bit audio.
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_channels(1).set_frame_rate(16000).set_sample_width(2)

    # Use a private scratch directory rather than a fixed "temp.wav" in the
    # working directory, so nothing is overwritten or left behind.
    with tempfile.TemporaryDirectory() as workdir:
        wav_path = os.path.join(workdir, "temp.wav")
        # export() hands back the open output file; close it so the data is
        # flushed before the file is read again.
        audio.export(wav_path, format="wav").close()
        pcm_data, sample_rate = read_wave(wav_path)

    vad = webrtcvad.Vad(2)

    # Track silence in whole milliseconds: repeatedly adding 0.03 as a float
    # drifts and can shift the threshold by one frame.
    threshold_ms = silence_threshold_sec * 1000

    voiced_frames = []
    silence_buffer = []
    silence_ms = 0

    for frame in frame_generator(frame_ms, pcm_data, sample_rate):
        if vad.is_speech(frame, sample_rate):
            # The buffer only holds a pause that stayed under the threshold;
            # it is already empty after a long one.
            voiced_frames.extend(silence_buffer)
            silence_buffer = []
            silence_ms = 0
            voiced_frames.append(frame)
            continue

        silence_ms += frame_ms
        if silence_ms >= threshold_ms:
            # The pause has become long: discard everything buffered so far
            # and stop collecting until speech resumes.
            silence_buffer = []
        else:
            silence_buffer.append(frame)

    # Keep a short pause at the very end, as short pauses elsewhere are kept.
    voiced_frames.extend(silence_buffer)

    # Join the kept frames and write them out.
    output_audio = b''.join(voiced_frames)
    write_wave(output_path, output_audio, sample_rate)

    print("Done. Saved to:", output_path)


if __name__ == "__main__":
    # Script entry point: run the trimmer on fixed demo paths.
    job = {
        "input_path": "input.wav",
        "output_path": "output_trimmed.wav",
        "silence_threshold_sec": 3,
    }
    remove_long_silences(**job)
	import webrtcvad
	import collections
	import sys
	import wave
	import contextlib
	from pydub import AudioSegment


	def read_wave(path):
	    """Load a mono, 16-bit PCM WAV file.

	    Args:
	        path: Location of the WAV file; handed to ``wave.open``.

	    Returns:
	        A ``(pcm_data, sample_rate)`` tuple: every frame of the file as
	        raw bytes, and the frame rate stored in its header.

	    Raises:
	        AssertionError: If the file is not single-channel, does not use
	            2 bytes per sample, or has a frame rate other than 8000,
	            16000, 32000 or 48000.
	    """
	    reader = wave.open(path, 'rb')
	    with contextlib.closing(reader):
	        # Check the header fields before pulling any sample data.
	        assert reader.getnchannels() == 1, "Audio must be mono"
	        assert reader.getsampwidth() == 2, "Audio must be 16-bit PCM"
	        rate = reader.getframerate()
	        assert rate in (8000, 16000, 32000, 48000)
	        frame_count = reader.getnframes()
	        return reader.readframes(frame_count), rate


	def write_wave(path, audio, sample_rate):
	    """Save raw PCM bytes as a mono, 16-bit WAV file.

	    Args:
	        path: Destination file; handed to ``wave.open`` in write mode.
	        audio: Raw sample bytes to store as the file's frames.
	        sample_rate: Frame rate to record in the WAV header.
	    """
	    with wave.open(path, 'wb') as wf:
	        wf.setnchannels(1)
	        wf.setsampwidth(2)
	        wf.setframerate(sample_rate)
	        wf.writeframes(audio)


	def frame_generator(frame_duration_ms, audio, sample_rate):
	    """Yield consecutive, equally sized frames cut from 16-bit PCM bytes.

	    Args:
	        frame_duration_ms: Length of each frame in milliseconds.
	        audio: Raw PCM bytes, 2 bytes per sample.
	        sample_rate: Samples per second of ``audio``.

	    Yields:
	        Byte slices of ``audio``, each exactly one frame long. Leftover
	        bytes at the end that do not fill a whole frame are not yielded.

	    Raises:
	        ValueError: If the computed frame size is not positive (raised
	            when the generator is first advanced).
	    """
	    # Two bytes per sample, hence the factor of 2.
	    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
	    if n <= 0:
	        # A zero or negative step would never advance through the buffer.
	        raise ValueError("frame size must be positive")
	    # The stop value of len(audio) - n + 1 keeps the final frame when the
	    # buffer length is an exact multiple of the frame size.
	    for offset in range(0, len(audio) - n + 1, n):
	        yield audio[offset:offset + n]


	def remove_long_silences(input_path, output_path, silence_threshold_sec=3):
	    """Write a copy of an audio file with long non-speech stretches removed.

	    The input is converted to 16 kHz mono 16-bit PCM, cut into 30 ms
	    frames and each frame is classified with ``webrtcvad.Vad(2)``. A run
	    of non-speech frames shorter than ``silence_threshold_sec`` is kept as
	    is; a run that reaches the threshold is dropped entirely. The result
	    is written as a WAV file and a confirmation line is printed.

	    Args:
	        input_path: Audio file to read; loaded with
	            ``AudioSegment.from_file``.
	        output_path: Where the trimmed WAV file is written.
	        silence_threshold_sec: Non-speech runs lasting at least this many
	            seconds are removed.
	    """
	    import os
	    import tempfile

	    frame_ms = 30

	    # Convert to 16 kHz mono 16-bit audio.
	    audio = AudioSegment.from_file(input_path)
	    audio = audio.set_channels(1).set_frame_rate(16000).set_sample_width(2)

	    # Use a private scratch directory rather than a fixed "temp.wav" in
	    # the working directory, so nothing is overwritten or left behind.
	    with tempfile.TemporaryDirectory() as workdir:
	        wav_path = os.path.join(workdir, "temp.wav")
	        # export() hands back the open output file; close it so the data
	        # is flushed before the file is read again.
	        audio.export(wav_path, format="wav").close()
	        pcm_data, sample_rate = read_wave(wav_path)

	    vad = webrtcvad.Vad(2)

	    # Track silence in whole milliseconds: repeatedly adding 0.03 as a
	    # float drifts and can shift the threshold by one frame.
	    threshold_ms = silence_threshold_sec * 1000

	    voiced_frames = []
	    silence_buffer = []
	    silence_ms = 0

	    for frame in frame_generator(frame_ms, pcm_data, sample_rate):
	        if vad.is_speech(frame, sample_rate):
	            # The buffer only holds a pause that stayed under the
	            # threshold; it is already empty after a long one.
	            voiced_frames.extend(silence_buffer)
	            silence_buffer = []
	            silence_ms = 0
	            voiced_frames.append(frame)
	            continue

	        silence_ms += frame_ms
	        if silence_ms >= threshold_ms:
	            # The pause has become long: discard everything buffered so
	            # far and stop collecting until speech resumes.
	            silence_buffer = []
	        else:
	            silence_buffer.append(frame)

	    # Keep a short pause at the very end, as short pauses elsewhere are kept.
	    voiced_frames.extend(silence_buffer)

	    # Join the kept frames and write them out.
	    output_audio = b''.join(voiced_frames)
	    write_wave(output_path, output_audio, sample_rate)

	    print("Done. Saved to:", output_path)


	if __name__ == "__main__":
	    # Script entry point: run the trimmer on fixed demo paths.
	    remove_long_silences(
	        input_path="input.wav",
	        output_path="output_trimmed.wav",
	        silence_threshold_sec=3,
	    )
No results found