Skip to content

Instantly share code, notes, and snippets.

@ducnh1022
Created February 24, 2026 10:46
Show Gist options
  • Select an option

  • Save ducnh1022/701b48cc874a2a617d227ef7b4697413 to your computer and use it in GitHub Desktop.

Select an option

Save ducnh1022/701b48cc874a2a617d227ef7b4697413 to your computer and use it in GitHub Desktop.
import torch
import torchaudio
import numpy as np
import soundfile as sf
from speechbrain.pretrained import SpeakerRecognition
from scipy.spatial.distance import cosine
# ============================
# CONFIG
# ============================
# Length, in seconds, of each window cut from the long recording.
SEGMENT_DURATION = 2.0  # seconds
# Cosine-similarity cutoff: a window is kept when its similarity to the owner
# embedding is above this value.
THRESHOLD = 0.75  # tune empirically
# ============================
# Load model
# ============================
# Runs at import time; the resulting `verification` object is the encoder
# used by every embedding call in this file.
print("Loading speaker model...")
verification = SpeakerRecognition.from_hparams(
source="speechbrain/spkrec-ecapa-voxceleb",
savedir="pretrained_model"
)
# ============================
# Extract embedding
# ============================
def get_embedding(file_path):
    """Return the speaker embedding of an audio file as a 1-D NumPy array.

    The waveform is downmixed to one channel and resampled to 16 kHz before
    it is encoded. ``torchaudio.load`` yields a ``(channels, samples)``
    tensor and ``encode_batch`` reads the first dimension as the batch, so a
    stereo file would otherwise produce one embedding per channel and a 2-D
    result that cosine distance cannot consume.

    Args:
        file_path: Path of the audio file to read with ``torchaudio.load``.

    Returns:
        numpy.ndarray: The flattened embedding produced by
        ``verification.encode_batch``.
    """
    # Sample rate the spkrec-ecapa-voxceleb model was trained on.
    model_sample_rate = 16000

    signal, fs = torchaudio.load(file_path)

    # Collapse channels so the model sees a batch of exactly one utterance.
    if signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)
    if fs != model_sample_rate:
        signal = torchaudio.functional.resample(signal, fs, model_sample_rate)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        embedding = verification.encode_batch(signal)
    return embedding.reshape(-1).cpu().numpy()
# ============================
# Extract owner voice from long file
# ============================
def extract_owner_segments(owner_embedding, long_audio_path, output_path):
    """Write the parts of a long recording that match the owner's voice.

    The recording is scanned in consecutive, non-overlapping windows of
    ``SEGMENT_DURATION`` seconds. Each window is scored by the cosine
    similarity between its speaker embedding and ``owner_embedding``; windows
    scoring above ``THRESHOLD`` are concatenated in order and written to
    ``output_path``. A trailing window shorter than ``SEGMENT_DURATION`` is
    not scored. If no window qualifies, nothing is written.

    Scoring uses a mono, 16 kHz copy of each window, because the encoder
    treats the first tensor dimension as the batch and was trained on 16 kHz
    audio. The audio that is saved keeps the recording's original channels
    and sample rate.

    Args:
        owner_embedding: 1-D reference embedding, e.g. the result of
            ``get_embedding``.
        long_audio_path: Path of the recording to scan.
        output_path: Path of the audio file written when a match is found.

    Raises:
        ValueError: If ``SEGMENT_DURATION`` is shorter than one sample at the
            recording's sample rate.
    """
    # Sample rate the spkrec-ecapa-voxceleb model was trained on.
    model_sample_rate = 16000

    audio, sr = torchaudio.load(long_audio_path)
    total_samples = audio.shape[1]
    segment_samples = int(sr * SEGMENT_DURATION)
    if segment_samples < 1:
        raise ValueError(
            "SEGMENT_DURATION is shorter than one sample at this sample rate"
        )

    # Build the resampler once so its filter is reused for every window.
    resampler = None
    if sr != model_sample_rate:
        resampler = torchaudio.transforms.Resample(sr, model_sample_rate)

    owner_segments = []
    print("Scanning long audio...")
    # Stopping at the last full window drops the short tail, if any.
    last_start = total_samples - segment_samples
    with torch.no_grad():
        for start in range(0, last_start + 1, segment_samples):
            segment = audio[:, start:start + segment_samples]

            # Mono, model-rate copy used only for scoring.
            probe = segment.mean(dim=0, keepdim=True)
            if resampler is not None:
                probe = resampler(probe)

            emb = verification.encode_batch(probe).reshape(-1).cpu().numpy()
            similarity = 1 - cosine(owner_embedding, emb)
            print(f"Similarity: {similarity:.3f}")
            if similarity > THRESHOLD:
                owner_segments.append(segment)

    if not owner_segments:
        print("No owner voice detected.")
        return

    final_audio = torch.cat(owner_segments, dim=1)
    # soundfile expects (frames, channels); torch holds (channels, frames).
    sf.write(output_path, final_audio.t().contiguous().numpy(), sr)
    print("Owner voice saved to:", output_path)
# ============================
# MAIN
# ============================
if __name__ == "__main__":
    # Inputs: a short clip of the owner speaking, and the recording to filter.
    reference_wav = "owner.wav"
    meeting_wav = "meeting.wav"

    print("Extracting owner embedding...")
    # The reference embedding is computed first, then used to scan the
    # recording; matching audio is written to owner_only.wav.
    extract_owner_segments(
        get_embedding(reference_wav),
        meeting_wav,
        output_path="owner_only.wav",
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment