Created
February 24, 2026 10:46
-
-
Save ducnh1022/701b48cc874a2a617d227ef7b4697413 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import torch | |
| import torchaudio | |
| import numpy as np | |
| import soundfile as sf | |
| from speechbrain.pretrained import SpeakerRecognition | |
| from scipy.spatial.distance import cosine | |
# ============================
# CONFIG
# ============================
SEGMENT_DURATION = 2.0  # window length in seconds for each scanned segment
THRESHOLD = 0.75  # cosine-similarity cutoff for "same speaker"; tune empirically
# ============================
# Load model
# ============================
# Downloads (on first run) and loads the ECAPA-TDNN speaker-verification
# model from the SpeechBrain hub; weights are cached in ./pretrained_model.
print("Loading speaker model...")
verification = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_model"
)
# ============================
# Extract embedding
# ============================
def get_embedding(file_path):
    """Return the speaker embedding for an audio file as a 1-D numpy array.

    Parameters
    ----------
    file_path : str
        Path to an audio file readable by torchaudio.
        NOTE(review): assumes the file's sample rate matches what the
        ECAPA model expects (16 kHz) — confirm upstream.

    Returns
    -------
    numpy.ndarray
        The squeezed embedding vector on CPU.
    """
    signal, fs = torchaudio.load(file_path)
    # Inference only: disable autograd, matching the no_grad usage in
    # extract_owner_segments and avoiding a needless graph build.
    with torch.no_grad():
        embedding = verification.encode_batch(signal)
    return embedding.squeeze().detach().cpu().numpy()
# ============================
# Extract owner voice from long file
# ============================
def extract_owner_segments(owner_embedding, long_audio_path, output_path):
    """Scan a long recording in fixed-size windows and save the ones that
    match the owner's voice.

    Each non-overlapping window of SEGMENT_DURATION seconds is embedded
    with the speaker model; windows whose cosine similarity to
    ``owner_embedding`` exceeds THRESHOLD are concatenated and written to
    ``output_path``. A trailing partial window is skipped.
    """
    audio, sr = torchaudio.load(long_audio_path)
    window = int(sr * SEGMENT_DURATION)
    n_samples = audio.shape[1]
    matched = []

    print("Scanning long audio...")
    offset = 0
    while offset + window <= n_samples:
        chunk = audio[:, offset:offset + window]
        # Inference only — no gradients needed.
        with torch.no_grad():
            chunk_emb = verification.encode_batch(chunk)
        chunk_emb = chunk_emb.squeeze().cpu().numpy()
        similarity = 1 - cosine(owner_embedding, chunk_emb)
        print(f"Similarity: {similarity:.3f}")
        if similarity > THRESHOLD:
            matched.append(chunk)
        offset += window

    if not matched:
        print("No owner voice detected.")
        return
    merged = torch.cat(matched, dim=1)
    sf.write(output_path, merged.squeeze().numpy(), sr)
    print("Owner voice saved to:", output_path)
# ============================
# MAIN
# ============================
if __name__ == "__main__":
    # Inputs: a short clean reference clip of the owner's voice, and the
    # long recording to filter.
    reference_clip = "owner.wav"
    recording = "meeting.wav"

    print("Extracting owner embedding...")
    reference_embedding = get_embedding(reference_clip)

    extract_owner_segments(
        reference_embedding,
        recording,
        output_path="owner_only.wav",
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment