Created
February 24, 2026 10:46
-
-
Save ducnh1022/701b48cc874a2a617d227ef7b4697413 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import torch | |
| import torchaudio | |
| import numpy as np | |
| import soundfile as sf | |
| from speechbrain.pretrained import SpeakerRecognition | |
| from scipy.spatial.distance import cosine | |
# ============================
# CONFIG
# ============================
SEGMENT_DURATION = 2.0  # window length in seconds for each scanned segment
THRESHOLD = 0.75  # cosine-similarity cutoff for "same speaker"; tune empirically
# ============================
# Load model
# ============================
# Downloads (on first run) and loads the ECAPA-TDNN speaker-verification
# model from the SpeechBrain hub; weights are cached in ./pretrained_model.
print("Loading speaker model...")
verification = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_model"
)
# ============================
# Extract embedding
# ============================
def get_embedding(file_path):
    """Return the speaker embedding for an audio file as a 1-D numpy array.

    Parameters
    ----------
    file_path : str
        Path to an audio file readable by torchaudio.
        NOTE(review): assumes the file's sample rate matches what the
        ECAPA model expects (16 kHz) — confirm upstream.

    Returns
    -------
    numpy.ndarray
        The squeezed embedding vector on CPU.
    """
    signal, fs = torchaudio.load(file_path)
    # Inference only: disable autograd, matching the no_grad usage in
    # extract_owner_segments and avoiding a needless graph build.
    with torch.no_grad():
        embedding = verification.encode_batch(signal)
    return embedding.squeeze().detach().cpu().numpy()
# ============================
# Extract owner voice from long file
# ============================
def extract_owner_segments(owner_embedding, long_audio_path, output_path):
    """Scan a long recording in fixed-size windows and save the ones that
    match the owner's voice.

    Each non-overlapping window of SEGMENT_DURATION seconds is embedded
    with the speaker model; windows whose cosine similarity to
    ``owner_embedding`` exceeds THRESHOLD are concatenated and written to
    ``output_path``. A trailing partial window is skipped.
    """
    audio, sr = torchaudio.load(long_audio_path)
    window = int(sr * SEGMENT_DURATION)
    n_samples = audio.shape[1]
    matched = []

    print("Scanning long audio...")
    offset = 0
    while offset + window <= n_samples:
        chunk = audio[:, offset:offset + window]
        # Inference only — no gradients needed.
        with torch.no_grad():
            chunk_emb = verification.encode_batch(chunk)
        chunk_emb = chunk_emb.squeeze().cpu().numpy()
        similarity = 1 - cosine(owner_embedding, chunk_emb)
        print(f"Similarity: {similarity:.3f}")
        if similarity > THRESHOLD:
            matched.append(chunk)
        offset += window

    if not matched:
        print("No owner voice detected.")
        return
    merged = torch.cat(matched, dim=1)
    sf.write(output_path, merged.squeeze().numpy(), sr)
    print("Owner voice saved to:", output_path)
# ============================
# MAIN
# ============================
if __name__ == "__main__":
    # Inputs: a short clean reference clip of the owner's voice, and the
    # long recording to filter.
    reference_clip = "owner.wav"
    recording = "meeting.wav"

    print("Extracting owner embedding...")
    reference_embedding = get_embedding(reference_clip)

    extract_owner_segments(
        reference_embedding,
        recording,
        output_path="owner_only.wav",
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment