#!/opt/homebrew/bin/python3.11
"""
Speaker Encoder - speaker (voiceprint) feature extraction.

Uses an ECAPA-TDNN model to extract speaker embedding vectors.

Technical sources:
- ECAPA-TDNN: Desplanques et al. (2020), Interspeech
- Paper: https://arxiv.org/abs/2005.07143
- Model: SpeechBrain spkrec-ecapa-voxceleb
- Accuracy: EER 0.80% (VoxCeleb1)
"""

import torch
import numpy as np

try:
    from speechbrain.inference.speaker import EncoderClassifier
except ImportError as _exc:
    # Keep the module importable (the similarity/normalization helpers do not
    # need speechbrain); the failure is re-raised by load_speaker_encoder().
    EncoderClassifier = None
    _SPEECHBRAIN_IMPORT_ERROR = _exc
else:
    _SPEECHBRAIN_IMPORT_ERROR = None

# Sample rate (Hz) the encoder input is brought to before embedding.
MODEL_SAMPLE_RATE = 16000
# Dimensionality of the embedding vectors produced by the default model.
EMBEDDING_DIM = 192

_SIMILARITY_METHODS = ("cosine", "euclidean")


def load_speaker_encoder(model_name="speechbrain/spkrec-ecapa-voxceleb", device="cpu"):
    """
    Load the speaker encoder model.

    Args:
        model_name: Model identifier (HuggingFace source).
        device: Device string forwarded to SpeechBrain's ``run_opts``
            (defaults to "cpu", the previous hard-coded value).

    Returns:
        classifier: The loaded speaker encoder.

    Raises:
        ImportError: If speechbrain could not be imported.
    """
    if EncoderClassifier is None:
        raise ImportError(
            "speechbrain is required to load the speaker encoder"
        ) from _SPEECHBRAIN_IMPORT_ERROR

    print(f"[SpeakerEncoder] Loading model: {model_name}")

    classifier = EncoderClassifier.from_hparams(
        source=model_name,
        run_opts={"device": device},
    )

    print("[SpeakerEncoder] Model loaded successfully")
    print(f"[SpeakerEncoder] Embedding dimension: {EMBEDDING_DIM}")

    return classifier


def extract_speaker_embedding(classifier, audio_waveform, sample_rate=16000):
    """
    Extract a speaker embedding from an audio waveform.

    Args:
        classifier: Speaker encoder exposing ``encode_batch``.
        audio_waveform: Waveform as a numpy array, torch tensor or sequence
            of samples. 1-D input is treated as a single utterance; 2-D
            input is treated as [batch, time].
        sample_rate: Sample rate of ``audio_waveform`` in Hz. Audio that is
            not at 16 kHz is resampled before encoding.

    Returns:
        embedding: Embedding as a numpy array with singleton dimensions
            removed (a 192-dim vector for a single utterance).
    """
    # as_tensor accepts ndarray / tensor / list alike and guarantees float32.
    audio_tensor = torch.as_tensor(audio_waveform, dtype=torch.float32)

    # Ensure 2-D [batch, time].
    if audio_tensor.dim() == 1:
        audio_tensor = audio_tensor.unsqueeze(0)

    # Honour the declared sample rate instead of silently ignoring it.
    if sample_rate != MODEL_SAMPLE_RATE:
        import torchaudio

        audio_tensor = torchaudio.functional.resample(
            audio_tensor, int(sample_rate), MODEL_SAMPLE_RATE
        )

    with torch.no_grad():
        embedding = classifier.encode_batch(audio_tensor)

    return embedding.squeeze().cpu().numpy()


def extract_speaker_embeddings_batch(classifier, audio_segments, sample_rate=16000):
    """
    Extract speaker embeddings for multiple audio segments.

    Args:
        classifier: Speaker encoder exposing ``encode_batch``.
        audio_segments: Iterable of audio segments [numpy array, ...].
        sample_rate: Sample rate of the segments in Hz.

    Returns:
        embeddings: Embedding matrix [n_segments, 192]. An empty input
            yields an empty (0, 192) matrix instead of raising.
    """
    embeddings = []

    for i, audio in enumerate(audio_segments):
        embeddings.append(extract_speaker_embedding(classifier, audio, sample_rate))

        # Periodic progress report for long runs.
        if (i + 1) % 50 == 0:
            print(f"[SpeakerEncoder] Processed {i + 1} segments")

    if embeddings:
        embeddings = np.vstack(embeddings)
    else:
        # np.vstack([]) raises; return a well-formed empty matrix instead.
        embeddings = np.empty((0, EMBEDDING_DIM), dtype=np.float32)

    print(f"[SpeakerEncoder] Extracted {embeddings.shape[0]} embeddings")

    return embeddings


def compute_similarity_matrix(embeddings, method="cosine"):
    """
    Compute the pairwise speaker similarity matrix.

    Args:
        embeddings: Embedding matrix [n_segments, 192].
        method: Similarity measure ('cosine' or 'euclidean').

    Returns:
        similarity_matrix: Similarity matrix [n_segments, n_segments].

    Raises:
        ValueError: If ``method`` is not a supported similarity measure.
    """
    # Fail fast on a bad method before doing any work or importing sklearn.
    if method not in _SIMILARITY_METHODS:
        raise ValueError(f"Unknown method: {method}")

    # Sanitize input: replace NaN and +/-Inf with zeros.
    embeddings = np.nan_to_num(embeddings, nan=0.0, posinf=0.0, neginf=0.0)

    # Scale every row to unit length.
    embeddings = normalize_embeddings(embeddings)

    # Sanitize again in case normalization produced non-finite values.
    embeddings = np.nan_to_num(embeddings, nan=0.0, posinf=0.0, neginf=0.0)

    if method == "cosine":
        from sklearn.metrics.pairwise import cosine_similarity

        similarity = cosine_similarity(embeddings)
    else:
        from sklearn.metrics.pairwise import euclidean_distances

        # Map distances in [0, inf) to similarities in (0, 1].
        distances = euclidean_distances(embeddings)
        similarity = 1 / (1 + distances)

    # Guarantee a NaN-free result; undefined entries become a neutral 0.5.
    return np.nan_to_num(similarity, nan=0.5)


def normalize_embeddings(embeddings):
    """
    L2-normalize embedding vectors (unit length per row).

    Args:
        embeddings: Embedding matrix [n_segments, 192].

    Returns:
        normalized: The normalized embedding matrix.
    """
    from sklearn.preprocessing import normalize

    return normalize(embeddings, norm="l2")


def main():
    """Command-line smoke test: embed one audio file and print statistics."""
    import sys

    import torchaudio

    if len(sys.argv) < 2:
        print("Usage: python3 speaker_encoder.py <audio_file>")
        sys.exit(1)

    audio_path = sys.argv[1]

    print("[Test] Loading speaker encoder...")
    classifier = load_speaker_encoder()

    print(f"\n[Test] Loading audio: {audio_path}")
    wav, sr = torchaudio.load(audio_path)

    # Downmix multi-channel audio to mono; otherwise each channel would be
    # fed to the encoder as a separate batch item.
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)

    # Resample to 16 kHz.
    if sr != 16000:
        transform = torchaudio.transforms.Resample(sr, 16000)
        wav = transform(wav)

    print(f"[Test] Audio shape: {wav.shape}")
    print(f"[Test] Duration: {wav.shape[1] / 16000:.2f}s")

    # Extract the embedding.
    print("\n[Test] Extracting speaker embedding...")
    embedding = extract_speaker_embedding(classifier, wav.numpy())

    print(f"[Test] Embedding shape: {embedding.shape}")
    print(f"[Test] Embedding norm: {np.linalg.norm(embedding):.4f}")
    print(f"[Test] Embedding mean: {embedding.mean():.4f}")
    print(f"[Test] Embedding std: {embedding.std():.4f}")

    # Show a few embedding values.
    print("\n[Test] First 10 embedding values:")
    print(f"  {embedding[:10]}")


if __name__ == "__main__":
    main()