# File: momentry_core/scripts/asrx_self/speaker_classifier.py
# (66 lines, 1.9 KiB, Python)

"""
Speaker Classifier - 聲紋品質評估與性別分類
提供品質計算與性別分類功能,作為 main_fixed.py 的輔助模組。
"""
import numpy as np
def _unit_rows(matrix):
    """Scale each row of ``matrix`` to unit L2 norm; all-zero rows stay zero."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Divide zero-norm rows by 1 so they remain zero instead of becoming NaN.
    return matrix / np.where(norms > 0, norms, 1.0)


def compute_embedding_quality(embeddings, labels):
    """Score each embedding by cosine similarity to its own cluster centroid.

    The centroid of a cluster is the mean of the embeddings carrying that
    label. Each embedding's quality is the cosine of the angle between the
    embedding and that centroid.

    Args:
        embeddings: Array-like of shape [n_segments, dim] holding one speaker
            embedding per row. Lists are accepted and converted to a float
            array.
        labels: Array-like of shape [n_segments] with the cluster label of
            each row. Must contain one entry per embedding row. Lists are
            accepted; labels may be any type ``np.unique`` can sort.

    Returns:
        np.ndarray of shape [n_segments] (float64) with the cosine similarity
        per segment. The value is 0.0 when either the embedding or its
        centroid is an all-zero vector. Cosine similarity can be negative
        for embeddings pointing away from their centroid. An empty input
        yields an empty array.
    """
    embeddings = np.asarray(embeddings, dtype=float)
    # Coerce to an ndarray: comparing a plain list with a scalar would yield a
    # single bool rather than a per-row mask.
    labels = np.asarray(labels).reshape(-1)

    if embeddings.shape[0] == 0:
        return np.array([])

    # ``cluster_of_row[i]`` is the index into ``unique_labels`` for row i.
    unique_labels, cluster_of_row = np.unique(labels, return_inverse=True)
    cluster_of_row = cluster_of_row.reshape(-1)

    centroids = np.zeros((len(unique_labels), embeddings.shape[1]))
    for cluster_idx in range(len(unique_labels)):
        centroids[cluster_idx] = embeddings[cluster_of_row == cluster_idx].mean(axis=0)

    unit_embeddings = _unit_rows(embeddings)
    unit_centroids = _unit_rows(centroids)

    # Row-wise dot product of unit vectors == cosine similarity.
    return np.sum(unit_embeddings * unit_centroids[cluster_of_row], axis=1)
def classify_gender(audio_wav, sample_rate, classifier):
    """Classify the speaker gender of one audio segment (best effort).

    Args:
        audio_wav: Audio waveform as a numpy array (any array-like of samples
            is accepted and converted to float32).
        sample_rate: Sampling rate of ``audio_wav``. Currently unused by this
            function; kept for interface compatibility.
        classifier: Object exposing ``classify_batch(tensor)`` whose first
            return element holds the per-class scores (intended to be a
            SpeechBrain EncoderClassifier for gender recognition). May be
            ``None``.

    Returns:
        dict: ``{"gender": "male"|"female"|"unknown", "confidence": float}``.
        ``"unknown"`` with confidence 0.0 is returned when there is no
        classifier, no audio, fewer than two class scores, or when
        classification fails for any reason (the failure is logged).
    """
    default = {"gender": "unknown", "confidence": 0.0}
    if classifier is None or audio_wav is None:
        return default

    try:
        import torch

        # torch.from_numpy needs a real, contiguous ndarray; this also lets
        # lists and strided views through.
        samples = np.ascontiguousarray(audio_wav, dtype=np.float32)
        if samples.size == 0:
            return default

        seg_tensor = torch.from_numpy(samples).float().unsqueeze(0)
        out = classifier.classify_batch(seg_tensor)
        probs = torch.softmax(out[0], dim=-1).squeeze().cpu().detach().numpy()

        # Only a flat vector with at least two classes can be interpreted.
        if probs.ndim == 1 and probs.shape[0] >= 2:
            idx = int(np.argmax(probs))
            # NOTE(review): assumes class index 0 is "male" in the model's
            # label encoding - confirm against the classifier's label map.
            label = "male" if idx == 0 else "female"
            return {"gender": label, "confidence": float(probs[idx])}
    except Exception:
        # Deliberately best-effort: a failed classification must not abort
        # the caller, but it should be visible in the logs.
        import logging

        logging.getLogger(__name__).warning(
            "Gender classification failed; returning 'unknown'", exc_info=True
        )
    return default