66 lines
1.9 KiB
Python
66 lines
1.9 KiB
Python
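"""
Speaker Classifier - voiceprint quality assessment and gender classification.

Provides quality scoring and gender classification helpers; serves as an
auxiliary module for main_fixed.py.
"""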
|
|
|
|
import numpy as np
|
|
|
|
|
|
def compute_embedding_quality(embeddings, labels):
    """Score each embedding by cosine similarity to its cluster centroid.

    For every distinct label the centroid is the mean of the embeddings
    carrying that label; each embedding is then compared with the centroid
    of its own cluster.

    Args:
        embeddings: Array-like of shape [n_segments, dim], one voiceprint
            vector per row (documented upstream as 192-dimensional).
        labels: Array-like of shape [n_segments] with the cluster label of
            each row. Plain Python lists are accepted as well as arrays.

    Returns:
        numpy.ndarray of shape [n_segments] holding one cosine similarity
        per embedding, in [-1, 1]. A zero-norm embedding or centroid scores
        0.0. Empty input yields an empty array.
    """
    # Coerce to an array so element-wise comparison works even when the
    # caller passes a list (list == scalar would be a single bool).
    labels = np.asarray(labels).reshape(-1)
    if labels.size == 0:
        return np.array([])

    embeddings = np.asarray(embeddings)
    if not np.issubdtype(embeddings.dtype, np.floating):
        embeddings = embeddings.astype(np.float64)

    # cluster_index[i] is the row of `centroids` that segment i belongs to.
    unique_labels, cluster_index = np.unique(labels, return_inverse=True)
    cluster_index = cluster_index.reshape(-1)

    centroids = np.stack(
        [
            embeddings[cluster_index == k].mean(axis=0)
            for k in range(len(unique_labels))
        ]
    )

    def _unit_rows(matrix):
        # L2-normalise rows; zero rows stay zero so their similarity is 0.
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return matrix / norms

    # Row-wise dot product of unit vectors == cosine similarity.
    similarities = np.sum(
        _unit_rows(embeddings) * _unit_rows(centroids)[cluster_index], axis=1
    )
    # Guard against floating-point drift slightly outside the valid range.
    return np.clip(similarities, -1.0, 1.0)
|
|
|
|
|
|
def classify_gender(audio_wav, sample_rate, classifier):
    """Classify the speaker gender of an audio segment (best effort).

    Args:
        audio_wav: Audio waveform as a numpy array (other array-likes are
            converted to a contiguous float32 array).
        sample_rate: Sampling rate of the waveform. Currently unused by this
            function; kept so existing callers keep working.
        classifier: Object exposing ``classify_batch(tensor)`` whose first
            return element holds per-class scores (documented upstream as a
            SpeechBrain EncoderClassifier, gender-recognition-ecapa), or
            ``None`` when no model is available.

    Returns:
        dict: ``{"gender": "male"|"female"|"unknown", "confidence": float}``.
        The "unknown" result (confidence 0.0) is returned when there is no
        classifier, the audio is missing/empty, fewer than two class scores
        come back, or inference fails for any reason (the failure is logged,
        never raised).
    """
    import logging

    unknown = {"gender": "unknown", "confidence": 0.0}
    if classifier is None or audio_wav is None or len(audio_wav) == 0:
        return unknown

    try:
        import torch

        # from_numpy needs a real, contiguous ndarray; normalise the input
        # instead of letting lists / strided views fail silently.
        waveform = np.ascontiguousarray(audio_wav, dtype=np.float32)
        batch = torch.from_numpy(waveform).unsqueeze(0)
        with torch.no_grad():  # inference only, no autograd graph needed
            out = classifier.classify_batch(batch)
            probs = (
                torch.softmax(out[0], dim=-1).reshape(-1).detach().cpu().numpy()
            )
    except Exception:
        # Deliberately best-effort: report the problem but never crash the
        # caller's pipeline over a failed gender prediction.
        logging.getLogger(__name__).warning(
            "Gender classification failed; returning 'unknown'", exc_info=True
        )
        return unknown

    if probs.size < 2:
        return unknown

    best = int(np.argmax(probs))
    # NOTE(review): assumes class index 0 is "male" and every other index is
    # "female" - confirm against the classifier's label encoder.
    gender = "male" if best == 0 else "female"
    return {"gender": gender, "confidence": float(probs[best])}
|