feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
This commit is contained in:
191
scripts/asrx_self/speaker_encoder.py
Normal file
191
scripts/asrx_self/speaker_encoder.py
Normal file
@@ -0,0 +1,191 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Speaker Encoder - speaker (voiceprint) feature extraction

Extracts speaker embedding vectors using the ECAPA-TDNN model.

Technical sources:
- ECAPA-TDNN: Desplanques et al. (2020), Interspeech
- Paper: https://arxiv.org/abs/2005.07143
- Model: SpeechBrain spkrec-ecapa-voxceleb
- Accuracy: EER 0.80% (VoxCeleb1)
|
||||
"""
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
from speechbrain.inference.speaker import EncoderClassifier
|
||||
|
||||
|
||||
def load_speaker_encoder(
    model_name="speechbrain/spkrec-ecapa-voxceleb",
    device="cpu",
):
    """Load a pretrained SpeechBrain speaker-embedding model.

    Args:
        model_name: Model identifier passed to
            ``EncoderClassifier.from_hparams`` as ``source``
            (a HuggingFace repository name by default).
        device: Device string forwarded to SpeechBrain through
            ``run_opts``. Defaults to ``"cpu"``, which is the value this
            function always used before the parameter was introduced.

    Returns:
        The loaded ``EncoderClassifier`` instance.
    """
    print(f"[SpeakerEncoder] Loading model: {model_name}")

    classifier = EncoderClassifier.from_hparams(
        source=model_name,
        run_opts={"device": device},
    )

    print("[SpeakerEncoder] Model loaded successfully")
    # NOTE(review): 192 is printed as a fixed value; it is not read back from
    # the loaded model, so it may be wrong for a non-default ``model_name``.
    print("[SpeakerEncoder] Embedding dimension: 192")

    return classifier
|
||||
|
||||
|
||||
def extract_speaker_embedding(classifier, audio_waveform, sample_rate=16000):
    """Extract a speaker embedding from an audio waveform.

    Args:
        classifier: Speaker encoder object exposing ``encode_batch(tensor)``.
        audio_waveform: Audio samples as a numpy array, a torch tensor, or
            anything else ``torch.as_tensor`` accepts (e.g. a list of
            floats). A 1-D input is treated as one utterance and gets a
            leading batch axis; inputs with more dimensions are passed to
            the encoder unchanged, so a 2-D input is read as
            ``[batch, time]``.
        sample_rate: Sampling rate of ``audio_waveform``. Currently unused:
            no resampling happens here, so the caller must supply audio at
            the rate the model expects. Kept for interface compatibility.

    Returns:
        The encoder output as a numpy array with every size-1 dimension
        squeezed out (presumably ``(192,)`` for a single utterance with the
        default model -- confirm against the encoder in use).
    """
    # as_tensor handles ndarrays, tensors and plain sequences alike and casts
    # integer / float64 data to float32; previously only ndarrays were cast.
    audio_tensor = torch.as_tensor(audio_waveform, dtype=torch.float32)

    # The encoder is called with a batch axis: [batch, time].
    if audio_tensor.dim() == 1:
        audio_tensor = audio_tensor.unsqueeze(0)

    # Inference only; no autograd graph is needed.
    with torch.no_grad():
        embedding = classifier.encode_batch(audio_tensor)

    return embedding.squeeze().detach().cpu().numpy()
|
||||
|
||||
|
||||
def extract_speaker_embeddings_batch(classifier, audio_segments, sample_rate=16000):
    """Extract speaker embeddings for a collection of audio segments.

    Segments are encoded one at a time with ``extract_speaker_embedding``
    and the results are stacked row-wise.

    Args:
        classifier: Speaker encoder, passed through to
            ``extract_speaker_embedding``.
        audio_segments: Iterable of audio segments (numpy arrays).
        sample_rate: Sampling rate, passed through to
            ``extract_speaker_embedding``.

    Returns:
        Embedding matrix built with ``np.vstack`` (documented for this
        module as ``[n_segments, 192]``). When ``audio_segments`` yields
        nothing, an empty ``(0, 192)`` float32 array is returned instead of
        raising.
    """
    embeddings = []

    for count, segment in enumerate(audio_segments, start=1):
        embeddings.append(
            extract_speaker_embedding(classifier, segment, sample_rate)
        )

        # Progress report every 50 segments.
        if count % 50 == 0:
            print(f"[SpeakerEncoder] Processed {count} segments")

    if embeddings:
        matrix = np.vstack(embeddings)
    else:
        # np.vstack raises ValueError on an empty list; return an empty
        # matrix using the 192-wide embedding size this module documents.
        matrix = np.empty((0, 192), dtype=np.float32)

    print(f"[SpeakerEncoder] Extracted {matrix.shape[0]} embeddings")

    return matrix
|
||||
|
||||
|
||||
def compute_similarity_matrix(embeddings, method="cosine"):
    """Compute a pairwise similarity matrix between speaker embeddings.

    Args:
        embeddings: Embedding matrix, one embedding per row
            (documented for this module as ``[n_segments, 192]``).
        method: ``"cosine"`` for cosine similarity, or ``"euclidean"`` for
            ``1 / (1 + distance)`` computed on the normalized embeddings.

    Returns:
        Similarity matrix of shape ``[n_segments, n_segments]``.

    Raises:
        ValueError: If ``method`` is not ``"cosine"`` or ``"euclidean"``.
    """
    # Validate first so a bad ``method`` fails before any work is done
    # (and before scikit-learn has to be imported).
    if method not in ("cosine", "euclidean"):
        raise ValueError(f"Unknown method: {method}")

    # Sanitize: replace NaN and +/-Inf with 0 before normalizing.
    embeddings = np.nan_to_num(embeddings, nan=0.0, posinf=0.0, neginf=0.0)

    # Scale to unit length.
    embeddings = normalize_embeddings(embeddings)

    # Sanitize again in case normalization produced non-finite values.
    embeddings = np.nan_to_num(embeddings, nan=0.0, posinf=0.0, neginf=0.0)

    if method == "cosine":
        from sklearn.metrics.pairwise import cosine_similarity

        similarity = cosine_similarity(embeddings)
    else:
        from sklearn.metrics.pairwise import euclidean_distances

        # Map distance to a similarity: 0 -> 1, growing distance -> 0.
        distances = euclidean_distances(embeddings)
        similarity = 1 / (1 + distances)

    # Any remaining NaN entry is replaced with 0.5.
    return np.nan_to_num(similarity, nan=0.5)
|
||||
|
||||
|
||||
def normalize_embeddings(embeddings):
    """Scale embedding vectors to unit L2 length.

    Args:
        embeddings: Embedding matrix (documented for this module as
            ``[n_segments, 192]``).

    Returns:
        The result of scikit-learn's ``normalize`` with ``norm="l2"``.
    """
    from sklearn import preprocessing

    unit_length = preprocessing.normalize(embeddings, norm="l2")
    return unit_length
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: load the encoder, embed one audio file given on the
    # command line, and print basic statistics of the embedding.
    import sys

    import torchaudio

    if len(sys.argv) < 2:
        print("Usage: python3 speaker_encoder.py <audio_path>")
        sys.exit(1)

    audio_path = sys.argv[1]

    print("[Test] Loading speaker encoder...")
    classifier = load_speaker_encoder()

    print(f"\n[Test] Loading audio: {audio_path}")
    wav, sr = torchaudio.load(audio_path)

    # torchaudio.load returns [channels, time]. Downmix multi-channel audio
    # to mono; otherwise each channel would be encoded as a separate
    # utterance and the statistics below would describe a 2-D array.
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)

    # Resample to 16 kHz.
    if sr != 16000:
        transform = torchaudio.transforms.Resample(sr, 16000)
        wav = transform(wav)

    print(f"[Test] Audio shape: {wav.shape}")
    print(f"[Test] Duration: {wav.shape[1] / 16000:.2f}s")

    # Extract the embedding.
    print("\n[Test] Extracting speaker embedding...")
    embedding = extract_speaker_embedding(classifier, wav.numpy())

    print(f"[Test] Embedding shape: {embedding.shape}")
    print(f"[Test] Embedding norm: {np.linalg.norm(embedding):.4f}")
    print(f"[Test] Embedding mean: {embedding.mean():.4f}")
    print(f"[Test] Embedding std: {embedding.std():.4f}")

    # Show the first few embedding values.
    print("\n[Test] First 10 embedding values:")
    print(f"  {embedding[:10]}")
|
||||
Reference in New Issue
Block a user