feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
--- a/scripts/asrx_self/speaker_encoder.py
+++ b/scripts/asrx_self/speaker_encoder.py
@@ -0,0 +1,191 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Speaker Encoder - 聲紋特徵提取
+使用 ECAPA-TDNN 模型提取聲紋嵌入向量
+
+技術來源:
+- ECAPA-TDNN: Desplanques et al. (2020), Interspeech
+- 論文：https://arxiv.org/abs/2005.07143
+- 模型：SpeechBrain spkrec-ecapa-voxceleb
+- 準確度：EER 0.80% (VoxCeleb1)
+"""
+
+import torch
+import numpy as np
+from speechbrain.inference.speaker import EncoderClassifier
+
+
+def load_speaker_encoder(model_name="speechbrain/spkrec-ecapa-voxceleb", device="cpu"):
+    """
+    Load a pretrained SpeechBrain speaker-embedding model.
+
+    Args:
+        model_name: Model source handed to ``EncoderClassifier.from_hparams``
+            (defaults to the ECAPA-TDNN VoxCeleb checkpoint).
+        device: Value placed in SpeechBrain's ``run_opts["device"]``.
+            Defaults to ``"cpu"``, which is the device this function always
+            used before the parameter was added.
+
+    Returns:
+        classifier: The object returned by ``EncoderClassifier.from_hparams``.
+    """
+    print(f"[SpeakerEncoder] Loading model: {model_name}")
+
+    classifier = EncoderClassifier.from_hparams(
+        source=model_name,
+        run_opts={"device": device},
+    )
+
+    print("[SpeakerEncoder] Model loaded successfully")
+    # NOTE(review): 192 is a literal in the log message, not a value read from
+    # the loaded model; presumably it is the embedding size of the default
+    # checkpoint -- confirm before trusting it for other models.
+    print("[SpeakerEncoder] Embedding dimension: 192")
+
+    return classifier
+
+
+def extract_speaker_embedding(classifier, audio_waveform, sample_rate=16000):
+    """
+    Extract a speaker embedding from an audio waveform.
+
+    Args:
+        classifier: Object exposing ``encode_batch(tensor)``, e.g. the value
+            returned by ``load_speaker_encoder``.
+        audio_waveform: Waveform as a ``torch.Tensor``, a NumPy array, or any
+            array-like NumPy can convert. A 1-D input is treated as a single
+            utterance; a 2-D input is passed through as ``[batch, time]``.
+        sample_rate: Accepted for interface compatibility only. This function
+            never reads it and performs no resampling, so the caller must
+            supply audio at the rate the model expects.
+
+    Returns:
+        embedding: NumPy array holding the output of ``encode_batch`` with
+            all size-1 dimensions squeezed out.
+    """
+    # Always hand the model float32 data. Previously only the NumPy path was
+    # cast, so float64 or integer tensors reached the model unchanged.
+    if isinstance(audio_waveform, torch.Tensor):
+        audio_tensor = audio_waveform.float()
+    else:
+        # ascontiguousarray also covers views with negative strides (for
+        # example ``x[::-1]``), which torch.from_numpy rejects.
+        audio_tensor = torch.from_numpy(np.ascontiguousarray(audio_waveform)).float()
+
+    # A bare waveform becomes a batch of one: [time] -> [1, time].
+    if audio_tensor.dim() == 1:
+        audio_tensor = audio_tensor.unsqueeze(0)
+
+    # Inference only, so gradient tracking is disabled.
+    with torch.no_grad():
+        embedding = classifier.encode_batch(audio_tensor)
+
+    # Drop singleton dimensions and return a CPU NumPy array.
+    embedding = embedding.squeeze().cpu().numpy()
+
+    return embedding
+
+
+def extract_speaker_embeddings_batch(classifier, audio_segments, sample_rate=16000):
+    """
+    Extract speaker embeddings for a sequence of audio segments.
+
+    Each segment is passed individually to ``extract_speaker_embedding`` and
+    the results are stacked row-wise with ``np.vstack``. A progress line is
+    printed after every 50th segment.
+
+    Args:
+        classifier: Speaker encoder forwarded to ``extract_speaker_embedding``.
+        audio_segments: Iterable of waveforms, one per segment.
+        sample_rate: Forwarded unchanged to ``extract_speaker_embedding``.
+
+    Returns:
+        embeddings: Stacked embedding matrix, one row per stacked embedding.
+
+    Raises:
+        ValueError: Raised by ``np.vstack`` when ``audio_segments`` is empty.
+    """
+    collected = []
+
+    for count, segment in enumerate(audio_segments, start=1):
+        collected.append(extract_speaker_embedding(classifier, segment, sample_rate))
+
+        # Periodic progress report for long runs.
+        if count % 50 == 0:
+            print(f"[SpeakerEncoder] Processed {count} segments")
+
+    embeddings = np.vstack(collected)
+    print(f"[SpeakerEncoder] Extracted {embeddings.shape[0]} embeddings")
+
+    return embeddings
+
+
+def compute_similarity_matrix(embeddings, method="cosine"):
+    """
+    Build a pairwise similarity matrix from speaker embeddings.
+
+    NaN and infinite entries are replaced with 0.0 both before and after
+    L2 normalization, and any NaN left in the final matrix becomes 0.5.
+
+    Args:
+        embeddings: Embedding matrix, one row per segment.
+        method: ``"cosine"`` for cosine similarity, or ``"euclidean"`` for
+            ``1 / (1 + distance)`` computed on the normalized rows.
+
+    Returns:
+        similarity_matrix: Square matrix of pairwise similarities.
+
+    Raises:
+        ValueError: If ``method`` is neither ``"cosine"`` nor ``"euclidean"``.
+            This check runs after cleaning and normalization.
+    """
+    from sklearn.metrics.pairwise import cosine_similarity
+
+    def _scrub(values):
+        # Replace NaN / +Inf / -Inf with zeros.
+        return np.nan_to_num(values, nan=0.0, posinf=0.0, neginf=0.0)
+
+    # Clean, normalize to unit length, then clean once more.
+    unit_rows = _scrub(normalize_embeddings(_scrub(embeddings)))
+
+    if method == "cosine":
+        scores = cosine_similarity(unit_rows)
+    elif method == "euclidean":
+        from sklearn.metrics.pairwise import euclidean_distances
+
+        # Map distances in [0, inf) onto similarities in (0, 1].
+        scores = 1 / (1 + euclidean_distances(unit_rows))
+    else:
+        raise ValueError(f"Unknown method: {method}")
+
+    # Final guard: any remaining NaN becomes 0.5.
+    return np.nan_to_num(scores, nan=0.5)
+
+
+def normalize_embeddings(embeddings):
+    """
+    Scale embeddings to unit L2 norm.
+
+    Thin wrapper around ``sklearn.preprocessing.normalize`` called with
+    ``norm="l2"`` and otherwise default arguments.
+
+    Args:
+        embeddings: Embedding matrix, one row per segment.
+
+    Returns:
+        normalized: The matrix returned by scikit-learn's ``normalize``.
+    """
+    from sklearn import preprocessing
+
+    unit_rows = preprocessing.normalize(embeddings, norm="l2")
+    return unit_rows
+
+
+if __name__ == "__main__":
+    # Manual smoke test: load the encoder, embed one audio file given on the
+    # command line, and print basic statistics about the embedding.
+    import sys
+    import torchaudio
+
+    if len(sys.argv) < 2:
+        print("Usage: python3 speaker_encoder.py <audio_path>")
+        sys.exit(1)
+
+    audio_path = sys.argv[1]
+
+    print("[Test] Loading speaker encoder...")
+    classifier = load_speaker_encoder()
+
+    print(f"\n[Test] Loading audio: {audio_path}")
+    wav, sr = torchaudio.load(audio_path)
+
+    # torchaudio.load returns [channels, time]. Without a downmix, a stereo
+    # file would be fed to the encoder as a batch of two separate utterances
+    # and produce two embeddings instead of one.
+    if wav.shape[0] > 1:
+        wav = wav.mean(dim=0, keepdim=True)
+
+    # Resample to 16 kHz when the file uses a different rate.
+    if sr != 16000:
+        transform = torchaudio.transforms.Resample(sr, 16000)
+        wav = transform(wav)
+
+    print(f"[Test] Audio shape: {wav.shape}")
+    print(f"[Test] Duration: {wav.shape[1] / 16000:.2f}s")
+
+    # Extract the embedding.
+    print("\n[Test] Extracting speaker embedding...")
+    embedding = extract_speaker_embedding(classifier, wav.numpy())
+
+    print(f"[Test] Embedding shape: {embedding.shape}")
+    print(f"[Test] Embedding norm: {np.linalg.norm(embedding):.4f}")
+    print(f"[Test] Embedding mean: {embedding.mean():.4f}")
+    print(f"[Test] Embedding std: {embedding.std():.4f}")
+
+    # Show the first few embedding values.
+    print("\n[Test] First 10 embedding values:")
+    print(f"  {embedding[:10]}")