feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
This commit is contained in:
Warren
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions

View File

@@ -0,0 +1,191 @@
#!/opt/homebrew/bin/python3.11
"""
Speaker Encoder - speaker (voiceprint) feature extraction.

Extracts speaker embedding vectors with the ECAPA-TDNN model.

Technical sources:
- ECAPA-TDNN: Desplanques et al. (2020), Interspeech
- Paper: https://arxiv.org/abs/2005.07143
- Model: SpeechBrain spkrec-ecapa-voxceleb
- Accuracy: EER 0.80% (VoxCeleb1)
"""
import torch
import numpy as np
from speechbrain.inference.speaker import EncoderClassifier
def load_speaker_encoder(model_name="speechbrain/spkrec-ecapa-voxceleb", device="cpu"):
    """Load a pretrained SpeechBrain speaker-embedding encoder.

    Args:
        model_name: Model source handed to ``EncoderClassifier.from_hparams``
            (by default a HuggingFace repository id).
        device: Device string forwarded through ``run_opts``. Defaults to
            ``"cpu"``, which was previously the hard-coded value, so existing
            callers are unaffected.

    Returns:
        The loaded ``EncoderClassifier`` instance.
    """
    print(f"[SpeakerEncoder] Loading model: {model_name}")
    classifier = EncoderClassifier.from_hparams(
        source=model_name,
        run_opts={"device": device},
    )
    print("[SpeakerEncoder] Model loaded successfully")
    # NOTE: the dimension below is printed as a constant; it is not read back
    # from the loaded model, so it is only meaningful for the default model.
    print("[SpeakerEncoder] Embedding dimension: 192")
    return classifier
def extract_speaker_embedding(classifier, audio_waveform, sample_rate=16000):
    """Extract a speaker embedding from an audio waveform.

    Args:
        classifier: Encoder object exposing ``encode_batch(tensor)``.
        audio_waveform: Waveform samples as a ``torch.Tensor``, a NumPy array,
            or any array-like (e.g. a list of floats). A 1-D input is treated
            as a single waveform; a 2-D input is passed through as
            ``[batch, time]``.
        sample_rate: Sampling rate of the waveform. Not used by this function;
            it is kept for interface compatibility. Presumably the caller is
            expected to resample beforehand (the script entry point of this
            file resamples to 16 kHz) - confirm against other callers.

    Returns:
        A NumPy array with all singleton dimensions squeezed out: a 1-D vector
        for a single waveform, or ``[batch, dim]`` for a batch larger than one.
    """
    # Normalize every supported input to a float32 tensor. Integer or float64
    # tensors are cast as well, and array-likes go through NumPy first so that
    # lists and negatively-strided views are accepted.
    if isinstance(audio_waveform, torch.Tensor):
        audio_tensor = audio_waveform.float()
    else:
        audio_tensor = torch.from_numpy(
            np.ascontiguousarray(audio_waveform, dtype=np.float32)
        )

    # The encoder expects a batch dimension: [batch, time].
    if audio_tensor.dim() == 1:
        audio_tensor = audio_tensor.unsqueeze(0)

    # Inference only; no gradients are needed.
    with torch.no_grad():
        embedding = classifier.encode_batch(audio_tensor)

    return embedding.squeeze().cpu().numpy()
def extract_speaker_embeddings_batch(classifier, audio_segments, sample_rate=16000):
    """Extract speaker embeddings for a sequence of audio segments.

    Each segment is encoded individually with ``extract_speaker_embedding``
    and the results are stacked row-wise. Progress is printed every 50
    segments.

    Args:
        classifier: Speaker encoder passed through to
            ``extract_speaker_embedding``.
        audio_segments: Iterable of waveforms, one per segment.
        sample_rate: Sampling rate forwarded to ``extract_speaker_embedding``.

    Returns:
        Embedding matrix of shape ``[n_segments, dim]``. For an empty input an
        empty ``(0, 192)`` float32 matrix is returned instead of raising.
    """
    # Width used only for the empty result; 192 is the embedding size this
    # file documents for the default ECAPA model.
    empty_dim = 192

    collected = []
    for count, audio in enumerate(audio_segments, start=1):
        collected.append(extract_speaker_embedding(classifier, audio, sample_rate))
        if count % 50 == 0:
            print(f"[SpeakerEncoder] Processed {count} segments")

    if collected:
        embeddings = np.vstack(collected)
    else:
        # np.vstack raises ValueError on an empty list; return an empty
        # matrix so callers can handle "no segments" without a try/except.
        embeddings = np.empty((0, empty_dim), dtype=np.float32)

    print(f"[SpeakerEncoder] Extracted {embeddings.shape[0]} embeddings")
    return embeddings
def compute_similarity_matrix(embeddings, method="cosine"):
    """Build the pairwise speaker-similarity matrix for a set of embeddings.

    The embeddings are scrubbed of NaN/Inf, L2-normalized via
    ``normalize_embeddings``, scrubbed again, and then compared pairwise.

    Args:
        embeddings: Embedding matrix ``[n_segments, dim]``.
        method: ``"cosine"`` for cosine similarity, or ``"euclidean"`` for
            ``1 / (1 + distance)`` computed from Euclidean distances.

    Returns:
        Similarity matrix ``[n_segments, n_segments]``; any NaN left in the
        result is replaced with 0.5.

    Raises:
        ValueError: If ``method`` is neither ``"cosine"`` nor ``"euclidean"``.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    def _scrub(matrix):
        # Replace NaN and +/-Inf with 0 so downstream math stays finite.
        return np.nan_to_num(matrix, nan=0.0, posinf=0.0, neginf=0.0)

    # Scrub, normalize to unit length, then scrub once more.
    unit_vectors = _scrub(normalize_embeddings(_scrub(embeddings)))

    if method not in ("cosine", "euclidean"):
        raise ValueError(f"Unknown method: {method}")

    if method == "cosine":
        scores = cosine_similarity(unit_vectors)
    else:
        from sklearn.metrics.pairwise import euclidean_distances

        # Map distance in [0, inf) onto a similarity in (0, 1].
        pairwise_dist = euclidean_distances(unit_vectors)
        scores = 1 / (1 + pairwise_dist)

    # Final safety net: no NaN may leave this function.
    return np.nan_to_num(scores, nan=0.5)
def normalize_embeddings(embeddings):
    """Scale embedding vectors to unit L2 length.

    Thin wrapper around ``sklearn.preprocessing.normalize`` with
    ``norm="l2"``.

    Args:
        embeddings: Embedding matrix ``[n_segments, dim]``.

    Returns:
        The normalized embedding matrix.
    """
    from sklearn import preprocessing

    unit_rows = preprocessing.normalize(embeddings, norm="l2")
    return unit_rows
if __name__ == "__main__":
    # Manual smoke test for the speaker encoder: load the model, read one
    # audio file given on the command line, and print embedding statistics.
    import sys

    import torchaudio

    if len(sys.argv) < 2:
        print("Usage: python3 speaker_encoder.py <audio_path>")
        sys.exit(1)

    audio_path = sys.argv[1]

    print("[Test] Loading speaker encoder...")
    classifier = load_speaker_encoder()

    print(f"\n[Test] Loading audio: {audio_path}")
    wav, sr = torchaudio.load(audio_path)

    # torchaudio.load returns [channels, time]. Without a downmix, a stereo
    # file would be interpreted as a batch of two waveforms and yield one
    # embedding per channel instead of one per file.
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)

    # Resample to 16 kHz, the rate assumed by the duration computation below.
    if sr != 16000:
        transform = torchaudio.transforms.Resample(sr, 16000)
        wav = transform(wav)

    print(f"[Test] Audio shape: {wav.shape}")
    print(f"[Test] Duration: {wav.shape[1] / 16000:.2f}s")

    print("\n[Test] Extracting speaker embedding...")
    embedding = extract_speaker_embedding(classifier, wav.numpy())

    print(f"[Test] Embedding shape: {embedding.shape}")
    print(f"[Test] Embedding norm: {np.linalg.norm(embedding):.4f}")
    print(f"[Test] Embedding mean: {embedding.mean():.4f}")
    print(f"[Test] Embedding std: {embedding.std():.4f}")

    # Show a small sample of the raw embedding values.
    print("\n[Test] First 10 embedding values:")
    print(f" {embedding[:10]}")