feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
--- a/scripts/test_pyannote_multilingual.py
+++ b/scripts/test_pyannote_multilingual.py
@@ -0,0 +1,119 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Test the multilingual speaker-diarization capability of pyannote.audio.
+"""
+
+print("=== pyannote.audio 多語種測試 ===\n")
+
+# 1. Report the pyannote.audio version (the bare "pyannote" namespace package has no __version__)
+try:
+    import pyannote.audio
+    print(f"✅ pyannote.audio 版本：{pyannote.audio.__version__}")
+except Exception as e:
+    print(f"❌ 無法導入 pyannote.audio: {e}")
+
+# 2. Check that the diarization Pipeline class can be imported
+try:
+    from pyannote.audio import Pipeline
+    print("✅ Pipeline 導入成功")
+
+    # Known model names, listed for reference only (nothing is loaded here)
+    print("\n可用模型:\n"
+          "- pyannote/speaker-diarization-3.1 (最新版)\n"
+          "- pyannote/speaker-diarization (穩定版)")
+
+except Exception as exc:
+    print(f"❌ Pipeline 導入失敗：{exc}")
+
+# 3. Notes on multilingual support (one print call; each literal below is one output line)
+print("\n=== 多語種支援說明 ===\n")
+
+print("pyannote.audio 的說話人分離原理：\n"
+      "1. 基於聲紋特徵（非語言內容）\n"
+      "2. 分析音色、音調、語速等\n"
+      "3. 不依賴語言識別\n"
+      "\n"
+      "✅ 支援所有語言（因為不分析語意）\n"
+      "✅ 中文 + 英文混合也可以\n"
+      "✅ 粵語 + 國語混合也可以\n"
+      "\n"
+      "限制：\n"
+      "⚠️ 重疊說話時準確度下降\n"
+      "⚠️ 背景噪音影響準確度\n"
+      "⚠️ 需要 HuggingFace token")
+
+# 4. Usage example (the snippet is printed as text; this script never executes it)
+print("\n=== 使用範例 ===\n")
+# NOTE(review): newer pyannote.audio releases may expect token= instead of use_auth_token= — confirm
+print("""
+程式碼範例:
+
+from pyannote.audio import Pipeline
+
+# 載入模型
+pipeline = Pipeline.from_pretrained(
+    "pyannote/speaker-diarization-3.1",
+    use_auth_token="hf_xxxxx"  # 需要 token
+)
+
+# 執行說話人分離（支援任何語言）
+diarization = pipeline("audio.wav")
+
+# 輸出結果
+for turn, _, speaker in diarization.itertracks(yield_label=True):
+    print(f"[{turn.start:.2f}s - {turn.end:.2f}s] {speaker}")
+
+輸出範例:
+[0.00s - 5.32s] SPEAKER_00  (中文)
+[5.50s - 12.18s] SPEAKER_01 (英文)
+[12.50s - 18.75s] SPEAKER_00 (中文)
+[19.00s - 25.43s] SPEAKER_02 (日文)
+""")
+
+# 5. Whisper integration example (multilingual ASR + diarization); printed as text, never executed
+print("\n=== 與 Whisper 整合（多語種 ASR + 說話人分離）===\n")
+# NOTE(review): the snippet prints the same result['language'] for every segment, yet the sample output varies per line — confirm
+print("""
+完整流程:
+
+1. Whisper 轉錄（支援多語種識別）
+2. pyannote 說話人分離（支援多語種）
+3. 整合結果
+
+程式碼:
+
+import whisper
+from pyannote.audio import Pipeline
+
+# Whisper ASR
+whisper_model = whisper.load_model("base")
+result = whisper_model.transcribe("audio.wav")
+
+# pyannote 說話人分離
+pipeline = Pipeline.from_pretrained(
+    "pyannote/speaker-diarization-3.1",
+    use_auth_token="hf_xxxxx"
+)
+diarization = pipeline("audio.wav")
+
+# 整合
+for segment in result["segments"]:
+    # 找到重疊的說話人
+    for turn, _, speaker in diarization.itertracks(yield_label=True):
+        if segment["start"] < turn.end and segment["end"] > turn.start:
+            print(f"[{speaker}] ({result['language']}) {segment['text']}")
+            break
+
+輸出範例:
+[SPEAKER_00] (zh) 你好，歡迎來到今天的會議。
+[SPEAKER_01] (en) Hello, let's start the meeting.
+[SPEAKER_00] (zh) 首先討論第一季度的業績。
+[SPEAKER_02] (ja) 私は反対です。
+""")
+
+print("\n=== 結論 ===\n")  # closing summary; the points follow in a single print call
+print("✅ pyannote.audio 支援多語種說話人分離\n"
+      "✅ 因為基於聲紋，不依賴語言\n"
+      "✅ 適合多語言混合場景\n"
+      "⚠️ 需要 HuggingFace token\n"
+      "⚠️ 需要接受使用條款")