Files
momentry_core/scripts/test_pyannote_multilingual.py
Warren e75c4d6f07 cleanup: remove dead code and duplicate docs
- Remove session-ses_2f27.md (161KB raw session log)
- Remove 49 ROOT_* duplicate files across REFERENCE/
- Remove 14 duplicate files between REFERENCE/ root and history/
- Remove asr_legacy.rs (dead code, replaced by asr.rs)
- Remove src/core/worker/ (duplicate JobWorker)
- Remove src/core/layers/ (empty directory)
- Remove 4 .bak files in src/
- Remove 7 dead private methods in worker/processor.rs
- Remove backup directory from git tracking
2026-05-04 01:31:21 +08:00

119 lines
3.2 KiB
Python

#!/opt/homebrew/bin/python3.11
"""Print diagnostics about the installed pyannote.audio package.

The script also prints notes on pyannote.audio's multilingual speaker
diarization support.
"""
from importlib import metadata

print("=== pyannote.audio 多語種測試 ===\n")

# 1. Check the installed pyannote.audio version.
try:
    import pyannote.audio

    # The top-level ``pyannote`` name is a namespace package shared by several
    # distributions and carries no ``__version__``; the version lives on
    # ``pyannote.audio``. Fall back to the installed distribution metadata in
    # case this build does not export the attribute.
    audio_version = getattr(pyannote.audio, "__version__", None)
    if audio_version is None:
        audio_version = metadata.version("pyannote.audio")
    print(f"✅ pyannote.audio 版本:{audio_version}")
except Exception as e:
    # Best-effort diagnostic: any import or lookup failure is reported to the
    # user instead of aborting the rest of the script.
    print(f"❌ 無法導入 pyannote.audio: {e}")
# 2. Check that the diarization Pipeline class can be imported.
try:
    # The import itself is the check; without it the success message below
    # would be printed unconditionally. The name is intentionally unused.
    from pyannote.audio import Pipeline  # noqa: F401

    print("✅ Pipeline 導入成功")
    # Static list of model names; nothing is queried or downloaded here.
    print("\n可用模型:")
    print("- pyannote/speaker-diarization-3.1 (最新版)")
    print("- pyannote/speaker-diarization (穩定版)")
except Exception as e:
    # Best-effort diagnostic: report the failure and keep going.
    print(f"❌ Pipeline 導入失敗:{e}")
# 3. Notes on multilingual support (static text, emitted one line per print).
multilingual_notes = (
    "\n=== 多語種支援說明 ===\n",
    "pyannote.audio 的說話人分離原理:",
    "1. 基於聲紋特徵(非語言內容)",
    "2. 分析音色、音調、語速等",
    "3. 不依賴語言識別",
    "",
    "✅ 支援所有語言(因為不分析語意)",
    "✅ 中文 + 英文混合也可以",
    "✅ 粵語 + 國語混合也可以",
    "",
    "限制:",
    "⚠️ 重疊說話時準確度下降",
    "⚠️ 背景噪音影響準確度",
    "⚠️ 需要 HuggingFace token",
)
for note in multilingual_notes:
    print(note)
# 4. Usage example. The text is printed verbatim and never executed here, but
# the embedded sample code is indented so that it is valid Python when a
# reader copies it out of the terminal.
USAGE_EXAMPLE = """
程式碼範例:
from pyannote.audio import Pipeline
# 載入模型
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token="hf_xxxxx" # 需要 token
)
# 執行說話人分離(支援任何語言)
diarization = pipeline("audio.wav")
# 輸出結果
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"[{turn.start:.2f}s - {turn.end:.2f}s] {speaker}")
輸出範例:
[0.00s - 5.32s] SPEAKER_00 (中文)
[5.50s - 12.18s] SPEAKER_01 (英文)
[12.50s - 18.75s] SPEAKER_00 (中文)
[19.00s - 25.43s] SPEAKER_02 (日文)
"""

print("\n=== 使用範例 ===\n")
print(USAGE_EXAMPLE)
# 5. Integration with Whisper (multilingual ASR + speaker diarization). The
# text is printed verbatim and never executed here; the embedded sample code
# is indented so that the nested loops are valid Python when copied.
# NOTE(review): Whisper's transcribe() result carries a single "language"
# value for the whole recording, so the per-segment language tags in the
# sample output below look illustrative rather than real output - confirm.
WHISPER_INTEGRATION_EXAMPLE = """
完整流程:
1. Whisper 轉錄(支援多語種識別)
2. pyannote 說話人分離(支援多語種)
3. 整合結果
程式碼:
import whisper
from pyannote.audio import Pipeline
# Whisper ASR
whisper_model = whisper.load_model("base")
result = whisper_model.transcribe("audio.wav")
# pyannote 說話人分離
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token="hf_xxxxx"
)
diarization = pipeline("audio.wav")
# 整合
for segment in result["segments"]:
    # 找到重疊的說話人
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        if segment["start"] < turn.end and segment["end"] > turn.start:
            print(f"[{speaker}] ({result['language']}) {segment['text']}")
            break
輸出範例:
[SPEAKER_00] (zh) 你好,歡迎來到今天的會議。
[SPEAKER_01] (en) Hello, let's start the meeting.
[SPEAKER_00] (zh) 首先討論第一季度的業績。
[SPEAKER_02] (ja) 私は反対です。
"""

print("\n=== 與 Whisper 整合(多語種 ASR + 說話人分離)===\n")
print(WHISPER_INTEGRATION_EXAMPLE)
# Closing summary: one argument per output line, joined by newlines.
print(
    "\n=== 結論 ===\n",
    "✅ pyannote.audio 支援多語種說話人分離",
    "✅ 因為基於聲紋,不依賴語言",
    "✅ 適合多語言混合場景",
    "⚠️ 需要 HuggingFace token",
    "⚠️ 需要接受使用條款",
    sep="\n",
)