feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
This commit is contained in:
119
scripts/test_pyannote_multilingual.py
Normal file
119
scripts/test_pyannote_multilingual.py
Normal file
@@ -0,0 +1,119 @@
|
||||
#!/opt/homebrew/bin/python3.11
"""
Smoke-check pyannote.audio and print notes on multilingual speaker diarization.

The script runs top to bottom with no arguments:

1. Try to import ``pyannote.audio`` and print its version.
2. Try to import ``pyannote.audio.Pipeline`` and list two pipeline names.
3. Print an explanation of why diarization is language-independent.
4. Print a usage example.
5. Print an example of combining Whisper ASR with pyannote diarization.

No model is downloaded and no audio is processed; steps 3-5 only print text.
A failure in step 1 or 2 is reported and the script keeps going.
"""

print("=== pyannote.audio 多語種測試 ===\n")

# 1. Check the pyannote.audio version.
# Catching Exception (not just ImportError) is deliberate: this is a
# best-effort diagnostic and any failure should be reported, not raised.
try:
    # Import the ``pyannote.audio`` subpackage itself: the bare ``pyannote``
    # parent package has no ``__version__``, so reading it there raised
    # AttributeError and was misreported as an import failure.
    import pyannote.audio

    print(f"✅ pyannote.audio 版本:{pyannote.audio.__version__}")
except Exception as e:
    print(f"❌ 無法導入 pyannote.audio: {e}")

# 2. Check that the Pipeline entry point can be imported.
try:
    from pyannote.audio import Pipeline

    print("✅ Pipeline 導入成功")

    # List the pipeline names (static text; nothing is queried or downloaded).
    print("\n可用模型:")
    print("- pyannote/speaker-diarization-3.1 (最新版)")
    print("- pyannote/speaker-diarization (穩定版)")

except Exception as e:
    print(f"❌ Pipeline 導入失敗:{e}")

# 3. Explanation of multilingual support.
print("\n=== 多語種支援說明 ===\n")

print("pyannote.audio 的說話人分離原理:")
print("1. 基於聲紋特徵(非語言內容)")
print("2. 分析音色、音調、語速等")
print("3. 不依賴語言識別")
print("")
print("✅ 支援所有語言(因為不分析語意)")
print("✅ 中文 + 英文混合也可以")
print("✅ 粵語 + 國語混合也可以")
print("")
print("限制:")
print("⚠️ 重疊說話時準確度下降")
print("⚠️ 背景噪音影響準確度")
print("⚠️ 需要 HuggingFace token")

# 4. Usage example (printed verbatim, never executed).
# NOTE(review): the sample passes ``use_auth_token=``; newer pyannote.audio
# releases may expect ``token=`` instead — confirm against the installed version.
print("\n=== 使用範例 ===\n")

print("""
程式碼範例:

from pyannote.audio import Pipeline

# 載入模型
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token="hf_xxxxx" # 需要 token
)

# 執行說話人分離(支援任何語言)
diarization = pipeline("audio.wav")

# 輸出結果
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"[{turn.start:.2f}s - {turn.end:.2f}s] {speaker}")

輸出範例:
[0.00s - 5.32s] SPEAKER_00 (中文)
[5.50s - 12.18s] SPEAKER_01 (英文)
[12.50s - 18.75s] SPEAKER_00 (中文)
[19.00s - 25.43s] SPEAKER_02 (日文)
""")

# 5. Integration with Whisper (printed verbatim, never executed).
# NOTE(review): the sample prints ``result['language']`` for every segment,
# which presumably is one language detected for the whole file, while the
# sample output shows a different language per segment — confirm and adjust
# the text if that is misleading.
print("\n=== 與 Whisper 整合(多語種 ASR + 說話人分離)===\n")

print("""
完整流程:

1. Whisper 轉錄(支援多語種識別)
2. pyannote 說話人分離(支援多語種)
3. 整合結果

程式碼:

import whisper
from pyannote.audio import Pipeline

# Whisper ASR
whisper_model = whisper.load_model("base")
result = whisper_model.transcribe("audio.wav")

# pyannote 說話人分離
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token="hf_xxxxx"
)
diarization = pipeline("audio.wav")

# 整合
for segment in result["segments"]:
    # 找到重疊的說話人
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        if segment["start"] < turn.end and segment["end"] > turn.start:
            print(f"[{speaker}] ({result['language']}) {segment['text']}")
            break

輸出範例:
[SPEAKER_00] (zh) 你好,歡迎來到今天的會議。
[SPEAKER_01] (en) Hello, let's start the meeting.
[SPEAKER_00] (zh) 首先討論第一季度的業績。
[SPEAKER_02] (ja) 私は反対です。
""")

# Conclusion.
print("\n=== 結論 ===\n")
print("✅ pyannote.audio 支援多語種說話人分離")
print("✅ 因為基於聲紋,不依賴語言")
print("✅ 適合多語言混合場景")
print("⚠️ 需要 HuggingFace token")
print("⚠️ 需要接受使用條款")
||||
Reference in New Issue
Block a user