feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
This commit is contained in:
198
scripts/asrx_self/main_fixed.py
Executable file
198
scripts/asrx_self/main_fixed.py
Executable file
@@ -0,0 +1,198 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Self-implemented ASRX - Fixed Version
|
||||
使用魯棒的聚類算法
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
# 導入自定義模組
|
||||
from vad import load_vad_model, extract_speech_segments
|
||||
from speaker_encoder import (
|
||||
load_speaker_encoder,
|
||||
extract_speaker_embeddings_batch,
|
||||
normalize_embeddings
|
||||
)
|
||||
from speaker_cluster_fixed import robust_speaker_clustering
|
||||
|
||||
|
||||
class SelfASRXFixed:
    """Self-implemented speaker diarization pipeline (fixed version).

    Chains voice activity detection, speaker-embedding extraction and
    clustering, then reports which speaker label was assigned to each
    detected speech segment.
    """

    def __init__(self):
        """Load the VAD model and the speaker encoder once, up front."""
        print("[SelfASRX-Fixed] Initializing models...")

        # Voice activity detection model and its helper utilities.
        print("[SelfASRX-Fixed] Loading VAD model (Silero)...")
        self.vad_model, self.vad_utils = load_vad_model()

        # Speaker (voiceprint) embedding model.
        print("[SelfASRX-Fixed] Loading speaker encoder (ECAPA-TDNN)...")
        self.speaker_encoder = load_speaker_encoder()

        print("[SelfASRX-Fixed] Models loaded successfully")

    @staticmethod
    def _slice_audio(wav, speech_segments, sample_rate):
        """Return one slice of ``wav`` per ``(start_sec, end_sec)`` segment."""
        return [
            wav[int(start_sec * sample_rate):int(end_sec * sample_rate)]
            for start_sec, end_sec in speech_segments
        ]

    @staticmethod
    def _build_segments(speech_segments, speaker_labels):
        """Pair every speech segment with its speaker label.

        Raises:
            ValueError: If the number of labels differs from the number of
                segments. Zipping would otherwise silently drop the surplus
                and leave the output inconsistent with ``n_speech_segments``.
        """
        if len(speech_segments) != len(speaker_labels):
            raise ValueError(
                f"Got {len(speaker_labels)} speaker labels for "
                f"{len(speech_segments)} speech segments"
            )

        segments = []
        for index, ((start, end), label) in enumerate(
                zip(speech_segments, speaker_labels)):
            # Coerce to built-in numbers so the record is JSON-serializable
            # even when the inputs are numpy scalars.
            start, end = float(start), float(end)
            segments.append({
                "index": index,
                "start": round(start, 3),
                "end": round(end, 3),
                "duration": round(end - start, 3),
                "speaker": f"SPEAKER_{int(label)}"
            })
        return segments

    @staticmethod
    def _summarize_speakers(segments):
        """Return per-speaker segment count and total duration.

        Totals are rounded to 3 decimals, like the per-segment durations, so
        float accumulation noise (e.g. 0.30000000000000004) does not leak
        into the output.
        """
        stats = {}
        for seg in segments:
            entry = stats.setdefault(seg["speaker"], {"count": 0, "duration": 0})
            entry["count"] += 1
            entry["duration"] += seg["duration"]

        for entry in stats.values():
            entry["duration"] = round(entry["duration"], 3)
        return stats

    def process(self, audio_path, output_path=None,
                min_speech_duration_ms=500,
                n_speakers=None,
                max_speakers=10):
        """Run the full diarization pipeline on one audio file.

        Args:
            audio_path: Path of the audio file, passed to
                ``extract_speech_segments``.
            output_path: Optional path of a JSON file to write the result to;
                missing parent directories are created.
            min_speech_duration_ms: Forwarded to ``extract_speech_segments``.
            n_speakers: Forwarded to ``robust_speaker_clustering``.
            max_speakers: Forwarded to ``robust_speaker_clustering``.

        Returns:
            A dict with the keys ``audio_path``, ``total_duration``,
            ``n_speech_segments``, ``n_speakers``, ``segments``,
            ``speaker_stats``, ``processing_time`` and ``realtime_factor``
            (audio duration divided by processing time). When no speech
            segment is found, returns
            ``{"error": "No speech detected", "segments": []}`` instead and
            writes no file.

        Raises:
            ValueError: If clustering returns a different number of labels
                than there are speech segments.
        """
        # perf_counter is monotonic, so elapsed times cannot be distorted by
        # system clock adjustments.
        start_time = time.perf_counter()
        print(f"\n[SelfASRX-Fixed] Processing: {audio_path}")
        print("=" * 60)

        # Step 1: voice activity detection.
        print("\n[Step 1] Voice Activity Detection...")
        step1_start = time.perf_counter()

        speech_segments, wav, sample_rate = extract_speech_segments(
            audio_path, self.vad_model, self.vad_utils,
            min_speech_duration_ms=min_speech_duration_ms
        )

        step1_time = time.perf_counter() - step1_start
        print(f" Speech segments: {len(speech_segments)}")
        print(f" Total duration: {len(wav)/sample_rate:.2f}s")
        print(f" VAD time: {step1_time:.2f}s")

        if len(speech_segments) == 0:
            print("[SelfASRX-Fixed] No speech detected!")
            return {"error": "No speech detected", "segments": []}

        # Step 2: speaker embedding extraction.
        print("\n[Step 2] Speaker embedding extraction...")
        step2_start = time.perf_counter()

        # Segment boundaries are treated as seconds and converted to sample
        # indices to cut the waveform.
        audio_segments = self._slice_audio(wav, speech_segments, sample_rate)

        # Extract all embeddings in one batch, then normalize them.
        embeddings = extract_speaker_embeddings_batch(
            self.speaker_encoder, audio_segments, sample_rate
        )
        embeddings = normalize_embeddings(embeddings)

        step2_time = time.perf_counter() - step2_start
        print(f" Embedding shape: {embeddings.shape}")
        print(f" Embedding time: {step2_time:.2f}s")

        # Step 3: robust clustering.
        print("\n[Step 3] Robust speaker clustering...")
        step3_start = time.perf_counter()

        speaker_labels, estimated_n_speakers = robust_speaker_clustering(
            embeddings,
            n_speakers=n_speakers,
            max_speakers=max_speakers
        )

        step3_time = time.perf_counter() - step3_start
        print(f" Clustering time: {step3_time:.2f}s")

        # Step 4: assemble the output.
        print("\n[Step 4] Building output...")

        segments = self._build_segments(speech_segments, speaker_labels)
        result = {
            "audio_path": str(audio_path),
            "total_duration": len(wav) / sample_rate,
            "n_speech_segments": len(speech_segments),
            "n_speakers": int(estimated_n_speakers),
            "segments": segments
        }

        # Total speaking time and segment count per speaker.
        result["speaker_stats"] = self._summarize_speakers(segments)

        total_time = time.perf_counter() - start_time
        result["processing_time"] = round(total_time, 2)
        result["realtime_factor"] = round(result["total_duration"] / total_time, 2)

        print("\n[SelfASRX-Fixed] Processing completed!")
        print(f" Total time: {total_time:.2f}s")
        print(f" Realtime factor: {result['realtime_factor']:.2f}x")
        print(f" Detected speakers: {estimated_n_speakers}")

        # Save the result when an output path was requested.
        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)

            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, ensure_ascii=False)

            print(f" Results saved to: {output_path}")

        print("=" * 60)

        return result
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: diarize one audio file and print a summary.

    Parses the command line, checks that the audio path is an existing file,
    runs ``SelfASRXFixed.process`` and, unless the result carries an
    ``"error"`` key, prints overall figures and per-speaker statistics.

    Exits with status 1 when the audio path is not an existing regular file.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Self-implemented ASRX (Fixed)")
    parser.add_argument("audio_path", help="Path to audio file")
    parser.add_argument("-o", "--output", help="Output JSON path")
    parser.add_argument("--min-speech-duration", type=int, default=500,
                        help="Minimum speech duration in ms (default: 500)")
    parser.add_argument("--n-speakers", type=int, default=None,
                        help="Number of speakers passed to clustering "
                             "(default: not specified)")
    parser.add_argument("--max-speakers", type=int, default=10,
                        help="Maximum number of speakers passed to clustering "
                             "(default: 10)")

    args = parser.parse_args()

    # is_file() also rejects directories, which would otherwise only fail
    # after the models have been loaded. Report on stderr so the diagnostic
    # is separated from normal progress output.
    if not Path(args.audio_path).is_file():
        print(f"Error: Audio file not found: {args.audio_path}", file=sys.stderr)
        sys.exit(1)

    asrx = SelfASRXFixed()
    result = asrx.process(
        args.audio_path,
        args.output,
        min_speech_duration_ms=args.min_speech_duration,
        n_speakers=args.n_speakers,
        max_speakers=args.max_speakers
    )

    # An error result has no statistics to summarize.
    if "error" in result:
        return

    print("\n[Summary]")
    print(f" Audio duration: {result['total_duration']:.2f}s")
    print(f" Speech segments: {result['n_speech_segments']}")
    print(f" Detected speakers: {result['n_speakers']}")
    print(f" Processing time: {result['processing_time']:.2f}s")
    print(f" Realtime factor: {result['realtime_factor']:.2f}x")

    print("\n[Speaker Statistics]")
    for speaker, stats in result['speaker_stats'].items():
        # Share of the whole audio duration attributed to this speaker.
        pct = stats['duration'] / result['total_duration'] * 100
        print(f" {speaker}: {stats['count']} segments, "
              f"{stats['duration']:.2f}s ({pct:.1f}%)")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user