feat: ASRX hybrid pipeline, identity history, worker fixes, checkpoint system
This commit is contained in:
@@ -1,178 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
整合 Face + ASRX 說話人分離(版本 3 - 修復 face_detected 檢查)
|
||||
"""
|
||||
|
||||
import json
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
def load_json(path: str):
    """Read the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, mode='r', encoding='utf-8') as handle:
        payload = json.load(handle)
    return payload
|
||||
|
||||
|
||||
def match_face_with_speaker_v3(face_data: Dict, asrx_data: Dict,
                               time_threshold: float = 3.0) -> List[Dict]:
    """Pair every ASRX speaker segment with the nearest frame that contains a face.

    A frame is a candidate for a segment when its ``timestamp`` lies within
    ``[start - time_threshold, end + time_threshold]`` and its ``faces`` list
    is non-empty (the face data has no ``face_detected`` field, so the list
    itself is the detection signal). Among the candidates, the frame whose
    timestamp is closest to the segment midpoint wins; ties go to the earlier
    frame.

    Args:
        face_data: Dict with a ``frames`` list; each frame is read through
            ``timestamp`` (default 0) and ``faces`` (default empty).
        asrx_data: Dict with a ``segments`` list; each segment must provide
            ``start``, ``end`` and ``speaker`` and may provide ``duration``.
        time_threshold: Slack in seconds added on both sides of a segment.

    Returns:
        One dict per segment with the keys ``start``, ``end``, ``duration``,
        ``speaker``, ``has_face``, ``face_timestamp``, ``face_location``
        (first entry of the winning frame's ``faces``) and
        ``face_count_in_range``.

    Raises:
        KeyError: If a segment lacks ``start``, ``end`` or ``speaker``.
    """
    from bisect import bisect_left, bisect_right

    face_frames = face_data.get('frames', [])
    asrx_segments = asrx_data.get('segments', [])

    print(f" Face frames: {len(face_frames)}")
    print(f" ASRX segments: {len(asrx_segments)}")

    # Only frames that actually hold faces can ever match. Sorting them once by
    # timestamp lets each segment find its window with two binary searches
    # instead of rescanning every frame for every segment.
    frames_with_faces = sorted(
        (frame for frame in face_frames if frame.get('faces', [])),
        key=lambda frame: frame.get('timestamp', 0),
    )
    timestamps = [frame.get('timestamp', 0) for frame in frames_with_faces]

    integrated = []
    total_segments = len(asrx_segments)

    for i, seg in enumerate(asrx_segments):
        start = seg['start']
        end = seg['end']
        speaker = seg['speaker']
        mid_time = (start + end) / 2

        # Inclusive window: start - threshold <= ts <= end + threshold.
        lo = bisect_left(timestamps, start - time_threshold)
        hi = bisect_right(timestamps, end + time_threshold)

        # min() returns the first minimum, i.e. the earliest frame on a tie.
        best_idx = min(
            range(lo, hi),
            key=lambda k: abs(timestamps[k] - mid_time),
            default=None,
        )
        best_frame = None if best_idx is None else frames_with_faces[best_idx]

        integrated.append({
            'start': start,
            'end': end,
            'duration': seg.get('duration', end - start),
            'speaker': speaker,
            'has_face': best_frame is not None,
            'face_timestamp': timestamps[best_idx] if best_frame is not None else None,
            'face_location': best_frame['faces'][0] if best_frame is not None else None,
            'face_count_in_range': max(hi - lo, 0),
        })

        # Progress report every 200 segments.
        if (i + 1) % 200 == 0:
            print(f" Processed {i+1}/{total_segments} segments...")

    return integrated
|
||||
|
||||
|
||||
def analyze_speaker_face(integrated: List[Dict]):
    """Summarise, per speaker, how many segments have a matched face.

    Args:
        integrated: Segment dicts carrying ``speaker``, ``duration`` and
            ``has_face``.

    Returns:
        Dict keyed by speaker with ``total_segments``, ``with_face``,
        ``without_face`` and ``total_duration`` counters.
    """
    per_speaker = {}

    for entry in integrated:
        # Create the counter record the first time a speaker shows up.
        bucket = per_speaker.setdefault(entry['speaker'], {
            'total_segments': 0,
            'with_face': 0,
            'without_face': 0,
            'total_duration': 0,
        })

        bucket['total_segments'] += 1
        bucket['total_duration'] += entry['duration']

        counter = 'with_face' if entry['has_face'] else 'without_face'
        bucket[counter] += 1

    return per_speaker
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: match faces to speakers and report statistics.

    Loads the face-detection JSON and the ASRX diarization JSON named on the
    command line, matches them with ``match_face_with_speaker_v3``, prints
    per-speaker and overall face coverage, and writes the combined result to
    ``--output`` when that option is given.

    Returns:
        Tuple ``(integrated, speaker_stats)``.
    """
    parser = argparse.ArgumentParser(description='整合 Face + ASRX 說話人')
    parser.add_argument('face_json', help='Face 檢測結果 JSON')
    parser.add_argument('asrx_json', help='ASRX 說話人分離 JSON')
    parser.add_argument('-o', '--output', help='輸出整合結果 JSON')
    parser.add_argument('--threshold', type=float, default=3.0,
                        help='時間閾值(秒)')
    # NOTE(review): --stats is accepted but never consulted below; confirm
    # whether it was meant to suppress writing the output file.
    parser.add_argument('--stats', action='store_true', help='只显示統計')

    args = parser.parse_args()

    # Load both inputs.
    print(f"[Load] Face: {args.face_json}")
    face_data = load_json(args.face_json)

    print(f"[Load] ASRX: {args.asrx_json}")
    asrx_data = load_json(args.asrx_json)

    # Match faces to speaker segments.
    print(f"\n[Match] Matching faces with speakers (threshold={args.threshold}s)...")
    integrated = match_face_with_speaker_v3(face_data, asrx_data, args.threshold)

    # Aggregate per speaker.
    print("\n[Analyze] Analyzing speaker-face correspondence...")
    speaker_stats = analyze_speaker_face(integrated)

    # Report.
    print(f"\n{'='*70}")
    print("說話人 - 人臉對應統計")
    print(f"{'='*70}")

    total_segments = len(integrated)
    total_with_face = sum(1 for item in integrated if item['has_face'])

    for speaker, stats in sorted(speaker_stats.items()):
        with_face_pct = stats['with_face'] / stats['total_segments'] * 100 if stats['total_segments'] > 0 else 0
        print(f"\n🔊 {speaker}:")
        print(f" 總片段:{stats['total_segments']}")
        print(f" 有人臉:{stats['with_face']} ({with_face_pct:.1f}%)")
        print(f" 無人臉:{stats['without_face']}")
        print(f" 總時長:{stats['total_duration']:.1f}s ({stats['total_duration']/60:.1f}分鐘)")

    # An ASRX file without segments must report 0% rather than crash with
    # ZeroDivisionError.
    total_pct = total_with_face / total_segments * 100 if total_segments else 0

    print(f"\n{'='*70}")
    print(f"總計:{total_segments} 片段,{total_with_face} 片段有人臉 ({total_pct:.1f}%)")
    print(f"{'='*70}")

    # Persist the combined result when an output path was requested.
    if args.output:
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        result = {
            'face_source': str(args.face_json),
            'asrx_source': str(args.asrx_json),
            'time_threshold': args.threshold,
            'integrated_segments': integrated,
            'speaker_stats': speaker_stats,
        }

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        print(f"\n[Save] Results saved to: {output_path}")

    return integrated, speaker_stats
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,268 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Self-implemented ASRX - 自實作說話人分離系統
|
||||
基於聲紋嵌入 + 譜聚類
|
||||
|
||||
技術架構:
|
||||
1. VAD (Silero VAD) - 語音活動檢測
|
||||
2. Speaker Encoder (ECAPA-TDNN) - 聲紋特徵提取
|
||||
3. Spectral Clustering - 譜聚類
|
||||
4. Post-processing - 後處理
|
||||
|
||||
流程:
|
||||
音頻 → VAD → 語音片段 → 聲紋嵌入 → 相似度矩陣 → 譜聚類 → 說話人 ID
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# 導入自定義模組
|
||||
from vad import load_vad_model, extract_speech_segments
|
||||
from speaker_encoder import (
|
||||
load_speaker_encoder,
|
||||
extract_speaker_embeddings_batch,
|
||||
compute_similarity_matrix,
|
||||
normalize_embeddings,
|
||||
)
|
||||
from speaker_cluster import spectral_clustering_speaker, smooth_speaker_labels
|
||||
|
||||
|
||||
class SelfASRX:
    """Speaker diarization pipeline: VAD, speaker embeddings, spectral clustering."""

    def __init__(self):
        """Load the VAD model (with its utilities) and the speaker encoder."""
        print("[SelfASRX] Initializing models...")

        print("[SelfASRX] Loading VAD model (Silero)...")
        self.vad_model, self.vad_utils = load_vad_model()

        print("[SelfASRX] Loading speaker encoder (ECAPA-TDNN)...")
        self.speaker_encoder = load_speaker_encoder()

        print("[SelfASRX] Models loaded successfully")

    def process(
        self,
        audio_path,
        output_path=None,
        min_speech_duration_ms=500,
        n_speakers=None,
        smooth_window=5,
    ):
        """Run speaker diarization on one audio file.

        Args:
            audio_path: Path of the audio file to analyse.
            output_path: Optional path of a JSON file to write the result to.
            min_speech_duration_ms: Minimum speech duration forwarded to the
                VAD step.
            n_speakers: Number of speakers; ``None`` asks the clustering step
                to estimate it.
            smooth_window: Window size for label smoothing; smoothing is
                skipped unless this is greater than 1.

        Returns:
            Result dict with ``segments``, ``speaker_stats`` and timing
            fields, or ``{"error": ..., "segments": []}`` when the VAD step
            yields no speech segments.
        """
        t_begin = time.time()
        print(f"\n[SelfASRX] Processing: {audio_path}")
        print("=" * 60)

        # Step 1: voice activity detection.
        print("\n[Step 1] Voice Activity Detection...")
        t_vad = time.time()

        speech_segments, wav, sample_rate = extract_speech_segments(
            audio_path,
            self.vad_model,
            self.vad_utils,
            min_speech_duration_ms=min_speech_duration_ms,
        )

        vad_elapsed = time.time() - t_vad
        print(f" Speech segments: {len(speech_segments)}")
        print(f" Total duration: {len(wav) / sample_rate:.2f}s")
        print(f" VAD time: {vad_elapsed:.2f}s")

        if not len(speech_segments):
            print("[SelfASRX] No speech detected!")
            return {"error": "No speech detected", "segments": []}

        # Step 2: one speaker embedding per speech segment.
        print("\n[Step 2] Speaker embedding extraction...")
        t_embed = time.time()

        # Cut the waveform into per-segment clips (seconds -> sample indices).
        clips = [
            wav[int(seg_start * sample_rate):int(seg_end * sample_rate)]
            for seg_start, seg_end in speech_segments
        ]

        raw_embeddings = extract_speaker_embeddings_batch(
            self.speaker_encoder, clips, sample_rate
        )
        embeddings = normalize_embeddings(raw_embeddings)

        embed_elapsed = time.time() - t_embed
        print(f" Embedding shape: {embeddings.shape}")
        print(f" Embedding time: {embed_elapsed:.2f}s")

        # Step 3: pairwise similarity between embeddings.
        print("\n[Step 3] Computing similarity matrix...")
        t_sim = time.time()

        similarity_matrix = compute_similarity_matrix(embeddings, method="cosine")

        sim_elapsed = time.time() - t_sim
        print(f" Similarity matrix shape: {similarity_matrix.shape}")
        print(f" Similarity time: {sim_elapsed:.2f}s")

        # Step 4: spectral clustering, then optional label smoothing.
        print("\n[Step 4] Spectral clustering...")
        t_cluster = time.time()

        auto_estimate = n_speakers is None
        speaker_labels, estimated_n_speakers = spectral_clustering_speaker(
            similarity_matrix, n_speakers=n_speakers, auto_estimate=auto_estimate
        )

        if smooth_window > 1:
            speaker_labels = smooth_speaker_labels(
                speaker_labels, window_size=smooth_window
            )

        cluster_elapsed = time.time() - t_cluster
        print(f" Estimated speakers: {estimated_n_speakers}")
        print(f" Clustering time: {cluster_elapsed:.2f}s")

        # Step 5: assemble the result structure.
        print("\n[Step 5] Building output...")

        result = {
            "audio_path": str(audio_path),
            "total_duration": len(wav) / sample_rate,
            "n_speech_segments": len(speech_segments),
            "n_speakers": int(estimated_n_speakers),
            "segments": [],
        }

        result["segments"] = [
            {
                "index": idx,
                "start": round(seg_start, 3),
                "end": round(seg_end, 3),
                "duration": round(seg_end - seg_start, 3),
                "speaker": f"SPEAKER_{int(label)}",
            }
            for idx, ((seg_start, seg_end), label) in enumerate(
                zip(speech_segments, speaker_labels)
            )
        ]

        # Per-speaker segment count and accumulated (rounded) duration.
        tally = {}
        for seg in result["segments"]:
            entry = tally.setdefault(seg["speaker"], {"count": 0, "duration": 0})
            entry["count"] += 1
            entry["duration"] += seg["duration"]

        result["speaker_stats"] = tally

        total_time = time.time() - t_begin
        result["processing_time"] = round(total_time, 2)
        result["realtime_factor"] = round(result["total_duration"] / total_time, 2)

        print("\n[SelfASRX] Processing completed!")
        print(f" Total time: {total_time:.2f}s")
        print(f" Realtime factor: {result['realtime_factor']:.2f}x")
        print(f" Detected speakers: {estimated_n_speakers}")

        # Optionally persist the result as JSON.
        if output_path:
            destination = Path(output_path)
            destination.parent.mkdir(parents=True, exist_ok=True)

            with open(destination, "w", encoding="utf-8") as handle:
                json.dump(result, handle, indent=2, ensure_ascii=False)

            print(f" Results saved to: {destination}")

        print("=" * 60)

        return result
|
||||
|
||||
|
||||
def main():
    """Parse the command line, run ``SelfASRX`` on the audio file and print a summary."""
    import argparse

    cli = argparse.ArgumentParser(
        description="Self-implemented ASRX - Speaker Diarization"
    )
    cli.add_argument("audio_path", help="Path to audio file")
    cli.add_argument("-o", "--output", help="Output JSON path")
    cli.add_argument(
        "--min-speech-duration",
        type=int,
        default=500,
        help="Minimum speech duration in ms (default: 500)",
    )
    cli.add_argument(
        "--n-speakers",
        type=int,
        default=None,
        help="Number of speakers (default: auto-estimate)",
    )
    cli.add_argument(
        "--smooth-window",
        type=int,
        default=5,
        help="Smoothing window size (default: 5)",
    )

    args = cli.parse_args()

    # Bail out early when the input file is missing.
    if not Path(args.audio_path).exists():
        print(f"Error: Audio file not found: {args.audio_path}")
        sys.exit(1)

    # Build the pipeline and run it.
    pipeline = SelfASRX()
    result = pipeline.process(
        args.audio_path,
        args.output,
        min_speech_duration_ms=args.min_speech_duration,
        n_speakers=args.n_speakers,
        smooth_window=args.smooth_window,
    )

    # Nothing to summarise when the pipeline reported an error.
    if "error" in result:
        return

    print("\n[Summary]")
    print(f" Audio duration: {result['total_duration']:.2f}s")
    print(f" Speech segments: {result['n_speech_segments']}")
    print(f" Detected speakers: {result['n_speakers']}")
    print(f" Processing time: {result['processing_time']:.2f}s")
    print(f" Realtime factor: {result['realtime_factor']:.2f}x")

    print("\n[Speaker Statistics]")
    audio_seconds = result["total_duration"]
    for speaker, stats in result["speaker_stats"].items():
        # Share of the whole recording attributed to this speaker.
        pct = stats["duration"] / audio_seconds * 100
        line = (
            f" {speaker}: {stats['count']} segments, "
            f"{stats['duration']:.2f}s ({pct:.1f}%)"
        )
        print(line)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,308 +1,728 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Self-implemented ASRX - Fixed Version
|
||||
使用魯棒的聚類算法
|
||||
SelfASRXFixed - 7 步 Hybrid Speaker Diarization Pipeline
|
||||
|
||||
Pipeline:
|
||||
1. whisper.transcribe(full_audio) → rough segments + text + language
|
||||
2. VAD scan each rough segment → refined segments
|
||||
3. whisper per refined segment → {text, language, lang_prob}
|
||||
4. ECAPA-TDNN per refined segment → 192-dim embeddings
|
||||
5. AgglomerativeClustering → speaker_labels
|
||||
6. Store all embeddings in Qdrant (payload: file_uuid, speaker_id, text, ...)
|
||||
7. High-quality embeddings → gender classify + store reference in Qdrant
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
import os
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from urllib.request import Request, urlopen
|
||||
from urllib.error import URLError
|
||||
|
||||
# 導入自定義模組
|
||||
from vad import load_vad_model, extract_speech_segments
|
||||
from speaker_encoder import (
|
||||
load_speaker_encoder,
|
||||
extract_speaker_embeddings_batch,
|
||||
normalize_embeddings
|
||||
)
|
||||
from speaker_cluster_fixed import robust_speaker_clustering
|
||||
|
||||
def _load_audio(path):
    """Read an audio file with ``soundfile`` and return ``(samples, sample_rate)``.

    Data with more than one dimension is averaged over axis 1 to obtain a
    single channel (presumably axis 1 is the channel axis; verify against
    the soundfile documentation).
    """
    import soundfile as sf

    samples, rate = sf.read(path)
    is_multichannel = len(samples.shape) > 1
    if is_multichannel:
        samples = np.mean(samples, axis=1)
    return samples, rate
|
||||
|
||||
|
||||
def _load_whisper_model(size="small"):
    """Import ``whisper_local`` on first use and return the model loaded for ``size``."""
    from whisper_local import load_model as _loader

    model = _loader(size)
    return model
|
||||
|
||||
|
||||
def _load_vad():
    """Import ``vad`` on first use and return the result of ``load_vad_model()``."""
    from vad import load_vad_model as _loader

    loaded = _loader()
    return loaded
|
||||
|
||||
|
||||
def _load_speaker_encoder():
    """Import ``speaker_encoder`` on first use and return the loaded encoder."""
    from speaker_encoder import load_speaker_encoder as _loader

    encoder = _loader()
    return encoder
|
||||
|
||||
|
||||
def _load_gender_classifier():
|
||||
try:
|
||||
from speechbrain.inference.classifiers import EncoderClassifier
|
||||
classifier = EncoderClassifier.from_hparams(
|
||||
source="speechbrain/gender-recognition-ecapa",
|
||||
run_opts={"device": "cpu"},
|
||||
)
|
||||
print("[Gender] Classifier loaded: speechbrain/gender-recognition-ecapa")
|
||||
return classifier
|
||||
except Exception as e:
|
||||
print(f"[Gender] Classifier not available: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def _ensure_speaker_collection(qdrant_url, api_key, collection):
|
||||
"""確認 Qdrant speaker collection 存在,不存在則建立 (dim=192, cosine)"""
|
||||
try:
|
||||
url = f"{qdrant_url}/collections/{collection}"
|
||||
req = Request(url, method="GET",
|
||||
headers={"api-key": api_key} if api_key else {})
|
||||
try:
|
||||
urlopen(req)
|
||||
return True
|
||||
except URLError as e:
|
||||
if getattr(e, "code", None) == 404:
|
||||
body = json.dumps({
|
||||
"vectors": {
|
||||
"size": 192,
|
||||
"distance": "Cosine"
|
||||
}
|
||||
}).encode()
|
||||
req = Request(url, data=body, method="PUT",
|
||||
headers={"Content-Type": "application/json",
|
||||
**({"api-key": api_key} if api_key else {})})
|
||||
urlopen(req)
|
||||
print(f"[Qdrant] Created collection: {collection} (dim=192)")
|
||||
return True
|
||||
raise
|
||||
except Exception as e:
|
||||
print(f"[Qdrant] Cannot access Qdrant: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def _qdrant_upsert(qdrant_url, api_key, collection, points):
|
||||
"""批量寫入 Qdrant points"""
|
||||
try:
|
||||
url = f"{qdrant_url}/collections/{collection}/points?wait=true"
|
||||
body = json.dumps({"points": points}).encode()
|
||||
headers = {"Content-Type": "application/json"}
|
||||
if api_key:
|
||||
headers["api-key"] = api_key
|
||||
req = Request(url, data=body, headers=headers, method="PUT")
|
||||
urlopen(req)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"[Qdrant] Upsert failed: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def _hash_point_id(file_uuid, label):
|
||||
"""產生一致的 point ID"""
|
||||
s = f"{file_uuid}_{label}"
|
||||
return hash(s) & 0x7FFFFFFFFFFFFFFF
|
||||
|
||||
|
||||
def _save_checkpoint(path: str, data: dict):
|
||||
"""原子寫入 checkpoint(先 .tmp 再 rename)"""
|
||||
tmp = path + ".tmp"
|
||||
Path(tmp).parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(tmp, "w", encoding="utf-8") as f:
|
||||
json.dump(data, f, indent=2, ensure_ascii=False)
|
||||
os.replace(tmp, path)
|
||||
|
||||
|
||||
def compute_embedding_quality(embeddings, labels):
    """Cosine similarity between each embedding and its cluster centroid.

    The centroid of a cluster is the mean of the embeddings carrying that
    label. A zero-norm embedding or centroid yields a similarity of 0.

    Args:
        embeddings: Array-like with one embedding per row.
        labels: Array-like with one cluster label per embedding; plain lists
            are accepted.

    Returns:
        1-D ``numpy`` array with one similarity per embedding (empty when
        there are no labels).
    """
    embeddings = np.asarray(embeddings, dtype=float)
    labels = np.asarray(labels).ravel()
    if labels.size == 0:
        return np.array([])

    # Give every row its own cluster centroid so the similarities can be
    # computed in one vectorized pass instead of one library call per row.
    centroid_rows = np.zeros_like(embeddings)
    for label in np.unique(labels):
        mask = labels == label
        centroid_rows[mask] = embeddings[mask].mean(axis=0)

    dots = np.sum(embeddings * centroid_rows, axis=1)
    norms = (np.linalg.norm(embeddings, axis=1)
             * np.linalg.norm(centroid_rows, axis=1))
    # The dot product is already 0 for zero vectors; avoid dividing by zero.
    norms[norms == 0] = 1.0
    return dots / norms
|
||||
|
||||
|
||||
class SelfASRXFixed:
|
||||
"""自實作說話人分離系統(修復版)"""
|
||||
|
||||
"""7 步 Hybrid Speaker Diarization Pipeline"""
|
||||
|
||||
def __init__(self):
|
||||
print("[SelfASRX-Fixed] Initializing models...")
|
||||
|
||||
# 載入 VAD 模型
|
||||
print("[SelfASRX-Fixed] Loading VAD model (Silero)...")
|
||||
self.vad_model, self.vad_utils = load_vad_model()
|
||||
|
||||
# 載入聲紋模型
|
||||
print("[SelfASRX-Fixed] Loading speaker encoder (ECAPA-TDNN)...")
|
||||
self.speaker_encoder = load_speaker_encoder()
|
||||
|
||||
print("[SelfASRX-Fixed] Models loaded successfully")
|
||||
|
||||
def process(self, audio_path, output_path=None,
|
||||
min_speech_duration_ms=500,
|
||||
n_speakers=None,
|
||||
max_speakers=10):
|
||||
"""處理音頻文件"""
|
||||
start_time = time.time()
|
||||
print(f"\n[SelfASRX-Fixed] Processing: {audio_path}")
|
||||
print("=" * 60)
|
||||
|
||||
# 步驟 1: VAD
|
||||
print("\n[Step 1] Voice Activity Detection...")
|
||||
step1_start = time.time()
|
||||
|
||||
speech_segments, wav, sample_rate = extract_speech_segments(
|
||||
audio_path, self.vad_model, self.vad_utils,
|
||||
min_speech_duration_ms=min_speech_duration_ms
|
||||
)
|
||||
|
||||
step1_time = time.time() - step1_start
|
||||
print(f" Speech segments: {len(speech_segments)}")
|
||||
print(f" Total duration: {len(wav)/sample_rate:.2f}s")
|
||||
print(f" VAD time: {step1_time:.2f}s")
|
||||
|
||||
if len(speech_segments) == 0:
|
||||
print("[SelfASRX-Fixed] No speech detected!")
|
||||
return {"error": "No speech detected", "segments": []}
|
||||
|
||||
# 步驟 2: 聲紋特徵提取
|
||||
print("\n[Step 2] Speaker embedding extraction...")
|
||||
step2_start = time.time()
|
||||
|
||||
# 提取語音片段音頻
|
||||
audio_segments = []
|
||||
for start_sec, end_sec in speech_segments:
|
||||
start_sample = int(start_sec * sample_rate)
|
||||
end_sample = int(end_sec * sample_rate)
|
||||
audio_segments.append(wav[start_sample:end_sample])
|
||||
|
||||
# 批量提取嵌入
|
||||
embeddings = extract_speaker_embeddings_batch(
|
||||
self.speaker_encoder, audio_segments, sample_rate
|
||||
)
|
||||
|
||||
# 正規化
|
||||
embeddings = normalize_embeddings(embeddings)
|
||||
|
||||
step2_time = time.time() - step2_start
|
||||
print(f" Embedding shape: {embeddings.shape}")
|
||||
print(f" Embedding time: {step2_time:.2f}s")
|
||||
|
||||
# 步驟 3: 魯棒聚類
|
||||
print("\n[Step 3] Robust speaker clustering...")
|
||||
step3_start = time.time()
|
||||
|
||||
speaker_labels, estimated_n_speakers = robust_speaker_clustering(
|
||||
embeddings,
|
||||
n_speakers=n_speakers,
|
||||
max_speakers=max_speakers
|
||||
)
|
||||
|
||||
step3_time = time.time() - step3_start
|
||||
print(f" Clustering time: {step3_time:.2f}s")
|
||||
|
||||
# 步驟 4: 建立輸出
|
||||
print("\n[Step 4] Building output...")
|
||||
|
||||
result = {
|
||||
"audio_path": str(audio_path),
|
||||
"total_duration": len(wav) / sample_rate,
|
||||
"n_speech_segments": len(speech_segments),
|
||||
"n_speakers": int(estimated_n_speakers),
|
||||
"segments": []
|
||||
}
|
||||
|
||||
for i, ((start, end), label) in enumerate(zip(speech_segments, speaker_labels)):
|
||||
result["segments"].append({
|
||||
"index": i,
|
||||
"start": round(start, 3),
|
||||
"end": round(end, 3),
|
||||
"duration": round(end - start, 3),
|
||||
"speaker": f"SPEAKER_{int(label)}"
|
||||
})
|
||||
|
||||
# 統計每個說話人的總時長
|
||||
speaker_stats = {}
|
||||
for seg in result["segments"]:
|
||||
speaker = seg["speaker"]
|
||||
if speaker not in speaker_stats:
|
||||
speaker_stats[speaker] = {"count": 0, "duration": 0}
|
||||
speaker_stats[speaker]["count"] += 1
|
||||
speaker_stats[speaker]["duration"] += seg["duration"]
|
||||
|
||||
result["speaker_stats"] = speaker_stats
|
||||
|
||||
total_time = time.time() - start_time
|
||||
result["processing_time"] = round(total_time, 2)
|
||||
result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
|
||||
|
||||
print("\n[SelfASRX-Fixed] Processing completed!")
|
||||
print(f" Total time: {total_time:.2f}s")
|
||||
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
|
||||
print(f" Detected speakers: {estimated_n_speakers}")
|
||||
|
||||
# 保存結果
|
||||
if output_path:
|
||||
output_path = Path(output_path)
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(result, f, indent=2, ensure_ascii=False)
|
||||
|
||||
print(f" Results saved to: {output_path}")
|
||||
|
||||
print("=" * 60)
|
||||
|
||||
return result
|
||||
print("[SelfASRX] Initializing models...")
|
||||
|
||||
print("[SelfASRX] Loading whisper model...")
|
||||
self.whisper = _load_whisper_model("small")
|
||||
|
||||
print("[SelfASRX] Loading VAD model (Silero)...")
|
||||
self.vad_model, self.vad_utils = _load_vad()
|
||||
|
||||
print("[SelfASRX] Loading speaker encoder (ECAPA-TDNN)...")
|
||||
self.speaker_encoder = _load_speaker_encoder()
|
||||
|
||||
print("[SelfASRX] Loading gender classifier...")
|
||||
self.gender_classifier = _load_gender_classifier()
|
||||
|
||||
# Qdrant 設定
|
||||
self.qdrant_url = os.environ.get("QDRANT_URL", "http://localhost:6333")
|
||||
self.qdrant_api_key = os.environ.get("QDRANT_API_KEY", "")
|
||||
schema = os.environ.get("DATABASE_SCHEMA", "public")
|
||||
self.qdrant_collection = os.environ.get(
|
||||
"QDRANT_SPEAKER_COLLECTION",
|
||||
f"momentry_{schema}_speaker"
|
||||
)
|
||||
self._qdrant_ok = False
|
||||
|
||||
print("[SelfASRX] Models loaded successfully")
|
||||
|
||||
def process(self, audio_path, output_path=None, file_uuid=None,
|
||||
max_speakers=10, quality_threshold=0.85,
|
||||
checkpoint_path=None):
|
||||
"""7 步 speaker diarization pipeline
|
||||
|
||||
def process_with_segments(self, audio_path, asr_segments, output_path=None):
|
||||
"""
|
||||
使用 ASR segment 邊界進行 speaker diarization,取代 VAD 步驟。
|
||||
|
||||
Args:
|
||||
audio_path: 音頻文件路徑(WAV)
|
||||
asr_segments: ASR segment 列表,每個包含 start/end(秒)
|
||||
output_path: 輸出 JSON 路徑(可選)
|
||||
audio_path: 音頻文件路徑 (WAV 16kHz mono)
|
||||
output_path: 輸出 JSON 路徑 (可選)
|
||||
file_uuid: 檔案 UUID (用於 Qdrant 儲存)
|
||||
max_speakers: 最大說話人數
|
||||
quality_threshold: 高品質聲紋門檻 (0-1)
|
||||
checkpoint_path: Step 3 完成後儲存 checkpoint 路徑
|
||||
|
||||
Returns:
|
||||
dict: segments, speaker_stats, n_speakers, total_duration, references
|
||||
"""
|
||||
start_time = time.time()
|
||||
print(f"\n[SelfASRX-Fixed] Processing with {len(asr_segments)} ASR segments: {audio_path}")
|
||||
print(f"\n[SelfASRX] Processing: {audio_path}")
|
||||
print("=" * 60)
|
||||
|
||||
# 載入完整音頻
|
||||
import soundfile as sf
|
||||
wav, sample_rate = sf.read(audio_path)
|
||||
if len(wav.shape) > 1:
|
||||
wav = np.mean(wav, axis=1) # 轉 mono
|
||||
print(f" Audio loaded: {len(wav)/sample_rate:.2f}s, {sample_rate}Hz")
|
||||
# 載入音頻
|
||||
wav, sample_rate = _load_audio(audio_path)
|
||||
total_duration = len(wav) / sample_rate
|
||||
print(f" Audio: {total_duration:.2f}s, {sample_rate}Hz")
|
||||
|
||||
# 使用 ASR segments 取代 VAD (audio处理用time)
|
||||
speech_segments = [(s["start_time"], s["end_time"]) for s in asr_segments]
|
||||
print(f" Speech segments from ASR: {len(speech_segments)}")
|
||||
# ── Step 1: whisper 粗略定位 (faster-whisper) ──
|
||||
print("\n[Step 1] Initial whisper transcription...")
|
||||
t1 = time.time()
|
||||
seg_gen, info = self.whisper.transcribe(audio_path)
|
||||
rough_segments = []
|
||||
for seg in seg_gen:
|
||||
rough_segments.append({"start": seg.start, "end": seg.end, "text": seg.text})
|
||||
language = info.language if info else None
|
||||
print(f" Rough segments: {len(rough_segments)}")
|
||||
print(f" Language: {language}")
|
||||
print(f" Step 1 time: {time.time() - t1:.2f}s")
|
||||
|
||||
if len(speech_segments) == 0:
|
||||
print("[SelfASRX-Fixed] No ASR segments provided!")
|
||||
return {"error": "No ASR segments", "segments": []}
|
||||
if not rough_segments:
|
||||
print("[SelfASRX] No speech detected by whisper!")
|
||||
return {"error": "No speech detected", "segments": []}
|
||||
|
||||
# 提取語音片段
|
||||
audio_segments = []
|
||||
for start_sec, end_sec in speech_segments:
|
||||
start_sample = int(start_sec * sample_rate)
|
||||
end_sample = int(end_sec * sample_rate)
|
||||
if start_sample >= len(wav):
|
||||
# ── Step 2: VAD scan 每個 rough segment 細切 ──
|
||||
print("\n[Step 2] VAD scan for refined segmentation...")
|
||||
t2 = time.time()
|
||||
refined_segments = []
|
||||
for seg in rough_segments:
|
||||
s = seg["start"]
|
||||
e = seg["end"]
|
||||
sub = self._vad_scan_segment(wav, sample_rate, s, e)
|
||||
if sub:
|
||||
refined_segments.extend(sub)
|
||||
else:
|
||||
refined_segments.append((s, e))
|
||||
print(f" Refined segments: {len(refined_segments)}")
|
||||
print(f" Step 2 time: {time.time() - t2:.2f}s")
|
||||
|
||||
if not refined_segments:
|
||||
return {"error": "No segments after VAD scan", "segments": []}
|
||||
|
||||
# ── Step 3: whisper per refined segment ──
|
||||
print("\n[Step 3] Per-segment transcription...")
|
||||
t3 = time.time()
|
||||
CHECKPOINT_INTERVAL = 50
|
||||
|
||||
segment_texts = []
|
||||
resume_from = 0
|
||||
|
||||
# 載入既有 partial checkpoint(中斷續接)
|
||||
if checkpoint_path and os.path.exists(checkpoint_path):
|
||||
try:
|
||||
with open(checkpoint_path, "r") as f:
|
||||
cp = json.load(f)
|
||||
if cp.get("checkpoint_version") == 2 and not cp.get("step3_completed"):
|
||||
saved = cp.get("segment_texts", [])
|
||||
if saved:
|
||||
resume_from = len(saved)
|
||||
segment_texts = saved
|
||||
print(f"[Step 3] Resuming from #{resume_from}/{len(refined_segments)}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
for i, (start_sec, end_sec) in enumerate(refined_segments):
|
||||
if i < resume_from:
|
||||
continue
|
||||
audio_segments.append(wav[start_sample:min(end_sample, len(wav))])
|
||||
seg_text = self._transcribe_segment(wav, sample_rate, start_sec, end_sec)
|
||||
segment_texts.append(seg_text)
|
||||
|
||||
print(f" Audio segments extracted: {len(audio_segments)}")
|
||||
if checkpoint_path and (i + 1) % CHECKPOINT_INTERVAL == 0:
|
||||
_save_checkpoint(checkpoint_path, {
|
||||
"checkpoint_version": 2,
|
||||
"step3_completed": False,
|
||||
"step3_progress": i + 1,
|
||||
"language": language,
|
||||
"total_duration": total_duration,
|
||||
"refined_segments": [[s, e] for s, e in refined_segments],
|
||||
"segment_texts": [{
|
||||
"text": st["text"],
|
||||
"language": st["language"],
|
||||
"lang_prob": st["lang_prob"],
|
||||
} for st in segment_texts],
|
||||
"file_uuid": file_uuid,
|
||||
"max_speakers": max_speakers,
|
||||
"quality_threshold": quality_threshold,
|
||||
})
|
||||
print(f"[Checkpoint] Step 3: {i+1}/{len(refined_segments)}")
|
||||
|
||||
# 批量提取聲紋嵌入
|
||||
print("\n[Step 2] Speaker embedding extraction...")
|
||||
step2_start = time.time()
|
||||
print(f" Step 3 time: {time.time() - t3:.2f}s")
|
||||
|
||||
# ── Save final checkpoint after Step 3 ──
|
||||
if checkpoint_path:
|
||||
_save_checkpoint(checkpoint_path, {
|
||||
"checkpoint_version": 2,
|
||||
"step3_completed": True,
|
||||
"language": language,
|
||||
"total_duration": total_duration,
|
||||
"refined_segments": [[s, e] for s, e in refined_segments],
|
||||
"segment_texts": [{
|
||||
"text": st["text"],
|
||||
"language": st["language"],
|
||||
"lang_prob": st["lang_prob"],
|
||||
} for st in segment_texts],
|
||||
"file_uuid": file_uuid,
|
||||
"max_speakers": max_speakers,
|
||||
"quality_threshold": quality_threshold,
|
||||
})
|
||||
print(f"[Checkpoint] Step 3 complete, saved to {checkpoint_path}")
|
||||
|
||||
# ── Step 4: ECAPA-TDNN per refined segment ──
|
||||
print("\n[Step 4] Speaker embedding extraction...")
|
||||
t4 = time.time()
|
||||
audio_segments = []
|
||||
for start_sec, end_sec in refined_segments:
|
||||
s = int(start_sec * sample_rate)
|
||||
e = int(end_sec * sample_rate)
|
||||
audio_segments.append(wav[s:min(e, len(wav))])
|
||||
|
||||
from speaker_encoder import extract_speaker_embeddings_batch, normalize_embeddings
|
||||
embeddings = extract_speaker_embeddings_batch(
|
||||
self.speaker_encoder, audio_segments, sample_rate
|
||||
)
|
||||
embeddings = normalize_embeddings(embeddings)
|
||||
step2_time = time.time() - step2_start
|
||||
print(f" Embedding shape: {embeddings.shape}")
|
||||
print(f" Embedding time: {step2_time:.2f}s")
|
||||
print(f" Embeddings: {embeddings.shape}")
|
||||
print(f" Step 4 time: {time.time() - t4:.2f}s")
|
||||
|
||||
# 聚類
|
||||
print("\n[Step 3] Robust speaker clustering...")
|
||||
step3_start = time.time()
|
||||
# ── Step 5: AgglomerativeClustering ──
|
||||
print("\n[Step 5] Speaker clustering...")
|
||||
t5 = time.time()
|
||||
from speaker_cluster_fixed import robust_speaker_clustering
|
||||
speaker_labels, estimated_n_speakers = robust_speaker_clustering(
|
||||
embeddings, n_speakers=None, max_speakers=10
|
||||
embeddings, n_speakers=None, max_speakers=max_speakers
|
||||
)
|
||||
step3_time = time.time() - step3_start
|
||||
print(f" Clustering time: {step3_time:.2f}s")
|
||||
print(f" Speakers: {estimated_n_speakers}")
|
||||
print(f" Step 5 time: {time.time() - t5:.2f}s")
|
||||
|
||||
# 建立輸出
|
||||
result = {
|
||||
"audio_path": str(audio_path),
|
||||
"total_duration": len(wav) / sample_rate,
|
||||
"n_speech_segments": len(speech_segments),
|
||||
"n_speakers": int(estimated_n_speakers),
|
||||
"segments": []
|
||||
}
|
||||
# 品質計算
|
||||
qualities = compute_embedding_quality(embeddings, speaker_labels)
|
||||
|
||||
for i, ((start, end), label) in enumerate(zip(speech_segments, speaker_labels)):
|
||||
result["segments"].append({
|
||||
"index": i,
|
||||
"start": round(start, 3),
|
||||
"end": round(end, 3),
|
||||
"duration": round(end - start, 3),
|
||||
"speaker": f"SPEAKER_{int(label)}"
|
||||
})
|
||||
|
||||
# 加入 embeddings(每個 segment 對應的 192-D speaker embedding)
|
||||
result["embeddings"] = []
|
||||
for emb in embeddings:
|
||||
result["embeddings"].append(emb.tolist())
|
||||
# 建立輸出 segments
|
||||
segments = []
|
||||
for i, ((start_sec, end_sec), label) in enumerate(
|
||||
zip(refined_segments, speaker_labels)):
|
||||
seg = {
|
||||
"start": round(start_sec, 3),
|
||||
"end": round(end_sec, 3),
|
||||
"start_frame": int(start_sec * 30),
|
||||
"end_frame": int(end_sec * 30),
|
||||
"text": segment_texts[i]["text"],
|
||||
"language": segment_texts[i]["language"],
|
||||
"lang_prob": segment_texts[i]["lang_prob"],
|
||||
"speaker": f"SPEAKER_{int(label)}",
|
||||
"speaker_id": f"SPEAKER_{int(label)}",
|
||||
"quality": float(qualities[i]),
|
||||
}
|
||||
segments.append(seg)
|
||||
|
||||
# 統計
|
||||
speaker_stats = {}
|
||||
for seg in result["segments"]:
|
||||
speaker = seg["speaker"]
|
||||
if speaker not in speaker_stats:
|
||||
speaker_stats[speaker] = {"count": 0, "duration": 0}
|
||||
speaker_stats[speaker]["count"] += 1
|
||||
speaker_stats[speaker]["duration"] += seg["duration"]
|
||||
result["speaker_stats"] = speaker_stats
|
||||
for seg in segments:
|
||||
spk = seg["speaker_id"]
|
||||
dur = seg["end"] - seg["start"]
|
||||
if spk not in speaker_stats:
|
||||
speaker_stats[spk] = {"count": 0, "duration": 0}
|
||||
speaker_stats[spk]["count"] += 1
|
||||
speaker_stats[spk]["duration"] += dur
|
||||
|
||||
result = {
|
||||
"language": language or "",
|
||||
"segments": segments,
|
||||
"n_speakers": int(estimated_n_speakers),
|
||||
"speaker_stats": speaker_stats,
|
||||
"total_duration": total_duration,
|
||||
"n_segments": len(segments),
|
||||
}
|
||||
|
||||
# ── Step 6: Store embeddings in Qdrant ──
|
||||
if file_uuid:
|
||||
print("\n[Step 6] Storing embeddings in Qdrant...")
|
||||
t6 = time.time()
|
||||
self._store_speaker_embeddings(segments, embeddings, speaker_labels,
|
||||
file_uuid)
|
||||
print(f" Step 6 time: {time.time() - t6:.2f}s")
|
||||
|
||||
# ── Step 7: High-quality classification ──
|
||||
if file_uuid:
|
||||
print("\n[Step 7] Classifying high-quality embeddings...")
|
||||
t7 = time.time()
|
||||
references = self._classify_high_quality_speakers(
|
||||
segments, embeddings, speaker_labels, file_uuid,
|
||||
wav, sample_rate, quality_threshold
|
||||
)
|
||||
if references:
|
||||
result["references"] = references
|
||||
print(f" Step 7 time: {time.time() - t7:.2f}s")
|
||||
|
||||
total_time = time.time() - start_time
|
||||
result["processing_time"] = round(total_time, 2)
|
||||
result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
|
||||
|
||||
print("\n[SelfASRX-Fixed] Processing completed!")
|
||||
print(f" Total time: {total_time:.2f}s")
|
||||
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
|
||||
print(f" Detected speakers: {estimated_n_speakers}")
|
||||
if total_duration > 0:
|
||||
result["realtime_factor"] = round(total_duration / total_time, 2)
|
||||
|
||||
# 保存輸出
|
||||
if output_path:
|
||||
import json
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
json.dump(result, f, indent=2, ensure_ascii=False)
|
||||
print(f" Results saved to: {output_path}")
|
||||
print(f"\n[SelfASRX] Saved to: {output_path}")
|
||||
|
||||
print(f"\n[SelfASRX] Done! {len(segments)} segments, "
|
||||
f"{estimated_n_speakers} speakers, "
|
||||
f"{total_time:.2f}s")
|
||||
|
||||
print("=" * 60)
|
||||
return result
|
||||
|
||||
def resume_from_checkpoint(self, checkpoint_path, audio_path,
                           output_path=None):
    """Resume from a Step 3 checkpoint and run Steps 4-7.

    Reads the JSON checkpoint, reloads the audio and then runs speaker
    embedding extraction (Step 4), clustering (Step 5), Qdrant storage
    (Step 6) and high-quality reference classification (Step 7).  Steps 6
    and 7 run only when the checkpoint carries a ``file_uuid``.

    Args:
        checkpoint_path: Path of the checkpoint JSON file.
        audio_path: Path of the audio file to slice segments from.
        output_path: Optional path; when given the result is written there
            as JSON (parent directories are created).

    Returns:
        dict: Result with ``language``, ``segments``, ``n_speakers``,
        ``speaker_stats``, ``total_duration``, ``n_segments`` and
        ``processing_time`` (plus ``realtime_factor`` and ``references``
        when available).  If the checkpoint is incomplete or inconsistent,
        a dict with ``error`` and an empty ``segments`` list is returned.
    """
    def _fail(error_msg):
        # Shared error convention: report and hand back an empty result.
        print(f"[SelfASRX] {error_msg}")
        return {"error": error_msg, "segments": []}

    print(f"\n[SelfASRX] Resuming from checkpoint: {checkpoint_path}")
    print("=" * 60)

    with open(checkpoint_path, "r", encoding="utf-8") as f:
        cp = json.load(f)

    if not cp.get("step3_completed"):
        return _fail(
            f"Checkpoint step3 not completed (progress: {cp.get('step3_progress', '?')})"
        )

    # Validate the payload before any expensive work: a missing key or a
    # segment/text count mismatch would otherwise only surface as a
    # KeyError/IndexError after embedding extraction and clustering.
    missing = [key for key in ("refined_segments", "segment_texts")
               if key not in cp]
    if missing:
        return _fail(f"Checkpoint missing required keys: {', '.join(missing)}")

    refined_segments = [tuple(s) for s in cp["refined_segments"]]
    segment_texts = cp["segment_texts"]
    if len(refined_segments) != len(segment_texts):
        return _fail(
            f"Checkpoint inconsistent: {len(refined_segments)} segments "
            f"but {len(segment_texts)} segment texts"
        )

    wav, sample_rate = _load_audio(audio_path)
    language = cp.get("language", "")
    total_duration = cp.get("total_duration", 0)
    file_uuid = cp.get("file_uuid")
    max_speakers = cp.get("max_speakers", 10)
    quality_threshold = cp.get("quality_threshold", 0.85)

    print(f" Loaded checkpoint: {len(refined_segments)} segments, "
          f"language={language}, duration={total_duration:.2f}s")

    start_time = time.time()

    # ── Step 4: ECAPA-TDNN per refined segment ──
    print("\n[Step 4] Speaker embedding extraction...")
    t4 = time.time()
    audio_segments = []
    for start_sec, end_sec in refined_segments:
        s = int(start_sec * sample_rate)
        e = int(end_sec * sample_rate)
        # Clamp the end index so a segment never runs past the audio.
        audio_segments.append(wav[s:min(e, len(wav))])

    from speaker_encoder import extract_speaker_embeddings_batch, normalize_embeddings
    embeddings = extract_speaker_embeddings_batch(
        self.speaker_encoder, audio_segments, sample_rate
    )
    embeddings = normalize_embeddings(embeddings)
    print(f" Embeddings: {embeddings.shape}")
    print(f" Step 4 time: {time.time() - t4:.2f}s")

    # ── Step 5: AgglomerativeClustering ──
    print("\n[Step 5] Speaker clustering...")
    t5 = time.time()
    from speaker_cluster_fixed import robust_speaker_clustering
    speaker_labels, estimated_n_speakers = robust_speaker_clustering(
        embeddings, n_speakers=None, max_speakers=max_speakers
    )
    print(f" Speakers: {estimated_n_speakers}")
    print(f" Step 5 time: {time.time() - t5:.2f}s")

    # Per-segment quality score
    qualities = compute_embedding_quality(embeddings, speaker_labels)

    # Build the output segments
    segments = []
    for i, ((start_sec, end_sec), label) in enumerate(
            zip(refined_segments, speaker_labels)):
        seg = {
            "start": round(start_sec, 3),
            "end": round(end_sec, 3),
            # Frame indices are derived with a fixed factor of 30 per second.
            "start_frame": int(start_sec * 30),
            "end_frame": int(end_sec * 30),
            "text": segment_texts[i]["text"],
            "language": segment_texts[i]["language"],
            "lang_prob": segment_texts[i]["lang_prob"],
            "speaker": f"SPEAKER_{int(label)}",
            "speaker_id": f"SPEAKER_{int(label)}",
            "quality": float(qualities[i]),
        }
        segments.append(seg)

    # Per-speaker statistics
    speaker_stats = {}
    for seg in segments:
        spk = seg["speaker_id"]
        dur = seg["end"] - seg["start"]
        if spk not in speaker_stats:
            speaker_stats[spk] = {"count": 0, "duration": 0}
        speaker_stats[spk]["count"] += 1
        speaker_stats[spk]["duration"] += dur

    result = {
        "language": language or "",
        "segments": segments,
        "n_speakers": int(estimated_n_speakers),
        "speaker_stats": speaker_stats,
        "total_duration": total_duration,
        "n_segments": len(segments),
    }

    # ── Step 6: Store embeddings in Qdrant ──
    if file_uuid:
        print("\n[Step 6] Storing embeddings in Qdrant...")
        t6 = time.time()
        self._store_speaker_embeddings(segments, embeddings, speaker_labels,
                                       file_uuid)
        print(f" Step 6 time: {time.time() - t6:.2f}s")

    # ── Step 7: High-quality classification ──
    if file_uuid:
        print("\n[Step 7] Classifying high-quality embeddings...")
        t7 = time.time()
        references = self._classify_high_quality_speakers(
            segments, embeddings, speaker_labels, file_uuid,
            wav, sample_rate, quality_threshold
        )
        if references:
            result["references"] = references
        print(f" Step 7 time: {time.time() - t7:.2f}s")

    total_time = time.time() - start_time
    result["processing_time"] = round(total_time, 2)
    # Guard both operands so neither a missing duration nor a zero elapsed
    # time can cause a division error.
    if total_duration > 0 and total_time > 0:
        result["realtime_factor"] = round(total_duration / total_time, 2)

    # Save the output
    if output_path:
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        print(f"\n[SelfASRX] Saved to: {output_path}")

    print(f"\n[SelfASRX] Done! {len(segments)} segments, "
          f"{estimated_n_speakers} speakers, "
          f"{total_time:.2f}s")

    return result
|
||||
|
||||
# ── Internal helpers ──
|
||||
|
||||
def _vad_scan_segment(self, wav, sample_rate, start_sec, end_sec):
    """Run the VAD fine-grained scan over one segment of the audio.

    Delegates to ``vad.scan_within_segment`` with this instance's VAD model
    and utilities and returns whatever that function returns.
    """
    from vad import scan_within_segment

    scan_args = (wav, sample_rate, start_sec, end_sec,
                 self.vad_model, self.vad_utils)
    return scan_within_segment(*scan_args)
|
||||
|
||||
def _transcribe_segment(self, wav, sample_rate, start_sec, end_sec):
    """Transcribe one segment of the audio.

    Delegates to ``whisper_local.transcribe_segment`` with this instance's
    ``whisper`` object and returns its result unchanged.
    """
    from whisper_local import transcribe_segment

    transcription = transcribe_segment(
        wav, sample_rate, start_sec, end_sec, self.whisper
    )
    return transcription
|
||||
|
||||
def _store_speaker_embeddings(self, segments, embeddings, labels, file_uuid):
    """Step 6: store every segment embedding in Qdrant.

    Args:
        segments: Output segment dicts (``speaker_id``, ``text``,
            ``language``, ``start`` and ``end`` are read).
        embeddings: One embedding per segment; each must support ``tolist()``.
        labels: Cluster labels, iterated in lockstep with the segments.
        file_uuid: Identifier used to derive point ids and stored in the payload.

    Returns:
        bool: False when Qdrant is unavailable; True when there was nothing
        to store; otherwise the result of the upsert call.
    """
    if not self._ensure_qdrant():
        # Always return a bool (the original bare ``return`` produced None).
        return False

    points = []
    for i, (seg, emb, _label) in enumerate(
            zip(segments, embeddings, labels)):
        points.append({
            # Point id is derived from the file and the segment index.
            "id": _hash_point_id(file_uuid, f"{i}"),
            "vector": emb.tolist(),
            "payload": {
                "type": "speaker_embedding",
                "file_uuid": file_uuid,
                "speaker_id": seg["speaker_id"],
                "text": seg["text"],
                "language": seg["language"],
                "start_time": seg["start"],
                "end_time": seg["end"],
            }
        })

    if not points:
        # Nothing to send; skip the request instead of upserting an empty batch.
        print(" No speaker embeddings to store")
        return True

    ok = _qdrant_upsert(self.qdrant_url, self.qdrant_api_key,
                        self.qdrant_collection, points)
    if ok:
        print(f" Stored {len(points)} speaker embeddings to Qdrant")
    else:
        print(f" Failed to store {len(points)} speaker embeddings to Qdrant")
    return ok
|
||||
|
||||
def _classify_high_quality_speakers(self, segments, embeddings, labels,
                                    file_uuid, wav, sample_rate,
                                    threshold=0.85):
    """Step 7: build one high-quality reference per speaker and upsert it.

    For every cluster label, the segments whose quality score is at least
    ``threshold`` are averaged into a re-normalized centroid.  The centroid
    is upserted to Qdrant together with summary metadata and, when a gender
    classifier is configured, a gender guess made on the best segment's audio.

    Args:
        segments: Output segment dicts (``start``, ``end``, ``text`` and
            ``language`` are read).
        embeddings: Per-segment embeddings (array-like, one row per segment).
        labels: Per-segment cluster labels (array-like).
        file_uuid: Identifier used to derive reference ids and stored in the payload.
        wav: Audio samples; sliced by sample index for gender classification.
        sample_rate: Sample rate used to convert seconds to sample indices.
        threshold: Minimum quality for a segment to count as high quality.

    Returns:
        list[dict]: One summary per speaker that has high-quality segments
        (``speaker_id``, ``n_segments``, ``avg_quality``, ``gender``),
        ordered by label; empty when no segment reaches the threshold.
    """
    # Boolean-mask indexing below needs real arrays.  With a plain list,
    # ``labels == label`` is a scalar False and every speaker is skipped.
    embeddings = np.asarray(embeddings)
    labels = np.asarray(labels)

    qualities = compute_embedding_quality(embeddings, labels)
    high_mask = qualities >= threshold

    if not np.any(high_mask):
        print(" No high-quality embeddings found")
        return []

    references = []
    # np.unique returns the labels sorted, so the output order is deterministic.
    for label in np.unique(labels):
        mask = (labels == label) & high_mask
        if not np.any(mask):
            continue
        high_indices = [int(i) for i in np.flatnonzero(mask)]
        high_segs = [segments[i] for i in high_indices]

        # Index of the highest-quality segment of this speaker
        best_idx = high_indices[int(np.argmax(qualities[mask]))]
        best_seg = segments[best_idx]

        centroid = np.mean(embeddings[mask], axis=0)
        norm = np.linalg.norm(centroid)
        if norm > 0:
            centroid = centroid / norm

        avg_quality = float(np.mean(qualities[mask]))
        speaker_id = f"SPEAKER_{int(label)}"
        text_samples = [s["text"] for s in high_segs[:5] if s["text"]]
        total_dur = sum(s["end"] - s["start"] for s in high_segs)

        ref_id = _hash_point_id(file_uuid, f"ref_{label}")
        ref_payload = {
            "type": "speaker_reference",
            "file_uuid": file_uuid,
            "speaker_id": speaker_id,
            "n_segments": int(np.sum(mask)),
            "avg_quality": avg_quality,
            "total_duration": round(total_dur, 2),
            "language": best_seg.get("language", ""),
            "text_samples": text_samples,
            # Defaults; overwritten below when classification succeeds.
            "gender": "unknown",
            "gender_conf": 0.0,
        }

        # Gender classification on the audio of the best segment
        if self.gender_classifier is not None:
            try:
                import torch
                seg_start = int(best_seg["start"] * sample_rate)
                seg_end = int(best_seg["end"] * sample_rate)
                seg_wav = wav[seg_start:min(seg_end, len(wav))]
                seg_tensor = torch.from_numpy(seg_wav).float().unsqueeze(0)
                # The classifier is fed the raw audio batch.
                out = self.gender_classifier.classify_batch(seg_tensor)
                probs = torch.softmax(out[0], dim=-1).squeeze().cpu().detach().numpy()
                if len(probs) >= 2:
                    idx = int(np.argmax(probs))
                    # NOTE(review): assumes class index 0 is "male" -- confirm
                    # against the classifier's label encoder.
                    ref_payload["gender"] = "male" if idx == 0 else "female"
                    ref_payload["gender_conf"] = float(probs[idx])
            except Exception as exc:
                # Best effort: a classifier failure must not abort Step 7.
                print(f"[Gender] Classify error: {exc}")
                ref_payload["gender"] = "unknown"
                ref_payload["gender_conf"] = 0.0

        _qdrant_upsert(self.qdrant_url, self.qdrant_api_key,
                       self.qdrant_collection, [{
                           "id": ref_id,
                           "vector": centroid.tolist(),
                           "payload": ref_payload,
                       }])

        references.append({
            "speaker_id": speaker_id,
            "n_segments": int(np.sum(mask)),
            "avg_quality": avg_quality,
            "gender": ref_payload["gender"],
        })

        print(f" Ref: {speaker_id}, gender={ref_payload['gender']}"
              f" ({ref_payload['gender_conf']:.2f}), q={avg_quality:.3f}")

    return references
|
||||
|
||||
def _ensure_qdrant(self):
    """Return whether the Qdrant collection is usable.

    While the cached flag is falsy, ``_ensure_speaker_collection`` is called
    again and its result is cached; once the flag is truthy it is returned
    without another call.
    """
    if self._qdrant_ok:
        return self._qdrant_ok

    self._qdrant_ok = _ensure_speaker_collection(
        self.qdrant_url, self.qdrant_api_key, self.qdrant_collection
    )
    return self._qdrant_ok
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the SelfASRX hybrid diarization pipeline.

    Runs the full pipeline on an audio file, or, with ``--resume``, continues
    from a Step 3 checkpoint.  Exits with status 1 when the audio file or the
    checkpoint does not exist.  A short summary is printed unless the result
    carries an ``error`` key.
    """
    import argparse

    parser = argparse.ArgumentParser(description="SelfASRX - Hybrid Speaker Diarization")
    parser.add_argument("audio_path", help="Path to audio file (WAV)")
    parser.add_argument("-o", "--output", help="Output JSON path")
    parser.add_argument("--file-uuid", help="File UUID for Qdrant storage")
    parser.add_argument("--max-speakers", type=int, default=10)
    parser.add_argument("--quality-threshold", type=float, default=0.85)
    parser.add_argument("--resume", help="Checkpoint path to resume from")
    parser.add_argument("--checkpoint", help="Save checkpoint path after Step 3")
    args = parser.parse_args()

    # Both modes read the audio (resuming re-slices it for the embeddings),
    # so check it once up front, before the models are loaded.
    if not Path(args.audio_path).exists():
        print(f"Error: Audio file not found: {args.audio_path}")
        sys.exit(1)

    if args.resume and not Path(args.resume).exists():
        print(f"Error: Checkpoint not found: {args.resume}")
        sys.exit(1)

    asrx = SelfASRXFixed()

    if args.resume:
        result = asrx.resume_from_checkpoint(
            args.resume, args.audio_path,
            output_path=args.output,
        )
    else:
        result = asrx.process(
            args.audio_path,
            output_path=args.output,
            file_uuid=args.file_uuid,
            max_speakers=args.max_speakers,
            quality_threshold=args.quality_threshold,
            checkpoint_path=args.checkpoint,
        )

    if "error" not in result:
        print("\n[Summary]")
        print(f" Duration: {result['total_duration']:.2f}s")
        print(f" Segments: {result['n_segments']}")
        print(f" Speakers: {result['n_speakers']}")
        if "references" in result:
            for ref in result["references"]:
                print(f" {ref['speaker_id']}: gender={ref['gender']}, "
                      f"quality={ref['avg_quality']:.3f}")
|
||||
|
||||
|
||||
# Run the CLI only when the file is executed as a script.
if __name__ == "__main__":
    main()
||||
@@ -1,280 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Speaker Audio Player - 說話人語音播放器
|
||||
從 ASRX 結果中提取並播放每個說話人的語音片段
|
||||
"""
|
||||
|
||||
import json
|
||||
import argparse
|
||||
import subprocess
|
||||
import tempfile
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
|
||||
|
||||
def load_asrx_result(result_path: str) -> Dict:
    """Read an ASRX result file (UTF-8 JSON) and return the parsed content."""
    with open(result_path, "r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed
|
||||
|
||||
|
||||
def extract_audio_segment(
    audio_path: str, start_sec: float, end_sec: float, output_path: str
) -> bool:
    """Extract one audio segment with ffmpeg.

    The segment is written as 16 kHz mono 16-bit PCM, overwriting
    ``output_path`` if it exists.

    Args:
        audio_path: Path of the source audio.
        start_sec: Segment start in seconds.
        end_sec: Segment end in seconds.
        output_path: Path of the file to write.

    Returns:
        bool: True when ffmpeg exits with status 0; False when the range is
        empty or reversed, ffmpeg cannot be started, or it reports an error.
    """
    duration = end_sec - start_sec
    if duration <= 0:
        # An empty or reversed range cannot produce audio; skip ffmpeg.
        return False

    cmd = [
        "ffmpeg",
        "-y",
        "-i",
        audio_path,
        "-ss",
        str(start_sec),
        "-t",
        str(duration),
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        output_path,
    ]

    try:
        # Only the exit status is used, so discard ffmpeg's output rather
        # than capturing and decoding it.
        result = subprocess.run(
            cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
        )
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        print(f"Error extracting audio: {e}")
        return False
    return result.returncode == 0
|
||||
|
||||
|
||||
def play_audio(audio_path: str) -> bool:
    """Play an audio file with the first available command-line player.

    ``afplay`` (macOS) is tried first, then ``aplay`` (Linux).  The player is
    looked up on ``PATH``, so installs outside ``/usr/bin`` are found and the
    executable that is checked is the one that is run.

    Args:
        audio_path: Path of the audio file to play.

    Returns:
        bool: True when a player ran and exited successfully; False when no
        player is installed or playback failed.
    """
    import shutil

    player = shutil.which("afplay") or shutil.which("aplay")
    if player is None:
        print(
            "No audio player found. Please install afplay (macOS) or aplay (Linux)"
        )
        return False

    try:
        subprocess.run([player, audio_path], check=True)
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        print(f"Error playing audio: {e}")
        return False
    return True
|
||||
|
||||
|
||||
def group_segments_by_speaker(segments: List[Dict]) -> Dict[str, List[Dict]]:
    """Bucket segments by their ``speaker`` value.

    Speakers appear in first-seen order; each bucket is sorted by ``start``.
    """
    grouped = {}
    for segment in segments:
        grouped.setdefault(segment["speaker"], []).append(segment)

    # Order every speaker's segments chronologically.
    for bucket in grouped.values():
        bucket.sort(key=lambda item: item["start"])

    return grouped
|
||||
|
||||
|
||||
def play_speaker_segments(
    audio_path: str,
    result_path: str,
    speaker_id: str = None,
    limit: int = None,
    temp_dir: str = None,
):
    """Extract and play the segments of one or all speakers.

    Args:
        audio_path: Path of the original audio.
        result_path: Path of the ASRX result JSON.
        speaker_id: Speaker to play; None plays every speaker in sorted order.
        limit: Maximum number of segments per speaker; None (or 0) plays all.
        temp_dir: Directory for the extracted clips.  Created when missing;
            when None a fresh temporary directory is used.
    """
    # Load the result
    print(f"[Load] Loading ASRX result: {result_path}")
    result = load_asrx_result(result_path)

    segments = result.get("segments", [])
    total_duration = result.get("total_duration", 0)

    print(f"[Info] Total segments: {len(segments)}")
    print(f"[Info] Total duration: {total_duration / 60:.1f} minutes")

    # Group by speaker
    speaker_segments = group_segments_by_speaker(segments)

    # Choose the speakers
    if speaker_id:
        speakers_to_play = [speaker_id]
    else:
        speakers_to_play = sorted(speaker_segments.keys())

    # Prepare the clip directory
    if temp_dir is None:
        temp_dir = tempfile.mkdtemp(prefix="speaker_audio_")
    else:
        # A caller-supplied directory may not exist yet; without it every
        # extraction would fail.
        os.makedirs(temp_dir, exist_ok=True)

    print(f"\n[Info] Temp directory: {temp_dir}")
    print(f"[Info] Speakers to play: {speakers_to_play}")
    print("=" * 60)

    # Play each speaker's segments
    for speaker in speakers_to_play:
        if speaker not in speaker_segments:
            print(f"\n[Warning] Speaker {speaker} not found!")
            continue

        segs = speaker_segments[speaker]
        if limit:
            segs = segs[:limit]

        print(f"\n▶️ {speaker} ({len(segs)} segments)")
        print("-" * 60)

        for i, seg in enumerate(segs, 1):
            start = seg["start"]
            end = seg["end"]
            # Results without a "duration" key are still playable.
            duration = seg["duration"] if "duration" in seg else end - start

            # Extract the clip
            temp_audio = os.path.join(temp_dir, f"{speaker}_{i:03d}.wav")

            print(
                f" [{i:3d}] {start:7.2f}s - {end:7.2f}s ({duration:5.2f}s) ... ",
                end="",
                flush=True,
            )

            if extract_audio_segment(audio_path, start, end, temp_audio):
                print("✅", end="", flush=True)

                # Play it
                if play_audio(temp_audio):
                    print(" ▶️ Played")
                else:
                    print(" ❌ Play failed")
            else:
                print(" ❌ Extract failed")

        print()
|
||||
|
||||
|
||||
def show_speaker_stats(result_path: str):
    """Print per-speaker segment counts and speaking time, longest first.

    Args:
        result_path: Path of the ASRX result JSON.
    """
    result = load_asrx_result(result_path)

    segments = result.get("segments", [])
    speaker_segments = group_segments_by_speaker(segments)

    print("\n" + "=" * 60)
    print("說話人統計")
    print("=" * 60)

    # Collect (speaker, segment count, spoken seconds)
    speaker_stats = []
    for speaker, segs in speaker_segments.items():
        # Fall back to end - start for results without a "duration" key.
        spoken = sum(
            seg["duration"] if "duration" in seg else seg["end"] - seg["start"]
            for seg in segs
        )
        speaker_stats.append((speaker, len(segs), spoken))

    # Sort by speaking time, longest first
    speaker_stats.sort(key=lambda x: x[2], reverse=True)

    total_duration = result.get("total_duration", 0)

    for speaker, count, duration in speaker_stats:
        # Avoid dividing by zero when the total duration is missing.
        pct = duration / total_duration * 100 if total_duration > 0 else 0
        print(f"{speaker:12} {count:4} segments {duration:8.1f}s ({pct:5.1f}%)")

    print("=" * 60)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point of the speaker audio player.

    With ``--stats`` only the speaker statistics of the result file are
    printed; otherwise the selected segments are extracted and played.
    Missing input files are reported with a message and the function returns.
    """
    parser = argparse.ArgumentParser(
        description="Speaker Audio Player - 播放說話人語音片段",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# 顯示說話人統計
python3 speaker_audio_player.py --stats result.json

# 播放所有說話人的前 3 個片段
python3 speaker_audio_player.py audio.wav result.json --limit 3

# 播放特定說話人的所有片段
python3 speaker_audio_player.py audio.wav result.json --speaker SPEAKER_0

# 播放 SPEAKER_1 的前 5 個片段
python3 speaker_audio_player.py audio.wav result.json --speaker SPEAKER_1 --limit 5
""",
    )

    parser.add_argument("audio_path", nargs="?", help="原始音頻文件路徑")
    parser.add_argument("result_path", help="ASRX 結果 JSON 路徑")
    parser.add_argument("--stats", action="store_true", help="只显示說話人統計")
    parser.add_argument("--speaker", type=str, help="指定說話人 ID(如 SPEAKER_0)")
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="每個說話人最多播放幾個片段(None=全部)",
    )
    parser.add_argument("--temp-dir", type=str, default=None, help="臨時目錄路徑")

    args = parser.parse_args()

    # The result file is needed in both modes, so validate it first; a
    # missing file with --stats would otherwise end in a traceback.
    if not Path(args.result_path).exists():
        print(f"Error: Result file not found: {args.result_path}")
        return

    if args.stats:
        show_speaker_stats(args.result_path)
        return

    if not args.audio_path:
        print("Error: audio_path is required unless --stats is specified")
        parser.print_help()
        return

    if not Path(args.audio_path).exists():
        print(f"Error: Audio file not found: {args.audio_path}")
        return

    play_speaker_segments(
        args.audio_path,
        args.result_path,
        speaker_id=args.speaker,
        limit=args.limit,
        temp_dir=args.temp_dir,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
65
scripts/asrx_self/speaker_classifier.py
Normal file
65
scripts/asrx_self/speaker_classifier.py
Normal file
@@ -0,0 +1,65 @@
|
||||
"""
|
||||
Speaker Classifier - 聲紋品質評估與性別分類
|
||||
|
||||
提供品質計算與性別分類功能,作為 main_fixed.py 的輔助模組。
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def compute_embedding_quality(embeddings, labels):
    """Score each embedding by cosine similarity to its cluster centroid.

    The centroid of a cluster is the mean of its embeddings.  All
    similarities are computed with NumPy, one vectorized step per cluster.

    Args:
        embeddings: [n_segments, dim] embedding matrix (array-like).
        labels: [n_segments] cluster labels (array-like).

    Returns:
        numpy.ndarray: [n_segments] cosine similarities.  Values lie in
        [-1, 1]; an embedding or centroid with zero norm scores 0.
    """
    embeddings = np.asarray(embeddings, dtype=float)
    labels = np.asarray(labels)

    if len(embeddings) == 0:
        return np.array([])

    emb_norms = np.linalg.norm(embeddings, axis=1)
    qualities = np.zeros(len(embeddings))

    for label in np.unique(labels):
        mask = labels == label
        members = embeddings[mask]
        centroid = members.mean(axis=0)

        dots = members @ centroid
        denom = emb_norms[mask] * np.linalg.norm(centroid)
        # Zero-norm vectors have no direction; score them 0 instead of
        # dividing by zero.
        qualities[mask] = np.divide(
            dots, denom, out=np.zeros_like(dots), where=denom > 0
        )

    return qualities
|
||||
|
||||
|
||||
def classify_gender(audio_wav, sample_rate, classifier):
    """Classify the gender of the speaker in an audio segment.

    Args:
        audio_wav: Audio waveform (numpy array).
        sample_rate: Sample rate; accepted for interface symmetry but not
            used by this function.
        classifier: Object exposing ``classify_batch``; None disables
            classification.

    Returns:
        dict: ``{"gender": "male"|"female"|"unknown", "confidence": float}``.
        ``unknown`` with confidence 0.0 is returned when there is no
        classifier, the audio is empty, fewer than two class probabilities
        come back, or classification fails.
    """
    default = {"gender": "unknown", "confidence": 0.0}
    if classifier is None or len(audio_wav) == 0:
        return default
    try:
        import torch
        seg_tensor = torch.from_numpy(audio_wav).float().unsqueeze(0)
        out = classifier.classify_batch(seg_tensor)
        probs = torch.softmax(out[0], dim=-1).squeeze().cpu().detach().numpy()
        # squeeze() turns a single-class output into a 0-d array; make it
        # 1-D so len() is well defined.
        probs = np.atleast_1d(probs)
        if len(probs) >= 2:
            idx = int(np.argmax(probs))
            # NOTE(review): assumes class index 0 is "male" -- confirm
            # against the classifier's label encoder.
            label = "male" if idx == 0 else "female"
            return {"gender": label, "confidence": float(probs[idx])}
    except Exception as exc:
        # Best effort: report the failure instead of hiding it, then fall
        # back to the default.
        print(f"[Gender] Classify error: {exc}")
    return default
|
||||
@@ -1,310 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Speaker Clustering - 說話人聚類
|
||||
使用譜聚類算法將聲紋嵌入分組
|
||||
|
||||
技術來源:
|
||||
- 譜聚類:Shi & Malik (2000), IEEE TPAMI
|
||||
- 論文:https://ieeexplore.ieee.org/document/868688
|
||||
- 應用於說話人分離:Wooters & Huijbregts (2008), ICASSP
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from sklearn.cluster import SpectralClustering, AgglomerativeClustering
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
|
||||
def estimate_n_speakers_eigengap(similarity_matrix, max_speakers=10):
    """Estimate the number of speakers with the eigengap heuristic.

    The eigenvalues of the similarity matrix are sorted in descending order
    and truncated to the first ``max_speakers``; the position of the largest
    absolute gap between consecutive eigenvalues gives the estimate.

    Args:
        similarity_matrix: Similarity matrix [n, n].
        max_speakers: Maximum number of speakers considered.

    Returns:
        Estimated number of speakers, limited to ``max_speakers`` and then
        raised to at least 2.
    """
    spectrum = np.linalg.eigvalsh(similarity_matrix)
    leading = np.sort(spectrum)[::-1][:max_speakers]

    gaps = np.diff(leading)

    # With fewer than two eigenvalues there is no gap to inspect.
    candidate = np.argmax(np.abs(gaps)) + 1 if len(gaps) > 0 else 1

    return max(2, min(candidate, max_speakers))
|
||||
|
||||
|
||||
def estimate_n_speakers_silhouette(embeddings, max_speakers=10):
    """Estimate the number of speakers by maximizing the silhouette score.

    Agglomerative clustering is run for every candidate cluster count from 2
    upwards, and the count with the highest silhouette score is kept.

    Args:
        embeddings: Embedding matrix [n, d].
        max_speakers: Maximum number of speakers considered.

    Returns:
        The best-scoring cluster count; 2 when no candidate was evaluated.
    """
    from sklearn.metrics import silhouette_score

    top_score, top_n = -1, 2

    # Candidate counts stay below the number of samples.
    upper = min(max_speakers + 1, len(embeddings))
    for n_clusters in range(2, upper):
        model = AgglomerativeClustering(n_clusters=n_clusters)
        assignment = model.fit_predict(embeddings)

        # Skip a partition that collapsed into a single cluster.
        if len(np.unique(assignment)) <= 1:
            continue

        current = silhouette_score(embeddings, assignment)
        if current > top_score:
            top_score, top_n = current, n_clusters

    return top_n
|
||||
|
||||
|
||||
def spectral_clustering_speaker(
    similarity_matrix, n_speakers=None, auto_estimate=True, max_speakers=10
):
    """Assign speakers to segments with spectral clustering.

    Args:
        similarity_matrix: Similarity matrix [n, n].
        n_speakers: Number of speakers; when None it is estimated (if
            ``auto_estimate``) or defaults to 2.
        auto_estimate: Whether to estimate the speaker count automatically.
        max_speakers: Maximum number of speakers for the estimate.

    Returns:
        speaker_labels: Speaker label per segment [n,].
        n_speakers: Number of speakers used (2 when the fallback is taken).
    """
    n_segments = len(similarity_matrix)

    # Sanitize: replace NaN/inf, force a unit diagonal, clamp to [-1, 1].
    affinity = np.nan_to_num(
        similarity_matrix, nan=0.5, posinf=1.0, neginf=-1.0
    )
    np.fill_diagonal(affinity, 1.0)
    affinity = np.clip(affinity, -1.0, 1.0)

    if n_speakers is None:
        if auto_estimate:
            n_speakers = estimate_n_speakers_eigengap(
                affinity, max_speakers=max_speakers
            )
            print(f"[Clustering] Estimated n_speakers: {n_speakers}")
        else:
            n_speakers = 2  # default

    # Never ask for more clusters than there are samples.
    n_speakers = min(n_speakers, n_segments)

    print(f"[Clustering] Running spectral clustering with {n_speakers} clusters...")

    try:
        model = SpectralClustering(
            n_clusters=int(n_speakers),
            affinity="precomputed",
            assign_labels="kmeans",
            random_state=42,
            n_init=10,
        )
        assigned = model.fit_predict(affinity)

        print("[Clustering] Spectral clustering completed")
        print(f"[Clustering] n_speakers: {n_speakers}")
        print(f"[Clustering] n_segments: {n_segments}")

        return assigned, n_speakers

    except Exception as e:
        print(f"[Clustering] Spectral clustering failed: {e}")
        print("[Clustering] Using fallback: 2 speakers")
        # Simple split: first half is speaker 0, the rest is speaker 1.
        half = n_segments // 2
        fallback = np.array([0 if i < half else 1 for i in range(n_segments)])
        return fallback, 2
|
||||
|
||||
|
||||
def agglomerative_clustering_speaker(
    embeddings, n_speakers=None, threshold=0.5, max_speakers=10
):
    """
    Cluster speaker embeddings with average-linkage agglomerative clustering.

    Args:
        embeddings: embedding matrix, one row per segment
        n_speakers: number of clusters to produce; when None it is estimated
            from the mean nearest-neighbour cosine distance
        threshold: distance scale used by the automatic estimate
            (estimate = mean nearest-neighbour distance / threshold)
        max_speakers: upper bound applied to the automatic estimate only

    Returns:
        speaker_labels: integer cluster label per segment
        n_speakers: number of clusters actually used
    """
    n_segments = len(embeddings)

    # AgglomerativeClustering rejects inputs with fewer than two samples, so
    # answer the degenerate cases directly instead of crashing.
    if n_segments < 2:
        return np.zeros(n_segments, dtype=int), n_segments

    if n_speakers is None:
        from sklearn.metrics.pairwise import cosine_distances

        # Only the first 100 segments are sampled for the estimate, so compute
        # just those rows rather than the full n x n distance matrix.
        sample = cosine_distances(embeddings[:100], embeddings)

        # After sorting each row, column 0 is the segment's distance to itself
        # and column 1 is its nearest neighbour.
        nearest = np.sort(sample, axis=1)[:, 1]
        avg_dist = np.mean(nearest)

        n_speakers = max(2, int(avg_dist / threshold))
        n_speakers = min(n_speakers, max_speakers)

    # Never ask for more clusters than samples, nor for fewer than one.
    n_speakers = max(1, min(int(n_speakers), n_segments))

    clustering = AgglomerativeClustering(
        n_clusters=n_speakers, metric="cosine", linkage="average"
    )
    speaker_labels = clustering.fit_predict(embeddings)

    print("[Clustering] Agglomerative clustering completed")
    print(f"[Clustering] n_speakers: {n_speakers}")

    return speaker_labels, n_speakers
|
||||
|
||||
|
||||
def smooth_speaker_labels(speaker_labels, window_size=5):
    """
    Smooth a label sequence with a sliding majority vote (noise removal).

    Each label is replaced by the most frequent label in the window of
    ``window_size // 2`` neighbours on either side (truncated at the ends).
    Ties resolve to the smallest label value.

    Args:
        speaker_labels: original per-segment speaker labels
        window_size: smoothing window size; values below 1 are treated as 1
            (no smoothing)

    Returns:
        smoothed_labels: new array holding the smoothed labels
    """
    labels = np.asarray(speaker_labels)
    smoothed = np.copy(labels)
    total = len(labels)

    # A non-positive window would yield empty slices; clamp it to a no-op.
    half_window = max(int(window_size), 1) // 2

    for i in range(total):
        window = labels[max(0, i - half_window):min(total, i + half_window + 1)]
        # np.unique returns sorted values, and argmax picks the first maximum,
        # so a tie goes to the smallest label.
        values, counts = np.unique(window, return_counts=True)
        smoothed[i] = values[np.argmax(counts)]

    return smoothed
|
||||
|
||||
|
||||
def compute_diarization_purity(speaker_labels, ground_truth_labels=None):
    """
    Score a diarization result against optional reference labels.

    Args:
        speaker_labels: predicted speaker labels
        ground_truth_labels: reference speaker labels (optional)

    Returns:
        The constant placeholder 0.5 when no reference labels are given;
        otherwise the adjusted Rand score between the reference and the
        prediction.
        NOTE(review): despite the function name this is not a purity value,
        and an adjusted Rand score is not confined to 0-1.
    """
    # Without a reference there is nothing to compare against.
    if ground_truth_labels is None:
        return 0.5

    from sklearn.metrics import adjusted_rand_score

    return adjusted_rand_score(ground_truth_labels, speaker_labels)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test: cluster synthetic embeddings for a known number of speakers.
    print("[Test] Testing speaker clustering algorithms")

    np.random.seed(42)
    n_speakers = 3
    n_segments_per_speaker = 20

    # Each simulated speaker gets its own centre; segments are noisy copies.
    embeddings = []
    for speaker_idx in range(n_speakers):
        center = np.random.randn(192) * 2 + speaker_idx * 3
        embeddings.extend(
            center + np.random.randn(192) * 0.5
            for _ in range(n_segments_per_speaker)
        )
    embeddings = np.array(embeddings)
    print(f"[Test] Generated {len(embeddings)} embeddings for {n_speakers} speakers")

    # Pairwise similarity matrix
    similarity = cosine_similarity(embeddings)
    print(f"[Test] Similarity matrix shape: {similarity.shape}")

    # Speaker-count estimates
    estimated_n = estimate_n_speakers_eigengap(similarity, max_speakers=10)
    print(f"[Test] Estimated n_speakers (eigengap): {estimated_n}")

    estimated_n_silhouette = estimate_n_speakers_silhouette(embeddings, max_speakers=10)
    print(f"[Test] Estimated n_speakers (silhouette): {estimated_n_silhouette}")

    # Spectral clustering with automatic speaker-count estimation
    labels, n_clusters = spectral_clustering_speaker(
        similarity, n_speakers=None, auto_estimate=True
    )

    print("\n[Test] Clustering results:")
    print(f" True n_speakers: {n_speakers}")
    print(f" Estimated n_speakers: {n_clusters}")
    print(f" Unique labels: {np.unique(labels)}")

    # Size of each cluster
    cluster_ids, cluster_sizes = np.unique(labels, return_counts=True)
    for label, count in zip(cluster_ids, cluster_sizes):
        print(f" Cluster {label}: {count} segments")
|
||||
@@ -1,431 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Speaker Player GUI - 說話人語音播放器(圖形界面)
|
||||
使用 tkinter 顯示播放進度和 Speaker ID
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import tempfile
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import tkinter as tk
|
||||
from tkinter import ttk, filedialog, messagebox
|
||||
|
||||
HAS_TKINTER = True
|
||||
except ImportError:
|
||||
HAS_TKINTER = False
|
||||
|
||||
|
||||
class SpeakerPlayerGUI:
    """Tkinter GUI that plays the audio segments of each diarized speaker.

    The user selects an audio file and an ASRX result JSON. Segments are
    grouped per speaker and can be played singly or all in sequence. Each
    clip is cut to a temporary 16 kHz mono WAV with ffmpeg and played with
    afplay/aplay from a daemon thread; UI updates are scheduled through
    ``root.after``.
    """

    def __init__(self, root):
        self.root = root
        self.root.title("🎬 Speaker Audio Player - Face Integration")
        self.root.geometry("1100x800")

        # Loaded data and playback state
        self.audio_path = None
        self.result_path = None
        self.face_path = None
        self.result_data = None
        self.face_data = None
        self.integrated_data = None
        self.speaker_segments = {}
        self.speakers = []
        self.current_speaker_idx = 0
        self.is_playing = False
        self.stop_flag = False
        # Handle of the running afplay/aplay process so Stop can end it.
        self._player_proc = None

        self.create_widgets()

    def create_widgets(self):
        """Build the file pickers, the two list panes and the playback bar."""
        # Top: file selection
        top_frame = ttk.Frame(self.root, padding="10")
        top_frame.pack(fill=tk.X)

        ttk.Label(top_frame, text="📁 Audio:").pack(side=tk.LEFT)
        self.audio_label = ttk.Label(top_frame, text="未選擇", width=50)
        self.audio_label.pack(side=tk.LEFT, padx=5)
        ttk.Button(top_frame, text="選擇音頻", command=self.select_audio).pack(
            side=tk.LEFT, padx=5
        )

        ttk.Label(top_frame, text=" 📊 Result:").pack(side=tk.LEFT, padx=(20, 0))
        self.result_label = ttk.Label(top_frame, text="未選擇", width=50)
        self.result_label.pack(side=tk.LEFT, padx=5)
        ttk.Button(top_frame, text="選擇結果", command=self.select_result).pack(
            side=tk.LEFT, padx=5
        )

        # Middle: speaker list (left) and segment list (right)
        mid_frame = ttk.Frame(self.root, padding="10")
        mid_frame.pack(fill=tk.BOTH, expand=True)

        left_frame = ttk.LabelFrame(mid_frame, text="📢 說話人列表", padding="10")
        left_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=False)

        self.speaker_listbox = tk.Listbox(
            left_frame, width=35, height=20, font=("Arial", 11)
        )
        self.speaker_listbox.pack(fill=tk.BOTH, expand=True)
        self.speaker_listbox.bind("<<ListboxSelect>>", self.on_speaker_select)

        right_frame = ttk.LabelFrame(mid_frame, text="🎵 語音片段", padding="10")
        right_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=10)

        # Segment list with a scrollbar
        list_frame = ttk.Frame(right_frame)
        list_frame.pack(fill=tk.BOTH, expand=True)

        scrollbar = ttk.Scrollbar(list_frame)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

        self.segment_listbox = tk.Listbox(
            list_frame,
            width=50,
            height=20,
            font=("Courier", 10),
            yscrollcommand=scrollbar.set,
        )
        self.segment_listbox.pack(fill=tk.BOTH, expand=True)
        scrollbar.config(command=self.segment_listbox.yview)

        self.segment_listbox.bind("<Double-Button-1>", self.on_segment_double_click)

        # Bottom: playback controls and progress
        bottom_frame = ttk.Frame(self.root, padding="10")
        bottom_frame.pack(fill=tk.X)

        control_frame = ttk.Frame(bottom_frame)
        control_frame.pack(fill=tk.X)

        # Play buttons start disabled; check_ready() enables them once both
        # the audio file and the result file are chosen.
        self.play_button = ttk.Button(
            control_frame, text="▶️ 播放所選", command=self.play_selected, width=15
        )
        self.play_button.pack(side=tk.LEFT, padx=5)
        self.play_button.config(state=tk.DISABLED)

        self.stop_button = ttk.Button(
            control_frame, text="⏹️ 停止", command=self.stop_playing, width=10
        )
        self.stop_button.pack(side=tk.LEFT, padx=5)
        self.stop_button.config(state=tk.DISABLED)

        self.play_all_button = ttk.Button(
            control_frame, text="▶️▶️ 播放全部", command=self.play_all, width=15
        )
        self.play_all_button.pack(side=tk.LEFT, padx=5)
        self.play_all_button.config(state=tk.DISABLED)

        # Progress bar
        progress_frame = ttk.Frame(bottom_frame)
        progress_frame.pack(fill=tk.X, pady=(10, 0))

        ttk.Label(progress_frame, text="⏱️ 進度:").pack(side=tk.LEFT)
        self.progress_bar = ttk.Progressbar(progress_frame, mode="determinate")
        self.progress_bar.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=10)

        self.progress_label = ttk.Label(progress_frame, text="0:00 / 0:00", width=20)
        self.progress_label.pack(side=tk.LEFT)

        # Status bar
        self.status_label = ttk.Label(
            bottom_frame, text="就緒", relief=tk.SUNKEN, anchor=tk.W
        )
        self.status_label.pack(fill=tk.X, pady=(10, 0))

    def select_audio(self):
        """Ask for the audio file and remember its path."""
        filename = filedialog.askopenfilename(
            title="選擇音頻文件",
            filetypes=[("WAV files", "*.wav"), ("All files", "*.*")],
        )
        if filename:
            self.audio_path = filename
            self.audio_label.config(text=Path(filename).name)
            self.check_ready()

    def select_result(self):
        """Ask for the ASRX result JSON and load it."""
        filename = filedialog.askopenfilename(
            title="選擇 ASRX 結果文件",
            filetypes=[("JSON files", "*.json"), ("All files", "*.*")],
        )
        if filename:
            self.result_path = filename
            self.result_label.config(text=Path(filename).name)
            self.load_result()
            self.check_ready()

    @staticmethod
    def _segment_duration(seg):
        """Return the segment's ``duration``, falling back to ``end - start``."""
        if "duration" in seg:
            return seg["duration"]
        return seg["end"] - seg["start"]

    @classmethod
    def _group_segments(cls, segments):
        """Group segments per speaker.

        Returns ``(grouped, speakers)``: ``grouped`` maps each speaker to its
        segments sorted by start time, and ``speakers`` lists the speakers by
        total speaking time, longest first.
        """
        grouped = {}
        for seg in segments:
            grouped.setdefault(seg["speaker"], []).append(seg)
        for segs in grouped.values():
            segs.sort(key=lambda s: s["start"])
        speakers = sorted(
            grouped,
            key=lambda name: sum(cls._segment_duration(s) for s in grouped[name]),
            reverse=True,
        )
        return grouped, speakers

    def load_result(self):
        """Load the ASRX result file and refresh the speaker list.

        Returns True on success. On failure an error dialog is shown, the
        result path is cleared and False is returned.
        """
        try:
            with open(self.result_path, "r", encoding="utf-8") as f:
                result_data = json.load(f)
            segments = result_data.get("segments", [])
            grouped, speakers = self._group_segments(segments)

            # Swap the state in only after parsing succeeded.
            self.result_data = result_data
            self.speaker_segments = grouped
            self.speakers = speakers
            # The previous selection refers to the old speaker list.
            self.current_speaker_idx = 0
            self.segment_listbox.delete(0, tk.END)

            self.speaker_listbox.delete(0, tk.END)
            for speaker in self.speakers:
                segs = self.speaker_segments[speaker]
                total_dur_min = sum(self._segment_duration(s) for s in segs) / 60
                self.speaker_listbox.insert(
                    tk.END,
                    f"🔊 {speaker:12} | {len(segs):4d}段 | {total_dur_min:5.1f}分鐘",
                )

            self.status_label.config(
                text=f"載入成功:{len(self.speakers)} 個說話人,{len(segments)} 個片段"
            )
            return True

        except Exception as e:
            # UI boundary: report any parse/format problem to the user.
            messagebox.showerror("錯誤", f"載入結果文件失敗:{e}")
            self.result_path = None
            self.result_label.config(text="載入失敗")
            return False

    def check_ready(self):
        """Enable the play buttons only when audio and result are both set."""
        if self.audio_path and self.result_path:
            self.status_label.config(text="✅ 就緒 - 請選擇說話人並播放")
            self.play_button.config(state=tk.NORMAL)
            self.play_all_button.config(state=tk.NORMAL)
        else:
            self.status_label.config(text="⚠️ 請選擇音頻和結果文件")
            self.play_button.config(state=tk.DISABLED)
            self.play_all_button.config(state=tk.DISABLED)

    def on_speaker_select(self, event):
        """Show the segments of the speaker picked in the speaker list."""
        selection = self.speaker_listbox.curselection()
        if not selection:
            return

        self.current_speaker_idx = selection[0]
        speaker = self.speakers[self.current_speaker_idx]
        segs = self.speaker_segments[speaker]

        self.segment_listbox.delete(0, tk.END)
        for i, seg in enumerate(segs, 1):
            start = seg["start"]
            end = seg["end"]
            duration = self._segment_duration(seg)
            self.segment_listbox.insert(
                tk.END,
                f"[{i:4d}] {speaker:12} | {start:7.2f}s - {end:7.2f}s ({duration:5.2f}s)",
            )

        self.status_label.config(text=f"選擇:{speaker} - {len(segs)} 個片段")

    def on_segment_double_click(self, event):
        """Play the double-clicked segment."""
        self.play_selected()

    def extract_and_play(self, start_sec: float, end_sec: float) -> bool:
        """Cut ``[start_sec, end_sec]`` out of the audio file and play it.

        Blocks until playback ends or is terminated by ``stop_playing``.
        Returns False when no audio file is set, no player or ffmpeg is
        available, extraction fails, or a stop was requested before playback.
        """
        import shutil

        # Look the player up on PATH instead of assuming /usr/bin.
        player = shutil.which("afplay") or shutil.which("aplay")
        if player is None or not self.audio_path:
            return False

        duration = end_sec - start_sec
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            temp_path = handle.name

        try:
            cmd = [
                "ffmpeg",
                "-y",
                "-loglevel",
                "quiet",
                "-i",
                self.audio_path,
                "-ss",
                str(start_sec),
                "-t",
                str(duration),
                "-acodec",
                "pcm_s16le",
                "-ar",
                "16000",
                "-ac",
                "1",
                temp_path,
            ]
            try:
                extraction = subprocess.run(cmd, capture_output=True)
            except OSError:
                # ffmpeg is not installed or not executable.
                return False
            if extraction.returncode != 0:
                return False

            # Stop may have been pressed while ffmpeg was running.
            if self.stop_flag:
                return False

            proc = subprocess.Popen(
                [player, temp_path],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            self._player_proc = proc
            try:
                proc.wait()
            finally:
                self._player_proc = None
            return True
        finally:
            if os.path.exists(temp_path):
                os.unlink(temp_path)

    def play_segment(self, speaker: str, seg: dict, seg_idx: int, total: int):
        """Play one segment and update status and progress.

        Returns True when the clip was played, False when playback was
        stopped or failed (a warning dialog is shown for real failures).
        """
        if self.stop_flag:
            return False

        start = seg["start"]
        end = seg["end"]

        status_text = f"▶️ {speaker} [{seg_idx}/{total}] {start:.2f}s - {end:.2f}s"
        progress = (seg_idx / total) * 100
        progress_text = f"{seg_idx}:{total}"

        # Widgets are updated through the Tk event loop, not from this thread.
        self.root.after(0, lambda: self.status_label.config(text=status_text))
        self.root.after(0, lambda: self.progress_bar.config(value=progress))
        self.root.after(0, lambda: self.progress_label.config(text=progress_text))

        if self.extract_and_play(start, end):
            return True

        # A user-requested stop is not an error worth a dialog.
        if not self.stop_flag:
            self.root.after(
                0,
                lambda: messagebox.showwarning(
                    "警告", f"播放失敗:{speaker} [{seg_idx}]"
                ),
            )
        return False

    def _begin_playback(self, job):
        """Switch the UI to playing state and run ``job`` in a daemon thread."""
        self.is_playing = True
        self.stop_flag = False
        self.play_button.config(state=tk.DISABLED)
        self.play_all_button.config(state=tk.DISABLED)
        self.stop_button.config(state=tk.NORMAL)

        def worker():
            try:
                job()
            finally:
                # Always hand control back, even if the job raised.
                self.root.after(0, self.on_play_done)

        threading.Thread(target=worker, daemon=True).start()

    def play_selected(self):
        """Play the selected segment, or all segments if none is selected."""
        if self.is_playing or not self.audio_path:
            return

        selection = self.segment_listbox.curselection()
        if not selection:
            # Nothing selected: fall back to playing the current speaker.
            if self.speakers:
                speaker = self.speakers[self.current_speaker_idx]
                if self.speaker_segments[speaker]:
                    self.play_all()
            return

        if not self.speakers:
            return

        seg_idx = selection[0]
        speaker = self.speakers[self.current_speaker_idx]
        segs = self.speaker_segments[speaker]
        seg = segs[seg_idx]

        # Report the position within the speaker's segments so the progress
        # bar stays within 0-100%.
        self._begin_playback(
            lambda: self.play_segment(speaker, seg, seg_idx + 1, len(segs))
        )

    def play_all(self):
        """Play every segment of the current speaker in start-time order."""
        if self.is_playing or not self.audio_path or not self.speakers:
            return

        speaker = self.speakers[self.current_speaker_idx]
        segs = self.speaker_segments[speaker]
        if not segs:
            return

        def job():
            for position, seg in enumerate(segs, 1):
                if self.stop_flag:
                    break
                self.play_segment(speaker, seg, position, len(segs))
                time.sleep(0.3)  # short gap between segments

        self._begin_playback(job)

    def stop_playing(self):
        """Request a stop and terminate the clip that is currently playing."""
        # The flag stays set until the next playback starts, so the worker
        # thread reliably sees it.
        self.stop_flag = True
        self.stop_button.config(state=tk.DISABLED)

        proc = getattr(self, "_player_proc", None)
        if proc is not None and proc.poll() is None:
            proc.terminate()

        # With a worker running, it calls on_play_done when it exits.
        if not self.is_playing:
            self.on_play_done()

    def on_play_done(self):
        """Restore the idle UI state after playback finished or was stopped."""
        was_stopped = self.stop_flag
        self.is_playing = False
        self.play_button.config(state=tk.NORMAL)
        self.play_all_button.config(state=tk.NORMAL)
        self.stop_button.config(state=tk.DISABLED)
        self.progress_bar.config(value=0)
        self.progress_label.config(text="0:00 / 0:00")

        if was_stopped:
            self.status_label.config(text="⏹️ 已停止")
        else:
            self.status_label.config(text="✅ 播放完成")
|
||||
|
||||
|
||||
def main():
    """Start the speaker player GUI, or print install help without tkinter."""
    if not HAS_TKINTER:
        print("❌ tkinter 未安裝")
        print("請使用以下命令安裝:")
        # Match the interpreter named in this script's shebang (python3.11).
        print(" brew install python-tk@3.11")
        return

    root = tk.Tk()
    app = SpeakerPlayerGUI(root)  # keep a reference while the loop runs
    root.mainloop()


if __name__ == "__main__":
    main()
|
||||
@@ -1,522 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Speaker Player GUI - 說話人語音播放器(Face 整合版)
|
||||
使用 tkinter 顯示播放進度、Speaker ID 和人臉信息
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import tempfile
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import tkinter as tk
|
||||
from tkinter import ttk, filedialog, messagebox
|
||||
|
||||
HAS_TKINTER = True
|
||||
except ImportError:
|
||||
HAS_TKINTER = False
|
||||
|
||||
|
||||
class SpeakerPlayerGUI:
|
||||
"""說話人語音播放器 GUI(Face 整合版)"""
|
||||
|
||||
def __init__(self, root):
|
||||
self.root = root
|
||||
self.root.title("🎬 Speaker Player - Face Integration")
|
||||
self.root.geometry("1200x800")
|
||||
|
||||
# 數據
|
||||
self.audio_path = None
|
||||
self.result_path = None
|
||||
self.face_path = None
|
||||
self.result_data = None
|
||||
self.face_data = None
|
||||
self.integrated_data = None
|
||||
self.speaker_segments = {}
|
||||
self.speakers = []
|
||||
self.current_speaker_idx = 0
|
||||
self.is_playing = False
|
||||
self.stop_flag = False
|
||||
|
||||
# 創建界面
|
||||
self.create_widgets()
|
||||
|
||||
def create_widgets(self):
|
||||
"""創建界面組件"""
|
||||
# 頂部:文件選擇
|
||||
top_frame = ttk.Frame(self.root, padding="10")
|
||||
top_frame.pack(fill=tk.X)
|
||||
|
||||
# 第一行:音頻和 ASRX 結果
|
||||
row1_frame = ttk.Frame(top_frame)
|
||||
row1_frame.pack(fill=tk.X)
|
||||
|
||||
ttk.Label(row1_frame, text="📁 Audio:").pack(side=tk.LEFT)
|
||||
self.audio_label = ttk.Label(row1_frame, text="未選擇", width=50)
|
||||
self.audio_label.pack(side=tk.LEFT, padx=5)
|
||||
ttk.Button(row1_frame, text="選擇音頻", command=self.select_audio).pack(
|
||||
side=tk.LEFT, padx=5
|
||||
)
|
||||
|
||||
ttk.Label(row1_frame, text=" 📊 ASRX:").pack(side=tk.LEFT, padx=(20, 0))
|
||||
self.result_label = ttk.Label(row1_frame, text="未選擇", width=50)
|
||||
self.result_label.pack(side=tk.LEFT, padx=5)
|
||||
ttk.Button(row1_frame, text="選擇結果", command=self.select_result).pack(
|
||||
side=tk.LEFT, padx=5
|
||||
)
|
||||
|
||||
# 第二行:Face 結果
|
||||
row2_frame = ttk.Frame(top_frame)
|
||||
row2_frame.pack(fill=tk.X, pady=(5, 0))
|
||||
|
||||
ttk.Label(row2_frame, text="👤 Face:").pack(side=tk.LEFT)
|
||||
self.face_label = ttk.Label(row2_frame, text="未選擇 (可選)", width=50)
|
||||
self.face_label.pack(side=tk.LEFT, padx=5)
|
||||
ttk.Button(row2_frame, text="選擇 Face", command=self.select_face).pack(
|
||||
side=tk.LEFT, padx=5
|
||||
)
|
||||
self.integrate_button = ttk.Button(
|
||||
row2_frame,
|
||||
text="🔗 整合 Face",
|
||||
command=self.integrate_face,
|
||||
state=tk.DISABLED,
|
||||
)
|
||||
self.integrate_button.pack(side=tk.LEFT, padx=5)
|
||||
|
||||
# 中間:說話人列表和片段列表
|
||||
mid_frame = ttk.Frame(self.root, padding="10")
|
||||
mid_frame.pack(fill=tk.BOTH, expand=True)
|
||||
|
||||
# 左側:說話人列表(帶 Face 統計)
|
||||
left_frame = ttk.LabelFrame(mid_frame, text="📢 說話人列表", padding="10")
|
||||
left_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=False)
|
||||
|
||||
self.speaker_listbox = tk.Listbox(
|
||||
left_frame, width=45, height=20, font=("Arial", 11)
|
||||
)
|
||||
self.speaker_listbox.pack(fill=tk.BOTH, expand=True)
|
||||
self.speaker_listbox.bind("<<ListboxSelect>>", self.on_speaker_select)
|
||||
|
||||
# 右側:片段列表(帶 Face 信息)
|
||||
right_frame = ttk.LabelFrame(
|
||||
mid_frame, text="🎵 語音片段 + 👥 人臉", padding="10"
|
||||
)
|
||||
right_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=10)
|
||||
|
||||
# 片段列表(带滚动条)
|
||||
list_frame = ttk.Frame(right_frame)
|
||||
list_frame.pack(fill=tk.BOTH, expand=True)
|
||||
|
||||
scrollbar = ttk.Scrollbar(list_frame)
|
||||
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
|
||||
|
||||
self.segment_listbox = tk.Listbox(
|
||||
list_frame,
|
||||
width=65,
|
||||
height=20,
|
||||
font=("Courier", 9),
|
||||
yscrollcommand=scrollbar.set,
|
||||
)
|
||||
self.segment_listbox.pack(fill=tk.BOTH, expand=True)
|
||||
scrollbar.config(command=self.segment_listbox.yview)
|
||||
|
||||
self.segment_listbox.bind("<Double-Button-1>", self.on_segment_double_click)
|
||||
|
||||
# 底部:播放控制和進度
|
||||
bottom_frame = ttk.Frame(self.root, padding="10")
|
||||
bottom_frame.pack(fill=tk.X)
|
||||
|
||||
# 播放控制
|
||||
control_frame = ttk.Frame(bottom_frame)
|
||||
control_frame.pack(fill=tk.X)
|
||||
|
||||
self.play_button = ttk.Button(
|
||||
control_frame, text="▶️ 播放所選", command=self.play_selected, width=15
|
||||
)
|
||||
self.play_button.pack(side=tk.LEFT, padx=5)
|
||||
self.play_button.config(state=tk.DISABLED)
|
||||
|
||||
self.stop_button = ttk.Button(
|
||||
control_frame, text="⏹️ 停止", command=self.stop_playing, width=10
|
||||
)
|
||||
self.stop_button.pack(side=tk.LEFT, padx=5)
|
||||
self.stop_button.config(state=tk.DISABLED)
|
||||
|
||||
self.play_all_button = ttk.Button(
|
||||
control_frame, text="▶️▶️ 播放全部", command=self.play_all, width=15
|
||||
)
|
||||
self.play_all_button.pack(side=tk.LEFT, padx=5)
|
||||
self.play_all_button.config(state=tk.DISABLED)
|
||||
|
||||
# 進度條
|
||||
progress_frame = ttk.Frame(bottom_frame)
|
||||
progress_frame.pack(fill=tk.X, pady=(10, 0))
|
||||
|
||||
ttk.Label(progress_frame, text="⏱️ 進度:").pack(side=tk.LEFT)
|
||||
self.progress_bar = ttk.Progressbar(progress_frame, mode="determinate")
|
||||
self.progress_bar.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=10)
|
||||
|
||||
self.progress_label = ttk.Label(progress_frame, text="0:00 / 0:00", width=20)
|
||||
self.progress_label.pack(side=tk.LEFT)
|
||||
|
||||
# 狀態欄
|
||||
self.status_label = ttk.Label(
|
||||
bottom_frame, text="就緒", relief=tk.SUNKEN, anchor=tk.W
|
||||
)
|
||||
self.status_label.pack(fill=tk.X, pady=(10, 0))
|
||||
|
||||
def select_audio(self):
|
||||
"""選擇音頻文件"""
|
||||
filename = filedialog.askopenfilename(
|
||||
title="選擇音頻文件",
|
||||
filetypes=[("WAV files", "*.wav"), ("All files", "*.*")],
|
||||
)
|
||||
if filename:
|
||||
self.audio_path = filename
|
||||
self.audio_label.config(text=Path(filename).name)
|
||||
self.check_ready()
|
||||
|
||||
def select_result(self):
|
||||
"""選擇 ASRX 結果文件"""
|
||||
filename = filedialog.askopenfilename(
|
||||
title="選擇 ASRX 結果文件",
|
||||
filetypes=[("JSON files", "*.json"), ("All files", "*.*")],
|
||||
)
|
||||
if filename:
|
||||
self.result_path = filename
|
||||
self.result_label.config(text=Path(filename).name)
|
||||
self.load_result()
|
||||
self.check_ready()
|
||||
|
||||
def select_face(self):
|
||||
"""選擇 Face 結果文件"""
|
||||
filename = filedialog.askopenfilename(
|
||||
title="選擇 Face 檢測結果",
|
||||
filetypes=[("JSON files", "*.json"), ("All files", "*.*")],
|
||||
)
|
||||
if filename:
|
||||
self.face_path = filename
|
||||
self.face_label.config(text=Path(filename).name)
|
||||
self.integrate_button.config(state=tk.NORMAL)
|
||||
self.status_label.config(text="✅ Face 已選擇 - 請點擊整合")
|
||||
|
||||
def integrate_face(self):
|
||||
"""整合 Face 與 ASRX"""
|
||||
if not self.face_path or not self.result_path:
|
||||
messagebox.showwarning("警告", "請先選擇 Face 和 ASRX 文件")
|
||||
return
|
||||
|
||||
self.status_label.config(text="🔄 整合中...")
|
||||
self.root.update()
|
||||
|
||||
try:
|
||||
# 載入 Face 數據
|
||||
with open(self.face_path, "r", encoding="utf-8") as f:
|
||||
self.face_data = json.load(f)
|
||||
|
||||
# 重新載入 ASRX 數據並整合
|
||||
self.load_result(integrate_with_face=True)
|
||||
|
||||
self.status_label.config(text="✅ Face 整合完成")
|
||||
self.integrate_button.config(state=tk.DISABLED)
|
||||
|
||||
except Exception as e:
|
||||
messagebox.showerror("錯誤", f"整合失敗:{e}")
|
||||
self.status_label.config(text="❌ 整合失敗")
|
||||
|
||||
def load_result(self, integrate_with_face=False):
|
||||
"""載入 ASRX 結果"""
|
||||
try:
|
||||
with open(self.result_path, "r", encoding="utf-8") as f:
|
||||
self.result_data = json.load(f)
|
||||
|
||||
# 分組
|
||||
self.speaker_segments = {}
|
||||
for seg in self.result_data.get("segments", []):
|
||||
speaker = seg["speaker"]
|
||||
if speaker not in self.speaker_segments:
|
||||
self.speaker_segments[speaker] = []
|
||||
self.speaker_segments[speaker].append(seg)
|
||||
|
||||
# 排序
|
||||
for speaker in self.speaker_segments:
|
||||
self.speaker_segments[speaker].sort(key=lambda x: x["start"])
|
||||
|
||||
# 說話人列表(按時長排序)
|
||||
self.speakers = sorted(
|
||||
self.speaker_segments.keys(),
|
||||
key=lambda s: sum(seg["duration"] for seg in self.speaker_segments[s]),
|
||||
reverse=True,
|
||||
)
|
||||
|
||||
# 更新列表框
|
||||
self.speaker_listbox.delete(0, tk.END)
|
||||
for speaker in self.speakers:
|
||||
segs = self.speaker_segments[speaker]
|
||||
total_dur = sum(seg["duration"] for seg in segs)
|
||||
total_dur_min = total_dur / 60
|
||||
|
||||
# 如果有 Face 數據,計算有人臉的片段數
|
||||
face_info = ""
|
||||
if integrate_with_face and self.integrated_data:
|
||||
speaker_integrated = [
|
||||
item
|
||||
for item in self.integrated_data
|
||||
if item["speaker"] == speaker
|
||||
]
|
||||
with_face = sum(
|
||||
1 for item in speaker_integrated if item.get("has_face", False)
|
||||
)
|
||||
face_info = f" | 👥 {with_face}/{len(segs)}"
|
||||
|
||||
self.speaker_listbox.insert(
|
||||
tk.END,
|
||||
f"🔊 {speaker:12} | {len(segs):4d}段 | {total_dur_min:5.1f}分鐘{face_info}",
|
||||
)
|
||||
|
||||
total_segments = len(self.result_data.get("segments", []))
|
||||
self.status_label.config(
|
||||
text=f"載入成功:{len(self.speakers)} 個說話人,{total_segments} 個片段"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
messagebox.showerror("錯誤", f"載入結果文件失敗:{e}")
|
||||
self.result_path = None
|
||||
self.result_label.config(text="載入失敗")
|
||||
|
||||
def check_ready(self):
|
||||
"""檢查是否就緒"""
|
||||
if self.audio_path and self.result_path:
|
||||
self.status_label.config(text="✅ 就緒 - 請選擇說話人並播放")
|
||||
self.play_button.config(state=tk.NORMAL)
|
||||
self.play_all_button.config(state=tk.NORMAL)
|
||||
else:
|
||||
self.status_label.config(text="⚠️ 請選擇音頻和結果文件")
|
||||
self.play_button.config(state=tk.DISABLED)
|
||||
self.play_all_button.config(state=tk.DISABLED)
|
||||
|
||||
def on_speaker_select(self, event):
|
||||
"""說話人選擇事件"""
|
||||
selection = self.speaker_listbox.curselection()
|
||||
if not selection:
|
||||
return
|
||||
|
||||
self.current_speaker_idx = selection[0]
|
||||
speaker = self.speakers[self.current_speaker_idx]
|
||||
|
||||
# 更新片段列表
|
||||
self.segment_listbox.delete(0, tk.END)
|
||||
for i, seg in enumerate(self.speaker_segments[speaker], 1):
|
||||
start = seg["start"]
|
||||
end = seg["end"]
|
||||
duration = seg["duration"]
|
||||
|
||||
# 如果有整合 Face 數據
|
||||
face_info = ""
|
||||
if self.integrated_data:
|
||||
matching = [
|
||||
item
|
||||
for item in self.integrated_data
|
||||
if abs(item["start"] - start) < 0.1 and item["speaker"] == speaker
|
||||
]
|
||||
if matching and matching[0].get("has_face", False):
|
||||
face_info = " 👥✅"
|
||||
elif matching:
|
||||
face_info = " 👥❌"
|
||||
|
||||
self.segment_listbox.insert(
|
||||
tk.END,
|
||||
f"[{i:4d}] {speaker:12} | {start:7.2f}s - {end:7.2f}s ({duration:5.2f}s){face_info}",
|
||||
)
|
||||
|
||||
self.status_label.config(
|
||||
text=f"選擇:{speaker} - {len(self.speaker_segments[speaker])} 個片段"
|
||||
)
|
||||
|
||||
def on_segment_double_click(self, event):
|
||||
"""片段雙擊事件"""
|
||||
self.play_selected()
|
||||
|
||||
def extract_and_play(self, start_sec: float, end_sec: float) -> bool:
|
||||
"""提取並播放音頻"""
|
||||
duration = end_sec - start_sec
|
||||
temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
||||
temp_path = temp_file.name
|
||||
temp_file.close()
|
||||
|
||||
try:
|
||||
# 提取
|
||||
cmd = [
|
||||
"ffmpeg",
|
||||
"-y",
|
||||
"-loglevel",
|
||||
"quiet",
|
||||
"-i",
|
||||
self.audio_path,
|
||||
"-ss",
|
||||
str(start_sec),
|
||||
"-t",
|
||||
str(duration),
|
||||
"-acodec",
|
||||
"pcm_s16le",
|
||||
"-ar",
|
||||
"16000",
|
||||
"-ac",
|
||||
"1",
|
||||
temp_path,
|
||||
]
|
||||
|
||||
result = subprocess.run(cmd, capture_output=True)
|
||||
if result.returncode != 0:
|
||||
return False
|
||||
|
||||
# 播放
|
||||
if os.path.exists("/usr/bin/afplay"):
|
||||
subprocess.run(["afplay", temp_path], capture_output=True)
|
||||
elif os.path.exists("/usr/bin/aplay"):
|
||||
subprocess.run(["aplay", temp_path], capture_output=True)
|
||||
else:
|
||||
return False
|
||||
|
||||
return True
|
||||
finally:
|
||||
if os.path.exists(temp_path):
|
||||
os.unlink(temp_path)
|
||||
|
||||
def play_segment(self, speaker: str, seg: dict, seg_idx: int, total: int):
|
||||
"""播放單個片段"""
|
||||
if self.stop_flag:
|
||||
return False
|
||||
|
||||
start = seg["start"]
|
||||
end = seg["end"]
|
||||
duration = seg["duration"]
|
||||
|
||||
# 更新 UI
|
||||
self.root.after(
|
||||
0,
|
||||
lambda: self.status_label.config(
|
||||
text=f"▶️ {speaker} [{seg_idx}/{total}] {start:.2f}s - {end:.2f}s"
|
||||
),
|
||||
)
|
||||
|
||||
# 更新進度
|
||||
progress = (seg_idx / total) * 100
|
||||
self.root.after(0, lambda: self.progress_bar.config(value=progress))
|
||||
self.root.after(
|
||||
0, lambda: self.progress_label.config(text=f"{seg_idx}:{total}")
|
||||
)
|
||||
|
||||
# 播放
|
||||
if self.extract_and_play(start, end):
|
||||
return True
|
||||
else:
|
||||
self.root.after(
|
||||
0,
|
||||
lambda: messagebox.showwarning(
|
||||
"警告", f"播放失敗:{speaker} [{seg_idx}]"
|
||||
),
|
||||
)
|
||||
return True
|
||||
|
||||
def play_selected(self):
|
||||
"""播放所選片段"""
|
||||
selection = self.segment_listbox.curselection()
|
||||
if not selection:
|
||||
# 如果沒選擇,播放第一個
|
||||
if self.speakers:
|
||||
speaker = self.speakers[self.current_speaker_idx]
|
||||
segs = self.speaker_segments[speaker]
|
||||
if segs:
|
||||
self.play_all()
|
||||
return
|
||||
|
||||
# 播放所選
|
||||
seg_idx = selection[0]
|
||||
speaker = self.speakers[self.current_speaker_idx]
|
||||
seg = self.speaker_segments[speaker][seg_idx]
|
||||
|
||||
self.is_playing = True
|
||||
self.stop_flag = False
|
||||
self.play_button.config(state=tk.DISABLED)
|
||||
self.stop_button.config(state=tk.NORMAL)
|
||||
|
||||
# 在後台線程播放
|
||||
def play_thread():
|
||||
success = self.play_segment(speaker, seg, seg_idx + 1, 1)
|
||||
self.root.after(0, lambda: self.on_play_done())
|
||||
|
||||
thread = threading.Thread(target=play_thread, daemon=True)
|
||||
thread.start()
|
||||
|
||||
def play_all(self):
    """Play every segment of the current speaker, in order, off the UI thread.

    Returns immediately when there are no speakers or the current speaker has
    no segments. Otherwise the transport buttons are switched to their
    "playing" state and a daemon thread walks the queue, pausing 0.3 s after
    each segment, until the queue ends or ``stop_flag`` is raised.
    """
    if not self.speakers:
        return

    current = self.speakers[self.current_speaker_idx]
    queue = self.speaker_segments[current]
    if not queue:
        return

    self.is_playing, self.stop_flag = True, False
    for button in (self.play_button, self.play_all_button):
        button.config(state=tk.DISABLED)
    self.stop_button.config(state=tk.NORMAL)

    def _run_queue():
        for position, segment in enumerate(queue, 1):
            if self.stop_flag:
                break
            self.play_segment(current, segment, position, len(queue))
            time.sleep(0.3)  # short gap between segments

        # Hand the control reset back to the Tk event loop.
        self.root.after(0, lambda: self.on_play_done())

    worker = threading.Thread(target=_run_queue, daemon=True)
    worker.start()
|
||||
|
||||
def stop_playing(self):
    """Request that playback stop and reset the controls right away.

    Marks the player as idle, raises ``stop_flag`` and then runs
    ``on_play_done()`` synchronously.
    """
    self.is_playing = False
    self.stop_flag = True
    self.on_play_done()
|
||||
|
||||
def on_play_done(self):
    """Reset the transport controls and report how playback ended.

    Re-enables both play buttons, disables the stop button, clears the
    progress widgets and sets the status line to "stopped" or "finished"
    depending on whether ``stop_flag`` is raised.

    ``stop_flag`` is deliberately left untouched here. Clearing it (as this
    method used to do before testing it) made the "stopped" branch
    unreachable, and also erased a stop request before the playback thread
    could see it. ``play_selected`` and ``play_all`` clear the flag
    themselves when a new playback starts.
    """
    stopped = self.stop_flag

    self.is_playing = False
    self.play_button.config(state=tk.NORMAL)
    self.play_all_button.config(state=tk.NORMAL)
    self.stop_button.config(state=tk.DISABLED)
    self.progress_bar.config(value=0)
    self.progress_label.config(text="0:00 / 0:00")

    if stopped:
        self.status_label.config(text="⏹️ 已停止")
    else:
        self.status_label.config(text="✅ 播放完成")
|
||||
|
||||
|
||||
def main():
    """Start the speaker-player GUI, or print install help if tkinter is missing."""
    if HAS_TKINTER:
        root = tk.Tk()
        # Keep a reference to the application object while the loop runs.
        app = SpeakerPlayerGUI(root)
        root.mainloop()
        return

    print("❌ tkinter 未安裝")
    print("請使用以下命令安裝:")
    print(" brew install python-tk@3.9")
|
||||
|
||||
|
||||
# Launch the GUI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
@@ -1,267 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Interactive Speaker Audio Player - 交互式說話人語音播放器
|
||||
可以選擇播放哪個說話人的哪些片段
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import tempfile
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
|
||||
|
||||
def load_asrx_result(result_path: str) -> Dict:
    """Read the UTF-8 JSON file at *result_path* and return the decoded object."""
    raw = Path(result_path).read_text(encoding="utf-8")
    return json.loads(raw)
|
||||
|
||||
|
||||
def extract_and_play(audio_path: str, start_sec: float, end_sec: float) -> bool:
    """Cut one time range out of *audio_path* with ffmpeg and play it.

    The range is written to a temporary 16 kHz mono 16-bit WAV file, played
    with ``afplay`` (preferred) or ``aplay``, and the file is removed again
    whatever happens.

    Args:
        audio_path: Source audio file handed to ffmpeg.
        start_sec: Start of the range in seconds.
        end_sec: End of the range in seconds.

    Returns:
        ``True`` when extraction and playback both succeeded. ``False`` for an
        empty or negative range, a missing ffmpeg, a failed extraction, a
        missing player, or a player that exited with an error.
    """
    import shutil  # only needed here; keeps the file's import block untouched

    duration = end_sec - start_sec
    if duration <= 0:
        return False

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_path = temp_file.name

    try:
        cmd = [
            "ffmpeg",
            "-y",
            "-loglevel",
            "quiet",
            "-i",
            audio_path,
            "-ss",
            str(start_sec),
            "-t",
            str(duration),
            "-acodec",
            "pcm_s16le",
            "-ar",
            "16000",
            "-ac",
            "1",
            temp_path,
        ]
        try:
            result = subprocess.run(cmd, capture_output=True)
        except FileNotFoundError:
            # ffmpeg is not installed: report failure instead of crashing.
            print(" ⚠️ ffmpeg not found")
            return False
        if result.returncode != 0:
            return False

        # Look the player up on PATH. The previous check tested the fixed
        # /usr/bin location but then invoked the player by bare name.
        player = shutil.which("afplay") or shutil.which("aplay")
        if player is None:
            print(" ⚠️ No audio player found")
            return False

        played = subprocess.run([player, temp_path], capture_output=True)
        return played.returncode == 0
    finally:
        if os.path.exists(temp_path):
            os.unlink(temp_path)
|
||||
|
||||
|
||||
def show_menu(speaker_segments: Dict[str, List[Dict]], speaker_id: str):
    """Print the summary, the first 20 segments and the command help for a speaker.

    Args:
        speaker_segments: Mapping of speaker id to its list of segment dicts;
            each segment is read for ``"start"``, ``"end"`` and ``"duration"``.
        speaker_id: Key of the speaker to display.
    """
    rule = "=" * 70
    segs = speaker_segments[speaker_id]
    count = len(segs)
    total_duration = sum(entry["duration"] for entry in segs)

    print("\n" + rule)
    print(f"🔊 {speaker_id}")
    print(rule)
    print(f" Segments: {len(segs)}")
    print(
        f" Total duration: {total_duration / 60:.1f} minutes ({total_duration:.1f}s)"
    )
    print(rule)

    # Only the first 20 segments are listed; the remainder is summarised.
    for i, seg in enumerate(segs[:20], 1):
        start, end, duration = seg["start"], seg["end"], seg["duration"]
        print(
            f" [{i:3d}] {speaker_id:12} | {start:7.2f}s - {end:7.2f}s ({duration:5.2f}s)"
        )
    if count > 20:
        print(f" ... and {len(segs) - 20} more segments")

    print("\n" + rule)
    print("Commands:")
    print(f" [1-{min(20, len(segs))}] Play specific segment")
    for help_line in (
        " all Play all segments (may take a while)",
        " first N Play first N segments",
        " next Next speaker",
        " prev Previous speaker",
        " list List all speakers",
        " quit Exit",
    ):
        print(help_line)
    print(rule)
|
||||
|
||||
|
||||
def _play_segment_batch(audio_path, speaker, segs, total):
    """Play *segs* in order, printing one numbered status line per segment."""
    print("=" * 70)
    for i, seg in enumerate(segs, 1):
        print(
            f" [{i:3d}/{total}] {speaker} | "
            + f"{seg['start']:7.2f}s - {seg['end']:7.2f}s ({seg['duration']:5.2f}s)",
            end="",
            flush=True,
        )
        if extract_and_play(audio_path, seg["start"], seg["end"]):
            print(" ✅")
        else:
            print(" ❌")
    print("=" * 70)


def interactive_player(audio_path: str, result_path: str):
    """Run the text-mode speaker player until the user quits.

    Loads the ASRX result, groups its segments per speaker (each group sorted
    by start time, speakers ordered by total speaking time, longest first)
    and then reads commands from stdin: a segment number, ``all``,
    ``first N``, ``next``, ``prev``, ``list`` or ``quit``/``exit``/``q``.

    Args:
        audio_path: Audio file the segments are cut from.
        result_path: Path of the ASRX result JSON.
    """
    result = load_asrx_result(result_path)
    segments = result.get("segments", [])
    total_duration = result.get("total_duration", 0)

    # Group by speaker, then order each group chronologically.
    speaker_segments = {}
    for seg in segments:
        speaker_segments.setdefault(seg["speaker"], []).append(seg)
    for segs in speaker_segments.values():
        segs.sort(key=lambda x: x["start"])

    speakers = sorted(
        speaker_segments.keys(),
        key=lambda s: sum(seg["duration"] for seg in speaker_segments[s]),
        reverse=True,
    )

    current_speaker_idx = 0

    print("\n🎬 Speaker Audio Player")
    print(f"📁 Audio: {audio_path}")
    print(f"📊 Speakers: {len(speakers)}")
    print(f"{'=' * 70}")

    if not speakers:
        # Without this guard the loop below fails with IndexError on speakers[0].
        print(" No segments found in result file")
        return

    while True:
        current_speaker = speakers[current_speaker_idx]
        current_segs = speaker_segments[current_speaker]
        show_menu(speaker_segments, current_speaker)

        try:
            cmd = input(f"\n▶️ {current_speaker} > ").strip().lower()
        except (EOFError, KeyboardInterrupt):
            print("\n\nExiting...")
            break

        if not cmd:
            continue

        # Play one specific segment (1-based number).
        if cmd.isdigit():
            idx = int(cmd) - 1
            if 0 <= idx < len(current_segs):
                seg = current_segs[idx]
                print(f"\n 🔊 {current_speaker} - Segment {idx + 1}")
                print(
                    f" ⏱️ {seg['start']:.2f}s - {seg['end']:.2f}s ({seg['duration']:.2f}s)"
                )
                print(" ▶️ Playing...", end="", flush=True)
                if extract_and_play(audio_path, seg["start"], seg["end"]):
                    print(" ✅ Done")
                else:
                    print(" ❌ Failed")
            else:
                print(f" Invalid segment number (1-{len(current_segs)})")

        # Play every segment of the current speaker.
        elif cmd == "all":
            print(
                f"\n 🔊 {current_speaker} - Playing all {len(current_segs)} segments..."
            )
            _play_segment_batch(
                audio_path, current_speaker, current_segs, len(current_segs)
            )

        # Play the first N segments.
        elif cmd.startswith("first "):
            # Only the number parsing can fail here, so only that is guarded.
            try:
                n = int(cmd.split()[1])
            except (IndexError, ValueError):
                print(" Usage: first N")
                continue
            print(f"\n 🔊 {current_speaker} - Playing first {n} segments...")
            _play_segment_batch(audio_path, current_speaker, current_segs[:n], n)

        # Next / previous speaker, wrapping around.
        elif cmd == "next":
            current_speaker_idx = (current_speaker_idx + 1) % len(speakers)

        elif cmd == "prev":
            current_speaker_idx = (current_speaker_idx - 1) % len(speakers)

        # List every speaker with its share of the total duration.
        elif cmd == "list":
            print(f"\n{'=' * 70}")
            print("📢 All speakers:")
            print(f"{'=' * 70}")
            for i, speaker in enumerate(speakers, 1):
                segs = speaker_segments[speaker]
                total_dur = sum(seg["duration"] for seg in segs)
                pct = total_dur / total_duration * 100 if total_duration > 0 else 0
                print(
                    f" {i:2d}. 🔊 {speaker:12} | {len(segs):4d} segments, "
                    + f"{total_dur:7.1f}s ({pct:5.1f}%)"
                )
            print(f"{'=' * 70}")
            print(f" Current: 🔊 {speakers[current_speaker_idx]}")
            print(f"{'=' * 70}")

        elif cmd in ("quit", "exit", "q"):
            print("\nExiting...")
            break

        else:
            print(f" Unknown command: {cmd}")
|
||||
|
||||
|
||||
def main():
    """Parse the two command-line paths and start the interactive player.

    Prints an error and returns without starting the player when either the
    audio file or the result file does not exist.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Interactive Speaker Audio Player")
    parser.add_argument("audio_path", help="原始音頻文件路徑")
    parser.add_argument("result_path", help="ASRX 結果 JSON 路徑")
    args = parser.parse_args()

    audio_file = Path(args.audio_path)
    if not audio_file.exists():
        print(f"Error: Audio file not found: {args.audio_path}")
        return

    result_file = Path(args.result_path)
    if not result_file.exists():
        print(f"Error: Result file not found: {args.result_path}")
        return

    interactive_player(args.audio_path, args.result_path)
|
||||
|
||||
|
||||
# Start the command-line player only when this file is run as a script.
if __name__ == "__main__":
    main()
|
||||
@@ -1,164 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
GUI Face Player 自動化測試腳本
|
||||
測試所有功能並生成測試報告
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def check_file_exists(path, description):
    """Print a pass/fail line with the file size in MB and return whether *path* exists."""
    target = Path(path)
    exists = target.exists()
    if exists:
        status = "✅"
        size = target.stat().st_size / 1024 / 1024
    else:
        status = "❌"
        size = 0
    print(f"{status} {description}: {path} ({size:.1f} MB)")
    return exists
|
||||
|
||||
|
||||
def check_process_running(pattern):
    """Report via ``pgrep -f`` whether a process matching *pattern* is running.

    Prints one status line and returns ``True`` when pgrep exits with 0.
    """
    probe = subprocess.run(['pgrep', '-f', pattern], capture_output=True, text=True)
    running = probe.returncode == 0
    if running:
        status = "✅"
    else:
        status = "❌"
    print(f"{status} 進程:{pattern} ({'運行中' if running else '未運行'})")
    return running
|
||||
|
||||
|
||||
def test_json_structure(path, required_keys, description):
    """Check that the JSON document at *path* contains every key in *required_keys*.

    Prints one status line. Any error while opening, decoding or inspecting
    the file is printed and counted as a failure.

    Returns:
        ``True`` when no required key is missing, ``False`` otherwise.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        missing_keys = []
        for key in required_keys:
            if key not in data:
                missing_keys.append(key)

        if not missing_keys:
            print(f"✅ {description}: 結構正確")
            return True

        print(f"❌ {description}: 缺少鍵 {missing_keys}")
        return False
    except Exception as e:
        print(f"❌ {description}: {e}")
        return False
|
||||
|
||||
|
||||
def test_integration_script():
    """Run the Face + ASRX integration script and check its reported match rate.

    The script is started with fixed input files under ``/tmp`` and a
    120-second timeout. The check passes when the literal text ``99.8%``
    appears in its standard output.

    Returns:
        ``True`` when the expected match rate is found. ``False`` otherwise,
        including when the script cannot be started or times out.
    """
    print("\n" + "="*70)
    print("測試整合腳本")
    print("="*70)

    cmd = [
        'python3',
        'integrate_face_asrx_speaker.py',
        '/tmp/face_long.json',
        '/tmp/asrx_charade_optimized.json',
        '--threshold', '3.0',
        '--stats'
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    except (OSError, subprocess.TimeoutExpired) as exc:
        # A missing interpreter or a hung script is a failed check, not a
        # reason to abort the remaining tests with a traceback.
        print(f"❌ 整合腳本:執行失敗 ({exc})")
        return False

    if '99.8%' in result.stdout:
        print("✅ 整合腳本:匹配率正確 (99.8%)")
        return True

    print("❌ 整合腳本:匹配率異常")
    print(result.stdout)
    return False
|
||||
|
||||
|
||||
def test_gui_startup():
    """Check that a ``speaker_player_gui_face`` process is currently running.

    Returns:
        The result of ``check_process_running`` for that pattern.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("測試 GUI 啟動")
    print(rule)

    if check_process_running('speaker_player_gui_face'):
        print("✅ GUI 進程:正常運行")
        return True

    print("❌ GUI 進程:未運行")
    return False
|
||||
|
||||
|
||||
def main():
    """Run every check of the GUI face-player test suite and print a summary.

    All checks are always executed; none is skipped after an earlier failure.

    Returns:
        ``True`` when the file, JSON structure, integration script and GUI
        checks all passed.
    """
    rule = "=" * 70

    print(rule)
    print("GUI Face Player 自動化測試")
    print(rule)

    # Data files
    print("\n" + rule)
    print("測試文件")
    print(rule)
    file_checks = [
        check_file_exists('/tmp/charade_audio.wav', '音頻文件'),
        check_file_exists('/tmp/asrx_charade_optimized.json', 'ASRX 結果'),
        check_file_exists('/tmp/face_long.json', 'Face 結果'),
        check_file_exists('/tmp/charade_integrated.json', '整合結果'),
    ]
    files_ok = all(file_checks)

    # JSON structure
    print("\n" + rule)
    print("測試 JSON 結構")
    print(rule)
    json_checks = [
        test_json_structure(
            '/tmp/asrx_charade_optimized.json',
            ['segments', 'n_speakers'],
            'ASRX 結果'
        ),
        test_json_structure(
            '/tmp/face_long.json',
            ['frames', 'frame_count'],
            'Face 結果'
        ),
        test_json_structure(
            '/tmp/charade_integrated.json',
            ['integrated_segments', 'speaker_stats'],
            '整合結果'
        ),
    ]
    json_ok = all(json_checks)

    integration_ok = test_integration_script()
    gui_ok = test_gui_startup()

    # Summary
    print("\n" + rule)
    print("測試總結")
    print(rule)

    all_ok = files_ok and json_ok and integration_ok and gui_ok

    if all_ok:
        print("✅ 所有測試通過!")
    else:
        print("❌ 部分測試失敗")
        if not files_ok:
            print(" - 文件測試失敗")
        if not json_ok:
            print(" - JSON 結構測試失敗")
        if not integration_ok:
            print(" - 整合腳本測試失敗")
        if not gui_ok:
            print(" - GUI 啟動測試失敗")

    print("\n" + rule)

    return all_ok
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Exit status 0 when every check passed, 1 otherwise. SystemExit is raised
    # directly because the bare exit() helper is injected by the site module
    # for interactive use and is not guaranteed to exist (e.g. `python -S`).
    success = main()
    raise SystemExit(0 if success else 1)
|
||||
@@ -1,240 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
長影片(Charade 1963,114 分鐘)完整測試腳本
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def print_header(title):
    """Print *title* framed by two 70-character rules, preceded by a blank line."""
    rule = "=" * 70
    print("\n" + rule, f" {title}", rule, sep="\n")
|
||||
|
||||
|
||||
def test_data_files():
    """Check that the four fixed input/output files under ``/tmp`` exist.

    Prints one line per file with its size in MB.

    Returns:
        ``True`` only when every file exists.
    """
    print_header("1. 數據文件測試")

    files = {
        '音頻文件': '/tmp/charade_audio.wav',
        'ASRX 結果': '/tmp/asrx_charade_optimized.json',
        'Face 結果': '/tmp/face_long.json',
        '整合結果': '/tmp/charade_integrated.json'
    }

    missing = 0
    for name, path in files.items():
        target = Path(path)
        exists = target.exists()
        if exists:
            size = target.stat().st_size / 1024 / 1024
            status = "✅"
        else:
            size = 0
            status = "❌"
            missing += 1
        print(f"{status} {name}: {size:.1f} MB")

    return missing == 0
|
||||
|
||||
|
||||
def test_asrx_results():
    """Print a summary of the ASRX result file and sanity-check its size.

    Reads ``/tmp/asrx_charade_optimized.json`` and prints the total duration,
    the speaker count, the speech-segment count and a per-speaker breakdown
    ordered by speaking time, longest first.

    Returns:
        ``True`` when there are at least 2 speakers and more than 100 speech
        segments.
    """
    print_header("2. ASRX 結果測試")

    with open('/tmp/asrx_charade_optimized.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    total_duration = data.get('total_duration', 0)
    n_speakers = data.get('n_speakers', 0)
    n_segments = data.get('n_speech_segments', 0)

    print(f"📊 影片時長:{total_duration/60:.1f} 分鐘 ({total_duration:.1f}秒)")
    print(f" 說話人數量:{n_speakers}")
    print(f"📊 語音片段:{n_segments}")

    print("\n📢 說話人分佈:")
    speaker_stats = data.get('speaker_stats', {})
    # The sort key uses .get() like the loop body does, so an entry without
    # 'duration' sorts as 0 instead of raising KeyError.
    ranked = sorted(
        speaker_stats.items(),
        key=lambda item: item[1].get('duration', 0),
        reverse=True,
    )
    for speaker, stats in ranked:
        duration = stats.get('duration', 0)
        count = stats.get('count', 0)
        pct = duration / total_duration * 100 if total_duration > 0 else 0
        print(f" {speaker}: {count} 片段,{duration/60:.1f}分鐘 ({pct:.1f}%)")

    return n_speakers >= 2 and n_segments > 100
|
||||
|
||||
|
||||
def test_face_results():
    """Print a summary of the face-detection result file.

    Reads ``/tmp/face_long.json`` and prints the ``frame_count``, the number
    of entries in ``frames``, the fps and the ratio between the two counts.

    Returns:
        ``True`` when ``frames`` holds at least one entry.
    """
    print_header("3. Face 結果測試")

    with open('/tmp/face_long.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    total_frames = data.get('frame_count', 0)
    # NOTE(review): every entry of 'frames' is counted as a detection; confirm
    # the file never stores frames whose 'faces' list is empty.
    detected_frames = data.get('frames', [])
    fps = data.get('fps', 0)

    # A missing or zero frame_count used to raise ZeroDivisionError here.
    rate = len(detected_frames) / total_frames * 100 if total_frames > 0 else 0

    print(f"📊 總數:{total_frames:,}")
    print(f"📊 檢測到人臉:{len(detected_frames):,}")
    print(f"📊 FPS: {fps:.2f}")
    print(f"📊 檢測率:{rate:.2f}%")

    return len(detected_frames) > 0
|
||||
|
||||
|
||||
def test_integration():
    """Print face-match statistics for the integrated Face + ASRX result.

    Reads ``/tmp/charade_integrated.json``, counts the integrated segments
    whose ``has_face`` is truthy and prints a per-speaker breakdown.

    Returns:
        ``True`` when at least 95% of the segments have a face.
    """
    print_header("4. Face + ASRX 整合測試")

    with open('/tmp/charade_integrated.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    segments = data.get('integrated_segments', [])
    total = len(segments)
    with_face = len([seg for seg in segments if seg.get('has_face', False)])
    if total > 0:
        match_rate = with_face / total * 100
    else:
        match_rate = 0

    print(f"📊 總片段:{total}")
    print(f"📊 有人臉:{with_face}")
    print(f"📊 匹配率:{match_rate:.2f}%")

    # Per-speaker match details, in speaker-name order.
    print("\n📢 說話人匹配詳情:")
    speaker_stats = data.get('speaker_stats', {})
    for speaker in sorted(speaker_stats):
        stats = speaker_stats[speaker]
        total_seg = stats.get('total_segments', 0)
        with_face_seg = stats.get('with_face', 0)
        rate = with_face_seg / total_seg * 100 if total_seg > 0 else 0
        if rate >= 99:
            status = "✅"
        elif rate >= 50:
            status = "⚠️"
        else:
            status = "❌"
        print(f" {status} {speaker}: {with_face_seg}/{total_seg} ({rate:.1f}%)")

    return match_rate >= 95
|
||||
|
||||
|
||||
def test_gui_process():
    """Check for a running ``speaker_player_gui_face`` process and show its usage.

    Uses ``pgrep -f`` for the check. When a process is found, the matching
    ``ps aux`` rows are scanned and their third and fourth columns are printed
    as CPU and memory percentages.

    Returns:
        ``True`` when pgrep found a matching process.
    """
    print_header("5. GUI 進程測試")

    probe = subprocess.run(['pgrep', '-f', 'speaker_player_gui_face'],
                           capture_output=True, text=True)
    running = probe.returncode == 0

    if not running:
        print("❌ GUI 進程未運行")
        return running

    pid = probe.stdout.strip()
    print(f"✅ GUI 進程運行中 (PID: {pid})")

    # Resource usage of the matching rows.
    listing = subprocess.run(['ps', 'aux'], capture_output=True, text=True)
    for row in listing.stdout.split('\n'):
        if 'speaker_player_gui_face' not in row or 'grep' in row:
            continue
        fields = row.split()
        if len(fields) >= 8:
            cpu, mem = fields[2], fields[3]
            print(f" CPU: {cpu}%, 記憶體:{mem}%")

    return running
|
||||
|
||||
|
||||
def test_playback():
    """Check that the playback tool chain works, without playing any audio.

    Verifies that ``ffmpeg`` and ``afplay`` are on PATH, then extracts the
    first ASRX segment of ``/tmp/charade_audio.wav`` into a scratch WAV file
    to prove that extraction works.

    Returns:
        ``True`` when both tools are available and the extraction succeeded.
    """
    import shutil  # only needed here; avoids shelling out to `which`

    print_header("6. 播放功能測試")

    ffmpeg_ok = shutil.which('ffmpeg') is not None
    print(f"{'✅' if ffmpeg_ok else '❌'} ffmpeg: {'可用' if ffmpeg_ok else '不可用'}")

    afplay_ok = shutil.which('afplay') is not None
    print(f"{'✅' if afplay_ok else '❌'} afplay: {'可用' if afplay_ok else '不可用'}")

    with open('/tmp/asrx_charade_optimized.json', 'r', encoding='utf-8') as f:
        asrx_data = json.load(f)

    extract_ok = False
    segments = asrx_data.get('segments') or []
    # Without segments there is nothing to extract, and without ffmpeg the
    # run below would raise FileNotFoundError; both count as a failed check.
    if segments and ffmpeg_ok:
        first_seg = segments[0]
        start = first_seg['start']
        end = first_seg['end']
        duration = end - start

        print("\n🎵 測試提取第一個片段:")
        print(f" 時間:{start:.2f}s - {end:.2f}s ({duration:.2f}s)")

        temp_file = Path('/tmp/test_segment.wav')
        cmd = [
            'ffmpeg', '-y', '-loglevel', 'quiet',
            '-i', '/tmp/charade_audio.wav',
            '-ss', str(start),
            '-t', str(duration),
            str(temp_file)
        ]
        try:
            result = subprocess.run(cmd, capture_output=True)
            extract_ok = result.returncode == 0 and temp_file.exists()
            print(f"{'✅' if extract_ok else '❌'} 音頻提取: {'成功' if extract_ok else '失敗'}")
            if extract_ok:
                size = temp_file.stat().st_size / 1024
                print(f" 文件大小:{size:.1f} KB")
        finally:
            # Remove the scratch file on failure too, so a partial output
            # never lingers in /tmp.
            temp_file.unlink(missing_ok=True)
    else:
        print(f"{'✅' if extract_ok else '❌'} 音頻提取: {'成功' if extract_ok else '失敗'}")

    return ffmpeg_ok and afplay_ok and extract_ok
|
||||
|
||||
|
||||
def generate_report():
    """Run all six checks, print a summary and write a Markdown report.

    The report is written to ``/tmp/long_movie_test_report.md``.

    Returns:
        ``True`` when every check passed.
    """
    print_header("測試報告")

    # The checks run here, in this order, while the list is being built.
    tests = [
        ("數據文件", test_data_files()),
        ("ASRX 結果", test_asrx_results()),
        ("Face 結果", test_face_results()),
        ("整合結果", test_integration()),
        ("GUI 進程", test_gui_process()),
        ("播放功能", test_playback())
    ]

    total = len(tests)
    passed = len([name for name, result in tests if result])
    rule = "=" * 70

    print("\n" + rule)
    print(f" 測試總結:{passed}/{total} 通過")
    print(rule)

    for name, result in tests:
        status = "✅" if result else "❌"
        print(f"{status} {name}")

    if passed != total:
        print(f"\n⚠️ {total - passed} 個測試失敗")
    else:
        print("\n🎉 所有測試通過!")

    # Save the report.
    report_path = '/tmp/long_movie_test_report.md'
    lines = [
        "# 長影片測試報告\n\n",
        f"**測試時間**: {datetime.now().isoformat()}\n",
        "**測試影片**: Charade 1963 (114.7 分鐘)\n\n",
        "## 結果\n\n",
        f"**通過**: {passed}/{total}\n\n",
    ]
    for name, result in tests:
        status = "✅" if result else "❌"
        lines.append(f"- {status} {name}\n")
    with open(report_path, 'w', encoding='utf-8') as f:
        f.writelines(lines)

    print(f"\n📄 報告已保存:{report_path}")

    return passed == total
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Exit status 0 when every check passed, 1 otherwise. SystemExit is raised
    # directly because the bare exit() helper is injected by the site module
    # for interactive use and is not guaranteed to exist (e.g. `python -S`).
    success = generate_report()
    raise SystemExit(0 if success else 1)
|
||||
@@ -126,6 +126,52 @@ def extract_speech_audio(audio_path, model, utils, output_dir=None):
|
||||
return speech_audios, speech_segments
|
||||
|
||||
|
||||
def scan_within_segment(wav, sample_rate, start_sec, end_sec, model, utils,
                        min_speech_duration_ms=500, min_silence_duration_ms=300):
    """Run VAD inside one time range and return the speech sub-segments.

    Intended use: refine a coarse time span reported by whisper by cutting it
    at the pauses between sentences.

    Args:
        wav: Full audio waveform; it is sliced by sample index.
        sample_rate: Sampling rate of *wav* in Hz.
        start_sec: Start of the range to scan, in seconds.
        end_sec: End of the range to scan, in seconds.
        model: VAD model passed through to the timestamp function.
        utils: VAD utility tuple whose first item is the
            ``get_speech_timestamps`` callable.
        min_speech_duration_ms: Minimum speech duration passed to the VAD.
        min_silence_duration_ms: Minimum silence duration passed to the VAD.

    Returns:
        List of ``(start_sec, end_sec)`` tuples on the original time axis.
        Empty when the requested range contains no samples.
    """
    get_speech_timestamps = utils[0]

    # Clamp to the waveform. A negative start would otherwise be read as an
    # index counted from the END of the array and scan the wrong audio.
    start_sample = max(int(start_sec * sample_rate), 0)
    end_sample = min(int(end_sec * sample_rate), len(wav))
    if start_sample >= end_sample:
        # Nothing to scan; do not hand an empty buffer to the VAD.
        return []

    segment_wav = wav[start_sample:end_sample]

    speech_ts = get_speech_timestamps(
        segment_wav,
        model,
        sampling_rate=sample_rate,
        min_speech_duration_ms=min_speech_duration_ms,
        min_silence_duration_ms=min_silence_duration_ms,
        return_seconds=True,
    )

    # Shift back to the original time axis; the offset follows the clamped start.
    offset = max(start_sec, 0)
    return [(ts["start"] + offset, ts["end"] + offset) for ts in speech_ts]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 測試 VAD
|
||||
import sys
|
||||
|
||||
35
scripts/asrx_self/whisper_local.py
Normal file
35
scripts/asrx_self/whisper_local.py
Normal file
@@ -0,0 +1,35 @@
|
||||
"""
|
||||
Whisper Local - uses faster-whisper for per-segment transcription
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def load_model(size="small", device="cpu", compute_type="int8"):
    """Create a faster-whisper ``WhisperModel``.

    Args:
        size: Model size or path, passed as the first argument to
            ``WhisperModel``.
        device: Device the model runs on. Defaults to ``"cpu"``, the value
            that was previously hard-coded.
        compute_type: Compute type passed to ``WhisperModel``. Defaults to
            ``"int8"``, the value that was previously hard-coded.

    Returns:
        The constructed ``WhisperModel`` instance.
    """
    # Imported lazily so this module can be imported without faster-whisper.
    from faster_whisper import WhisperModel

    return WhisperModel(size, device=device, compute_type=compute_type)
|
||||
|
||||
|
||||
def transcribe_segment(wav, sample_rate, start_sec, end_sec, model):
    """Transcribe one time range of *wav* with language auto-detection.

    Args:
        wav: Full audio waveform; it is sliced by sample index.
        sample_rate: Sampling rate of *wav* in Hz.
        start_sec: Start of the range in seconds.
        end_sec: End of the range in seconds.
        model: Object whose ``transcribe(audio, language=None)`` returns a
            ``(segments_iterable, info)`` pair.

    Returns:
        Dict with ``"text"`` (segment texts joined by spaces, stripped),
        ``"language"``, ``"lang_prob"`` and ``"segments"`` (the list of
        segment objects yielded by the model). All four are empty or zero
        when the range contains no samples.
    """
    # Clamp to the waveform. A negative start would otherwise index from the
    # end of the array, and an empty slice should never reach the model.
    start_sample = max(int(start_sec * sample_rate), 0)
    end_sample = min(int(end_sec * sample_rate), len(wav))
    if start_sample >= end_sample:
        return {"text": "", "language": "", "lang_prob": 0.0, "segments": []}

    segment_wav = wav[start_sample:end_sample]

    segments_generator, info = model.transcribe(segment_wav, language=None)

    lang_prob = info.language_probability if info else 0.0
    language = info.language if info else ""

    # The model yields segments lazily; materialise them once so they can be
    # both joined into text and returned.
    segs = list(segments_generator)
    text = " ".join(seg.text for seg in segs)

    return {
        "text": text.strip(),
        "language": language,
        "lang_prob": lang_prob,
        "segments": segs,
    }
|
||||
Reference in New Issue
Block a user