feat: ASRX hybrid pipeline, identity history, worker fixes, checkpoint system

2026-06-02 07:13:23 +08:00
parent e3066c3f49
commit e1572907ae
198 changed files with 43705 additions and 8910 deletions
--- a/scripts/asrx_self/integrate_face_asrx_speaker.py
+++ b/scripts/asrx_self/integrate_face_asrx_speaker.py
@@ -1,178 +0,0 @@
-#!/opt/homebrew/bin/python3.11
-"""
-整合 Face + ASRX 說話人分離（版本 3 - 修復 face_detected 檢查）
-"""
-
-import json
-import argparse
-from pathlib import Path
-from typing import Dict, List
-
-
-def load_json(path: str):
-    """載入 JSON 文件"""
-    with open(path, 'r', encoding='utf-8') as f:
-        return json.load(f)
-
-
-def match_face_with_speaker_v3(face_data: Dict, asrx_data: Dict, 
-                                time_threshold: float = 3.0) -> List[Dict]:
-    """
-    匹配人臉與說話人（版本 3 - 修復版）
-    
-    修復：Face 數據沒有 face_detected 欄位，改用 faces 列表是否為空判斷
-    """
-    face_frames = face_data.get('frames', [])
-    asrx_segments = asrx_data.get('segments', [])
-    
-    # 將 Face 幀按時間排序
-    face_frames_sorted = sorted(face_frames, key=lambda x: x.get('timestamp', 0))
-    
-    print(f"  Face frames: {len(face_frames_sorted)}")
-    print(f"  ASRX segments: {len(asrx_segments)}")
-    
-    # 匹配
-    integrated = []
-    
-    for i, seg in enumerate(asrx_segments):
-        start = seg['start']
-        end = seg['end']
-        speaker = seg['speaker']
-        mid_time = (start + end) / 2
-        
-        # 找到時間範圍內的人臉
-        faces_in_range = []
-        for frame in face_frames_sorted:
-            ts = frame.get('timestamp', 0)
-            
-            # 檢查是否在時間範圍內
-            if start - time_threshold <= ts <= end + time_threshold:
-                # 檢查是否有人臉（faces 列表不為空）
-                faces = frame.get('faces', [])
-                if faces and len(faces) > 0:
-                    faces_in_range.append({
-                        'timestamp': ts,
-                        'faces': faces,
-                        'distance_from_mid': abs(ts - mid_time)
-                    })
-        
-        # 選擇最接近片段中間的人臉
-        if faces_in_range:
-            faces_in_range.sort(key=lambda x: x['distance_from_mid'])
-            best_face = faces_in_range[0]
-        else:
-            best_face = None
-        
-        # 建立整合結果
-        integrated.append({
-            'start': start,
-            'end': end,
-            'duration': seg.get('duration', end - start),
-            'speaker': speaker,
-            'has_face': best_face is not None,
-            'face_timestamp': best_face['timestamp'] if best_face else None,
-            'face_location': best_face['faces'][0] if best_face and best_face['faces'] else None,
-            'face_count_in_range': len(faces_in_range)
-        })
-        
-        # 進度顯示
-        if (i + 1) % 200 == 0:
-            print(f"  Processed {i+1}/{len(asrx_segments)} segments...")
-    
-    return integrated
-
-
-def analyze_speaker_face(integrated: List[Dict]):
-    """分析說話人與人臉的對應"""
-    speaker_stats = {}
-    
-    for item in integrated:
-        speaker = item['speaker']
-        if speaker not in speaker_stats:
-            speaker_stats[speaker] = {
-                'total_segments': 0,
-                'with_face': 0,
-                'without_face': 0,
-                'total_duration': 0
-            }
-        
-        speaker_stats[speaker]['total_segments'] += 1
-        speaker_stats[speaker]['total_duration'] += item['duration']
-        
-        if item['has_face']:
-            speaker_stats[speaker]['with_face'] += 1
-        else:
-            speaker_stats[speaker]['without_face'] += 1
-    
-    return speaker_stats
-
-
-def main():
-    parser = argparse.ArgumentParser(description='整合 Face + ASRX 說話人')
-    parser.add_argument('face_json', help='Face 檢測結果 JSON')
-    parser.add_argument('asrx_json', help='ASRX 說話人分離 JSON')
-    parser.add_argument('-o', '--output', help='輸出整合結果 JSON')
-    parser.add_argument('--threshold', type=float, default=3.0,
-                        help='時間閾值（秒）')
-    parser.add_argument('--stats', action='store_true', help='只显示統計')
-    
-    args = parser.parse_args()
-    
-    # 載入數據
-    print(f"[Load] Face: {args.face_json}")
-    face_data = load_json(args.face_json)
-    
-    print(f"[Load] ASRX: {args.asrx_json}")
-    asrx_data = load_json(args.asrx_json)
-    
-    # 匹配
-    print(f"\n[Match] Matching faces with speakers (threshold={args.threshold}s)...")
-    integrated = match_face_with_speaker_v3(face_data, asrx_data, args.threshold)
-    
-    # 分析
-    print("\n[Analyze] Analyzing speaker-face correspondence...")
-    speaker_stats = analyze_speaker_face(integrated)
-    
-    # 顯示統計
-    print(f"\n{'='*70}")
-    print("說話人 - 人臉對應統計")
-    print(f"{'='*70}")
-    
-    total_segments = len(integrated)
-    total_with_face = sum(1 for item in integrated if item['has_face'])
-    
-    for speaker, stats in sorted(speaker_stats.items()):
-        with_face_pct = stats['with_face'] / stats['total_segments'] * 100 if stats['total_segments'] > 0 else 0
-        print(f"\n🔊 {speaker}:")
-        print(f"  總片段：{stats['total_segments']}")
-        print(f"  有人臉：{stats['with_face']} ({with_face_pct:.1f}%)")
-        print(f"  無人臉：{stats['without_face']}")
-        print(f"  總時長：{stats['total_duration']:.1f}s ({stats['total_duration']/60:.1f}分鐘)")
-    
-    print(f"\n{'='*70}")
-    print(f"總計：{total_segments} 片段，{total_with_face} 片段有人臉 ({total_with_face/total_segments*100:.1f}%)")
-    print(f"{'='*70}")
-    
-    # 保存結果
-    if args.output:
-        output_path = Path(args.output)
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        
-        result = {
-            'face_source': str(args.face_json),
-            'asrx_source': str(args.asrx_json),
-            'time_threshold': args.threshold,
-            'integrated_segments': integrated,
-            'speaker_stats': speaker_stats
-        }
-        
-        with open(output_path, 'w', encoding='utf-8') as f:
-            json.dump(result, f, indent=2, ensure_ascii=False)
-        
-        print(f"\n[Save] Results saved to: {output_path}")
-    
-    return integrated, speaker_stats
-
-
-if __name__ == "__main__":
-    main()
--- a/scripts/asrx_self/main.py
+++ b/scripts/asrx_self/main.py
@@ -1,268 +0,0 @@
-#!/opt/homebrew/bin/python3.11
-"""
-Self-implemented ASRX - 自實作說話人分離系統
-基於聲紋嵌入 + 譜聚類
-
-技術架構:
-1. VAD (Silero VAD) - 語音活動檢測
-2. Speaker Encoder (ECAPA-TDNN) - 聲紋特徵提取
-3. Spectral Clustering - 譜聚類
-4. Post-processing - 後處理
-
-流程:
-音頻 → VAD → 語音片段 → 聲紋嵌入 → 相似度矩陣 → 譜聚類 → 說話人 ID
-"""
-
-import sys
-import json
-import time
-from pathlib import Path
-
-# 導入自定義模組
-from vad import load_vad_model, extract_speech_segments
-from speaker_encoder import (
-    load_speaker_encoder,
-    extract_speaker_embeddings_batch,
-    compute_similarity_matrix,
-    normalize_embeddings,
-)
-from speaker_cluster import spectral_clustering_speaker, smooth_speaker_labels
-
-
-class SelfASRX:
-    """
-    自實作說話人分離系統
-    """
-
-    def __init__(self):
-        """初始化模型"""
-        print("[SelfASRX] Initializing models...")
-
-        # 載入 VAD 模型
-        print("[SelfASRX] Loading VAD model (Silero)...")
-        self.vad_model, self.vad_utils = load_vad_model()
-
-        # 載入聲紋模型
-        print("[SelfASRX] Loading speaker encoder (ECAPA-TDNN)...")
-        self.speaker_encoder = load_speaker_encoder()
-
-        print("[SelfASRX] Models loaded successfully")
-
-    def process(
-        self,
-        audio_path,
-        output_path=None,
-        min_speech_duration_ms=500,
-        n_speakers=None,
-        smooth_window=5,
-    ):
-        """
-        處理音頻文件進行說話人分離
-
-        Args:
-            audio_path: 音頻文件路徑
-            output_path: 輸出 JSON 路徑（可選）
-            min_speech_duration_ms: 最小語音持續時間
-            n_speakers: 說話人數量（None=自動估計）
-            smooth_window: 平滑窗口大小
-
-        Returns:
-            result: 說話人分離結果
-        """
-        start_time = time.time()
-        print(f"\n[SelfASRX] Processing: {audio_path}")
-        print("=" * 60)
-
-        # 步驟 1: VAD - 語音活動檢測
-        print("\n[Step 1] Voice Activity Detection...")
-        step1_start = time.time()
-
-        speech_segments, wav, sample_rate = extract_speech_segments(
-            audio_path,
-            self.vad_model,
-            self.vad_utils,
-            min_speech_duration_ms=min_speech_duration_ms,
-        )
-
-        step1_time = time.time() - step1_start
-        print(f"  Speech segments: {len(speech_segments)}")
-        print(f"  Total duration: {len(wav) / sample_rate:.2f}s")
-        print(f"  VAD time: {step1_time:.2f}s")
-
-        if len(speech_segments) == 0:
-            print("[SelfASRX] No speech detected!")
-            return {"error": "No speech detected", "segments": []}
-
-        # 步驟 2: 聲紋特徵提取
-        print("\n[Step 2] Speaker embedding extraction...")
-        step2_start = time.time()
-
-        # 提取語音片段音頻
-        audio_segments = []
-        for start_sec, end_sec in speech_segments:
-            start_sample = int(start_sec * sample_rate)
-            end_sample = int(end_sec * sample_rate)
-            audio_segments.append(wav[start_sample:end_sample])
-
-        # 批量提取嵌入
-        embeddings = extract_speaker_embeddings_batch(
-            self.speaker_encoder, audio_segments, sample_rate
-        )
-
-        # 正規化
-        embeddings = normalize_embeddings(embeddings)
-
-        step2_time = time.time() - step2_start
-        print(f"  Embedding shape: {embeddings.shape}")
-        print(f"  Embedding time: {step2_time:.2f}s")
-
-        # 步驟 3: 計算相似度矩陣
-        print("\n[Step 3] Computing similarity matrix...")
-        step3_start = time.time()
-
-        similarity_matrix = compute_similarity_matrix(embeddings, method="cosine")
-
-        step3_time = time.time() - step3_start
-        print(f"  Similarity matrix shape: {similarity_matrix.shape}")
-        print(f"  Similarity time: {step3_time:.2f}s")
-
-        # 步驟 4: 譜聚類
-        print("\n[Step 4] Spectral clustering...")
-        step4_start = time.time()
-
-        speaker_labels, estimated_n_speakers = spectral_clustering_speaker(
-            similarity_matrix, n_speakers=n_speakers, auto_estimate=(n_speakers is None)
-        )
-
-        # 平滑標籤
-        if smooth_window > 1:
-            speaker_labels = smooth_speaker_labels(
-                speaker_labels, window_size=smooth_window
-            )
-
-        step4_time = time.time() - step4_start
-        print(f"  Estimated speakers: {estimated_n_speakers}")
-        print(f"  Clustering time: {step4_time:.2f}s")
-
-        # 步驟 5: 建立輸出結果
-        print("\n[Step 5] Building output...")
-
-        result = {
-            "audio_path": str(audio_path),
-            "total_duration": len(wav) / sample_rate,
-            "n_speech_segments": len(speech_segments),
-            "n_speakers": int(estimated_n_speakers),
-            "segments": [],
-        }
-
-        for i, ((start, end), label) in enumerate(zip(speech_segments, speaker_labels)):
-            result["segments"].append(
-                {
-                    "index": i,
-                    "start": round(start, 3),
-                    "end": round(end, 3),
-                    "duration": round(end - start, 3),
-                    "speaker": f"SPEAKER_{int(label)}",
-                }
-            )
-
-        # 統計每個說話人的總時長
-        speaker_stats = {}
-        for seg in result["segments"]:
-            speaker = seg["speaker"]
-            if speaker not in speaker_stats:
-                speaker_stats[speaker] = {"count": 0, "duration": 0}
-            speaker_stats[speaker]["count"] += 1
-            speaker_stats[speaker]["duration"] += seg["duration"]
-
-        result["speaker_stats"] = speaker_stats
-
-        total_time = time.time() - start_time
-        result["processing_time"] = round(total_time, 2)
-        result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
-
-        print("\n[SelfASRX] Processing completed!")
-        print(f"  Total time: {total_time:.2f}s")
-        print(f"  Realtime factor: {result['realtime_factor']:.2f}x")
-        print(f"  Detected speakers: {estimated_n_speakers}")
-
-        # 保存結果
-        if output_path:
-            output_path = Path(output_path)
-            output_path.parent.mkdir(parents=True, exist_ok=True)
-
-            with open(output_path, "w", encoding="utf-8") as f:
-                json.dump(result, f, indent=2, ensure_ascii=False)
-
-            print(f"  Results saved to: {output_path}")
-
-        print("=" * 60)
-
-        return result
-
-
-def main():
-    """主函數"""
-    import argparse
-
-    parser = argparse.ArgumentParser(
-        description="Self-implemented ASRX - Speaker Diarization"
-    )
-    parser.add_argument("audio_path", help="Path to audio file")
-    parser.add_argument("-o", "--output", help="Output JSON path")
-    parser.add_argument(
-        "--min-speech-duration",
-        type=int,
-        default=500,
-        help="Minimum speech duration in ms (default: 500)",
-    )
-    parser.add_argument(
-        "--n-speakers",
-        type=int,
-        default=None,
-        help="Number of speakers (default: auto-estimate)",
-    )
-    parser.add_argument(
-        "--smooth-window",
-        type=int,
-        default=5,
-        help="Smoothing window size (default: 5)",
-    )
-
-    args = parser.parse_args()
-
-    # 檢查文件是否存在
-    if not Path(args.audio_path).exists():
-        print(f"Error: Audio file not found: {args.audio_path}")
-        sys.exit(1)
-
-    # 創建 ASRX 實例並處理
-    asrx = SelfASRX()
-    result = asrx.process(
-        args.audio_path,
-        args.output,
-        min_speech_duration_ms=args.min_speech_duration,
-        n_speakers=args.n_speakers,
-        smooth_window=args.smooth_window,
-    )
-
-    # 顯示結果摘要
-    if "error" not in result:
-        print("\n[Summary]")
-        print(f"  Audio duration: {result['total_duration']:.2f}s")
-        print(f"  Speech segments: {result['n_speech_segments']}")
-        print(f"  Detected speakers: {result['n_speakers']}")
-        print(f"  Processing time: {result['processing_time']:.2f}s")
-        print(f"  Realtime factor: {result['realtime_factor']:.2f}x")
-
-        print("\n[Speaker Statistics]")
-        for speaker, stats in result["speaker_stats"].items():
-            pct = stats["duration"] / result["total_duration"] * 100
-            print(
-                f"  {speaker}: {stats['count']} segments, "
-                + f"{stats['duration']:.2f}s ({pct:.1f}%)"
-            )
-
-
-if __name__ == "__main__":
-    main()
--- a/scripts/asrx_self/main_fixed.py
+++ b/scripts/asrx_self/main_fixed.py
@@ -1,308 +1,728 @@
-#!/opt/homebrew/bin/python3.11
 """
-Self-implemented ASRX - Fixed Version
-使用魯棒的聚類算法
+SelfASRXFixed - 7 步 Hybrid Speaker Diarization Pipeline
+
+Pipeline:
+  1. whisper.transcribe(full_audio) → rough segments + text + language
+  2. VAD scan each rough segment → refined segments
+  3. whisper per refined segment → {text, language, lang_prob}
+  4. ECAPA-TDNN per refined segment → 192-dim embeddings
+  5. AgglomerativeClustering → speaker_labels
+  6. Store all embeddings in Qdrant (payload: file_uuid, speaker_id, text, ...)
+  7. High-quality embeddings → gender classify + store reference in Qdrant
 """

 import sys
 import json
 import time
+import os
 import numpy as np
 from pathlib import Path
+from urllib.request import Request, urlopen
+from urllib.error import URLError

-# 導入自定義模組
-from vad import load_vad_model, extract_speech_segments
-from speaker_encoder import (
-    load_speaker_encoder, 
-    extract_speaker_embeddings_batch,
-    normalize_embeddings
-)
-from speaker_cluster_fixed import robust_speaker_clustering
+
+def _load_audio(path):
+    """Read an audio file and return ``(samples, sample_rate)``.
+
+    Input with more than one dimension is reduced to 1-D by averaging over
+    axis 1 (presumably the channel axis, i.e. a mono down-mix).
+    """
+    # Function-scope import: soundfile is only loaded when audio is read.
+    import soundfile as sf
+
+    samples, rate = sf.read(path)
+    needs_downmix = len(samples.shape) > 1
+    mono = np.mean(samples, axis=1) if needs_downmix else samples
+    return mono, rate
+
+
+def _load_whisper_model(size="small"):
+    """Load a whisper model of the requested ``size`` via ``whisper_local``."""
+    # Function-scope import: whisper_local is only loaded when this is called.
+    import whisper_local
+
+    model = whisper_local.load_model(size)
+    return model
+
+
+def _load_vad():
+    """Load the VAD model by delegating to ``vad.load_vad_model``.
+
+    The result is returned unchanged; the caller in this file unpacks it
+    into a model and a utilities object.
+    """
+    # Function-scope import: the vad module is only loaded when this is called.
+    import vad
+
+    loaded = vad.load_vad_model()
+    return loaded
+
+
+def _load_speaker_encoder():
+    """Load the speaker encoder by delegating to ``speaker_encoder``."""
+    # Function-scope import: speaker_encoder is only loaded when this is called.
+    import speaker_encoder
+
+    encoder = speaker_encoder.load_speaker_encoder()
+    return encoder
+
+
+def _load_gender_classifier():
+    """Try to load the SpeechBrain gender classifier on CPU.
+
+    Returns:
+        The classifier instance, or ``None`` when the import or the model
+        load fails for any reason (the reason is printed, not raised).
+    """
+    hparams = {
+        "source": "speechbrain/gender-recognition-ecapa",
+        "run_opts": {"device": "cpu"},
+    }
+    try:
+        from speechbrain.inference.classifiers import EncoderClassifier
+        model = EncoderClassifier.from_hparams(**hparams)
+        print("[Gender] Classifier loaded: speechbrain/gender-recognition-ecapa")
+        return model
+    except Exception as err:
+        # Best effort: the classifier is optional, so every failure maps to None.
+        print(f"[Gender] Classifier not available: {err}")
+    return None
+
+
+def _ensure_speaker_collection(qdrant_url, api_key, collection):
+    """Make sure the Qdrant speaker collection exists, creating it if missing.
+
+    A GET on the collection URL is tried first. If it answers HTTP 404 the
+    collection is created with a PUT (vector size 192, cosine distance).
+
+    Args:
+        qdrant_url: Base URL of the Qdrant HTTP API.
+        api_key: Value for the ``api-key`` header; falsy to send no header.
+        collection: Name of the collection.
+
+    Returns:
+        True if the collection exists or was created; False on any failure
+        (the error is printed, never raised).
+    """
+    # Bound every request so an unresponsive server cannot hang the pipeline.
+    timeout_s = 30
+    auth = {"api-key": api_key} if api_key else {}
+    try:
+        url = f"{qdrant_url}/collections/{collection}"
+        try:
+            # The context manager closes the HTTP response promptly.
+            with urlopen(Request(url, method="GET", headers=auth),
+                         timeout=timeout_s):
+                return True
+        except URLError as e:
+            # HTTPError subclasses URLError and carries .code; a plain
+            # URLError (connection failure) has no code and is re-raised.
+            if getattr(e, "code", None) != 404:
+                raise
+
+        body = json.dumps({
+            "vectors": {
+                "size": 192,
+                "distance": "Cosine"
+            }
+        }).encode()
+        create = Request(url, data=body, method="PUT",
+                         headers={"Content-Type": "application/json", **auth})
+        with urlopen(create, timeout=timeout_s):
+            pass
+        print(f"[Qdrant] Created collection: {collection} (dim=192)")
+        return True
+    except Exception as e:
+        print(f"[Qdrant] Cannot access Qdrant: {e}")
+        return False
+
+
+def _qdrant_upsert(qdrant_url, api_key, collection, points):
+    """Upsert a batch of points into a Qdrant collection.
+
+    Sends one PUT to ``/collections/<collection>/points?wait=true`` with
+    ``{"points": points}`` as the JSON body.
+
+    Args:
+        qdrant_url: Base URL of the Qdrant HTTP API.
+        api_key: Value for the ``api-key`` header; falsy to send no header.
+        collection: Name of the target collection.
+        points: JSON-serialisable list of point objects.
+
+    Returns:
+        True when the request succeeded; False on any failure (the error is
+        printed, never raised).
+    """
+    # Generous bound: wait=true makes the server reply only after applying
+    # the batch, but the call must still not block forever.
+    timeout_s = 120
+    try:
+        url = f"{qdrant_url}/collections/{collection}/points?wait=true"
+        body = json.dumps({"points": points}).encode()
+        headers = {"Content-Type": "application/json"}
+        if api_key:
+            headers["api-key"] = api_key
+        req = Request(url, data=body, headers=headers, method="PUT")
+        # The context manager closes the HTTP response promptly.
+        with urlopen(req, timeout=timeout_s):
+            pass
+        return True
+    except Exception as e:
+        print(f"[Qdrant] Upsert failed: {e}")
+        return False
+
+
+def _hash_point_id(file_uuid, label):
+    """Derive a stable, non-negative 63-bit point ID from file UUID and label.
+
+    The built-in ``hash()`` is salted per interpreter process for strings,
+    so it cannot give the same ID on two runs. A SHA-256 digest of
+    ``"<file_uuid>_<label>"`` is used instead, which lets re-processing a
+    file overwrite its earlier points rather than duplicate them.
+
+    Args:
+        file_uuid: Identifier of the processed file.
+        label: Per-file discriminator (formatted with ``str()``).
+
+    Returns:
+        int in the range ``[0, 2**63)``.
+    """
+    import hashlib
+
+    key = f"{file_uuid}_{label}".encode("utf-8")
+    digest = hashlib.sha256(key).digest()
+    # First 8 bytes give 64 bits; the mask keeps the value non-negative
+    # when it is stored as a signed 64-bit integer.
+    return int.from_bytes(digest[:8], "big") & 0x7FFFFFFFFFFFFFFF
+
+
+def _save_checkpoint(path: str, data: dict):
+    """Write ``data`` as JSON to ``path`` via a temp file and a rename.
+
+    The JSON is written to ``<path>.tmp`` first and then moved over ``path``
+    with ``os.replace``, so a reader never sees a half-written checkpoint.
+    Missing parent directories are created.
+
+    Args:
+        path: Destination file; ``str`` or any ``os.PathLike``.
+        data: JSON-serialisable checkpoint content.
+
+    Raises:
+        TypeError / ValueError: ``data`` is not JSON-serialisable.
+        OSError: the file could not be written or renamed.
+    """
+    # os.fspath lets pathlib.Path work with the string concatenation below.
+    path = os.fspath(path)
+    tmp = path + ".tmp"
+    Path(tmp).parent.mkdir(parents=True, exist_ok=True)
+    try:
+        with open(tmp, "w", encoding="utf-8") as f:
+            json.dump(data, f, indent=2, ensure_ascii=False)
+        os.replace(tmp, path)
+    except BaseException:
+        # Do not leave a truncated temp file behind; the previous checkpoint
+        # (if any) is still intact because the rename never happened.
+        try:
+            os.remove(tmp)
+        except OSError:
+            pass
+        raise
+
+
+def compute_embedding_quality(embeddings, labels):
+    """Cosine similarity of each embedding to the centroid of its own cluster.
+
+    Args:
+        embeddings: Array-like of shape ``(n, dim)``.
+        labels: Array-like of ``n`` cluster labels (list or ndarray).
+
+    Returns:
+        ``np.ndarray`` of shape ``(n,)`` with one similarity per embedding.
+        A zero-norm embedding or centroid yields 0.0.
+    """
+    embeddings = np.asarray(embeddings, dtype=float)
+    # A plain list would make ``labels == label`` a scalar comparison and
+    # silently select nothing, so force an array for element-wise masks.
+    labels = np.asarray(labels)
+
+    qualities = np.zeros(len(embeddings), dtype=float)
+    if len(embeddings) == 0:
+        return qualities
+
+    row_norms = np.linalg.norm(embeddings, axis=1)
+    for label in np.unique(labels):
+        mask = labels == label
+        members = embeddings[mask]
+        centroid = members.mean(axis=0)
+        # Cosine similarity is scale-invariant, so the centroid does not
+        # need to be normalised separately; its norm goes in the denominator.
+        denom = row_norms[mask] * np.linalg.norm(centroid)
+        qualities[mask] = np.divide(
+            members @ centroid,
+            denom,
+            out=np.zeros_like(denom),
+            where=denom > 0,
+        )
+    return qualities


 class SelfASRXFixed:
-    """自實作說話人分離系統（修復版）"""
-    
+    """7 步 Hybrid Speaker Diarization Pipeline"""
+
    def __init__(self):
-        print("[SelfASRX-Fixed] Initializing models...")
-        
-        # 載入 VAD 模型
-        print("[SelfASRX-Fixed] Loading VAD model (Silero)...")
-        self.vad_model, self.vad_utils = load_vad_model()
-        
-        # 載入聲紋模型
-        print("[SelfASRX-Fixed] Loading speaker encoder (ECAPA-TDNN)...")
-        self.speaker_encoder = load_speaker_encoder()
-        
-        print("[SelfASRX-Fixed] Models loaded successfully")
-    
-    def process(self, audio_path, output_path=None, 
-                min_speech_duration_ms=500,
-                n_speakers=None,
-                max_speakers=10):
-        """處理音頻文件"""
-        start_time = time.time()
-        print(f"\n[SelfASRX-Fixed] Processing: {audio_path}")
-        print("=" * 60)
-        
-        # 步驟 1: VAD
-        print("\n[Step 1] Voice Activity Detection...")
-        step1_start = time.time()
-        
-        speech_segments, wav, sample_rate = extract_speech_segments(
-            audio_path, self.vad_model, self.vad_utils,
-            min_speech_duration_ms=min_speech_duration_ms
-        )
-        
-        step1_time = time.time() - step1_start
-        print(f"  Speech segments: {len(speech_segments)}")
-        print(f"  Total duration: {len(wav)/sample_rate:.2f}s")
-        print(f"  VAD time: {step1_time:.2f}s")
-        
-        if len(speech_segments) == 0:
-            print("[SelfASRX-Fixed] No speech detected!")
-            return {"error": "No speech detected", "segments": []}
-        
-        # 步驟 2: 聲紋特徵提取
-        print("\n[Step 2] Speaker embedding extraction...")
-        step2_start = time.time()
-        
-        # 提取語音片段音頻
-        audio_segments = []
-        for start_sec, end_sec in speech_segments:
-            start_sample = int(start_sec * sample_rate)
-            end_sample = int(end_sec * sample_rate)
-            audio_segments.append(wav[start_sample:end_sample])
-        
-        # 批量提取嵌入
-        embeddings = extract_speaker_embeddings_batch(
-            self.speaker_encoder, audio_segments, sample_rate
-        )
-        
-        # 正規化
-        embeddings = normalize_embeddings(embeddings)
-        
-        step2_time = time.time() - step2_start
-        print(f"  Embedding shape: {embeddings.shape}")
-        print(f"  Embedding time: {step2_time:.2f}s")
-        
-        # 步驟 3: 魯棒聚類
-        print("\n[Step 3] Robust speaker clustering...")
-        step3_start = time.time()
-        
-        speaker_labels, estimated_n_speakers = robust_speaker_clustering(
-            embeddings,
-            n_speakers=n_speakers,
-            max_speakers=max_speakers
-        )
-        
-        step3_time = time.time() - step3_start
-        print(f"  Clustering time: {step3_time:.2f}s")
-        
-        # 步驟 4: 建立輸出
-        print("\n[Step 4] Building output...")
-        
-        result = {
-            "audio_path": str(audio_path),
-            "total_duration": len(wav) / sample_rate,
-            "n_speech_segments": len(speech_segments),
-            "n_speakers": int(estimated_n_speakers),
-            "segments": []
-        }
-        
-        for i, ((start, end), label) in enumerate(zip(speech_segments, speaker_labels)):
-            result["segments"].append({
-                "index": i,
-                "start": round(start, 3),
-                "end": round(end, 3),
-                "duration": round(end - start, 3),
-                "speaker": f"SPEAKER_{int(label)}"
-            })
-        
-        # 統計每個說話人的總時長
-        speaker_stats = {}
-        for seg in result["segments"]:
-            speaker = seg["speaker"]
-            if speaker not in speaker_stats:
-                speaker_stats[speaker] = {"count": 0, "duration": 0}
-            speaker_stats[speaker]["count"] += 1
-            speaker_stats[speaker]["duration"] += seg["duration"]
-        
-        result["speaker_stats"] = speaker_stats
-        
-        total_time = time.time() - start_time
-        result["processing_time"] = round(total_time, 2)
-        result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
-        
-        print("\n[SelfASRX-Fixed] Processing completed!")
-        print(f"  Total time: {total_time:.2f}s")
-        print(f"  Realtime factor: {result['realtime_factor']:.2f}x")
-        print(f"  Detected speakers: {estimated_n_speakers}")
-        
-        # 保存結果
-        if output_path:
-            output_path = Path(output_path)
-            output_path.parent.mkdir(parents=True, exist_ok=True)
-            
-            with open(output_path, 'w', encoding='utf-8') as f:
-                json.dump(result, f, indent=2, ensure_ascii=False)
-            
-            print(f"  Results saved to: {output_path}")
-        
-        print("=" * 60)
-        
-        return result
+        print("[SelfASRX] Initializing models...")

+        print("[SelfASRX] Loading whisper model...")
+        self.whisper = _load_whisper_model("small")
+
+        print("[SelfASRX] Loading VAD model (Silero)...")
+        self.vad_model, self.vad_utils = _load_vad()
+
+        print("[SelfASRX] Loading speaker encoder (ECAPA-TDNN)...")
+        self.speaker_encoder = _load_speaker_encoder()
+
+        print("[SelfASRX] Loading gender classifier...")
+        self.gender_classifier = _load_gender_classifier()
+
+        # Qdrant 設定
+        self.qdrant_url = os.environ.get("QDRANT_URL", "http://localhost:6333")
+        self.qdrant_api_key = os.environ.get("QDRANT_API_KEY", "")
+        schema = os.environ.get("DATABASE_SCHEMA", "public")
+        self.qdrant_collection = os.environ.get(
+            "QDRANT_SPEAKER_COLLECTION",
+            f"momentry_{schema}_speaker"
+        )
+        self._qdrant_ok = False
+
+        print("[SelfASRX] Models loaded successfully")
+
+    def process(self, audio_path, output_path=None, file_uuid=None,
+                max_speakers=10, quality_threshold=0.85,
+                checkpoint_path=None):
+        """7 步 speaker diarization pipeline

-    def process_with_segments(self, audio_path, asr_segments, output_path=None):
-        """
-        使用 ASR segment 邊界進行 speaker diarization，取代 VAD 步驟。
-        
        Args:
-            audio_path: 音頻文件路徑（WAV）
-            asr_segments: ASR segment 列表，每個包含 start/end（秒）
-            output_path: 輸出 JSON 路徑（可選）
+            audio_path: 音頻文件路徑 (WAV 16kHz mono)
+            output_path: 輸出 JSON 路徑 (可選)
+            file_uuid: 檔案 UUID (用於 Qdrant 儲存)
+            max_speakers: 最大說話人數
+            quality_threshold: 高品質聲紋門檻 (0-1)
+            checkpoint_path: Step 3 完成後儲存 checkpoint 路徑
+
+        Returns:
+            dict: segments, speaker_stats, n_speakers, total_duration, references
        """
        start_time = time.time()
-        print(f"\n[SelfASRX-Fixed] Processing with {len(asr_segments)} ASR segments: {audio_path}")
+        print(f"\n[SelfASRX] Processing: {audio_path}")
        print("=" * 60)

-        # 載入完整音頻
-        import soundfile as sf
-        wav, sample_rate = sf.read(audio_path)
-        if len(wav.shape) > 1:
-            wav = np.mean(wav, axis=1)  # 轉 mono
-        print(f"  Audio loaded: {len(wav)/sample_rate:.2f}s, {sample_rate}Hz")
+        # 載入音頻
+        wav, sample_rate = _load_audio(audio_path)
+        total_duration = len(wav) / sample_rate
+        print(f"  Audio: {total_duration:.2f}s, {sample_rate}Hz")

-        # 使用 ASR segments 取代 VAD (audio处理用time)
-        speech_segments = [(s["start_time"], s["end_time"]) for s in asr_segments]
-        print(f"  Speech segments from ASR: {len(speech_segments)}")
+        # ── Step 1: whisper 粗略定位 (faster-whisper) ──
+        print("\n[Step 1] Initial whisper transcription...")
+        t1 = time.time()
+        seg_gen, info = self.whisper.transcribe(audio_path)
+        rough_segments = []
+        for seg in seg_gen:
+            rough_segments.append({"start": seg.start, "end": seg.end, "text": seg.text})
+        language = info.language if info else None
+        print(f"  Rough segments: {len(rough_segments)}")
+        print(f"  Language: {language}")
+        print(f"  Step 1 time: {time.time() - t1:.2f}s")

-        if len(speech_segments) == 0:
-            print("[SelfASRX-Fixed] No ASR segments provided!")
-            return {"error": "No ASR segments", "segments": []}
+        if not rough_segments:
+            print("[SelfASRX] No speech detected by whisper!")
+            return {"error": "No speech detected", "segments": []}

-        # 提取語音片段
-        audio_segments = []
-        for start_sec, end_sec in speech_segments:
-            start_sample = int(start_sec * sample_rate)
-            end_sample = int(end_sec * sample_rate)
-            if start_sample >= len(wav):
+        # ── Step 2: VAD scan 每個 rough segment 細切 ──
+        print("\n[Step 2] VAD scan for refined segmentation...")
+        t2 = time.time()
+        refined_segments = []
+        for seg in rough_segments:
+            s = seg["start"]
+            e = seg["end"]
+            sub = self._vad_scan_segment(wav, sample_rate, s, e)
+            if sub:
+                refined_segments.extend(sub)
+            else:
+                refined_segments.append((s, e))
+        print(f"  Refined segments: {len(refined_segments)}")
+        print(f"  Step 2 time: {time.time() - t2:.2f}s")
+
+        if not refined_segments:
+            return {"error": "No segments after VAD scan", "segments": []}
+
+        # ── Step 3: whisper per refined segment ──
+        print("\n[Step 3] Per-segment transcription...")
+        t3 = time.time()
+        CHECKPOINT_INTERVAL = 50
+
+        segment_texts = []
+        resume_from = 0
+
+        # 載入既有 partial checkpoint（中斷續接）
+        if checkpoint_path and os.path.exists(checkpoint_path):
+            try:
+                with open(checkpoint_path, "r") as f:
+                    cp = json.load(f)
+                if cp.get("checkpoint_version") == 2 and not cp.get("step3_completed"):
+                    saved = cp.get("segment_texts", [])
+                    if saved:
+                        resume_from = len(saved)
+                        segment_texts = saved
+                        print(f"[Step 3] Resuming from #{resume_from}/{len(refined_segments)}")
+            except Exception:
+                pass
+
+        for i, (start_sec, end_sec) in enumerate(refined_segments):
+            if i < resume_from:
                continue
-            audio_segments.append(wav[start_sample:min(end_sample, len(wav))])
+            seg_text = self._transcribe_segment(wav, sample_rate, start_sec, end_sec)
+            segment_texts.append(seg_text)

-        print(f"  Audio segments extracted: {len(audio_segments)}")
+            if checkpoint_path and (i + 1) % CHECKPOINT_INTERVAL == 0:
+                _save_checkpoint(checkpoint_path, {
+                    "checkpoint_version": 2,
+                    "step3_completed": False,
+                    "step3_progress": i + 1,
+                    "language": language,
+                    "total_duration": total_duration,
+                    "refined_segments": [[s, e] for s, e in refined_segments],
+                    "segment_texts": [{
+                        "text": st["text"],
+                        "language": st["language"],
+                        "lang_prob": st["lang_prob"],
+                    } for st in segment_texts],
+                    "file_uuid": file_uuid,
+                    "max_speakers": max_speakers,
+                    "quality_threshold": quality_threshold,
+                })
+                print(f"[Checkpoint] Step 3: {i+1}/{len(refined_segments)}")

-        # 批量提取聲紋嵌入
-        print("\n[Step 2] Speaker embedding extraction...")
-        step2_start = time.time()
+        print(f"  Step 3 time: {time.time() - t3:.2f}s")
+
+        # ── Save final checkpoint after Step 3 ──
+        if checkpoint_path:
+            _save_checkpoint(checkpoint_path, {
+                "checkpoint_version": 2,
+                "step3_completed": True,
+                "language": language,
+                "total_duration": total_duration,
+                "refined_segments": [[s, e] for s, e in refined_segments],
+                "segment_texts": [{
+                    "text": st["text"],
+                    "language": st["language"],
+                    "lang_prob": st["lang_prob"],
+                } for st in segment_texts],
+                "file_uuid": file_uuid,
+                "max_speakers": max_speakers,
+                "quality_threshold": quality_threshold,
+            })
+            print(f"[Checkpoint] Step 3 complete, saved to {checkpoint_path}")
+
+        # ── Step 4: ECAPA-TDNN per refined segment ──
+        print("\n[Step 4] Speaker embedding extraction...")
+        t4 = time.time()
+        audio_segments = []
+        for start_sec, end_sec in refined_segments:
+            s = int(start_sec * sample_rate)
+            e = int(end_sec * sample_rate)
+            audio_segments.append(wav[s:min(e, len(wav))])
+
+        from speaker_encoder import extract_speaker_embeddings_batch, normalize_embeddings
        embeddings = extract_speaker_embeddings_batch(
            self.speaker_encoder, audio_segments, sample_rate
        )
        embeddings = normalize_embeddings(embeddings)
-        step2_time = time.time() - step2_start
-        print(f"  Embedding shape: {embeddings.shape}")
-        print(f"  Embedding time: {step2_time:.2f}s")
+        print(f"  Embeddings: {embeddings.shape}")
+        print(f"  Step 4 time: {time.time() - t4:.2f}s")

-        # 聚類
-        print("\n[Step 3] Robust speaker clustering...")
-        step3_start = time.time()
+        # ── Step 5: AgglomerativeClustering ──
+        print("\n[Step 5] Speaker clustering...")
+        t5 = time.time()
+        from speaker_cluster_fixed import robust_speaker_clustering
        speaker_labels, estimated_n_speakers = robust_speaker_clustering(
-            embeddings, n_speakers=None, max_speakers=10
+            embeddings, n_speakers=None, max_speakers=max_speakers
        )
-        step3_time = time.time() - step3_start
-        print(f"  Clustering time: {step3_time:.2f}s")
+        print(f"  Speakers: {estimated_n_speakers}")
+        print(f"  Step 5 time: {time.time() - t5:.2f}s")

-        # 建立輸出
-        result = {
-            "audio_path": str(audio_path),
-            "total_duration": len(wav) / sample_rate,
-            "n_speech_segments": len(speech_segments),
-            "n_speakers": int(estimated_n_speakers),
-            "segments": []
-        }
+        # 品質計算
+        qualities = compute_embedding_quality(embeddings, speaker_labels)

-        for i, ((start, end), label) in enumerate(zip(speech_segments, speaker_labels)):
-            result["segments"].append({
-                "index": i,
-                "start": round(start, 3),
-                "end": round(end, 3),
-                "duration": round(end - start, 3),
-                "speaker": f"SPEAKER_{int(label)}"
-            })
-
-        # 加入 embeddings（每個 segment 對應的 192-D speaker embedding）
-        result["embeddings"] = []
-        for emb in embeddings:
-            result["embeddings"].append(emb.tolist())
+        # 建立輸出 segments
+        segments = []
+        for i, ((start_sec, end_sec), label) in enumerate(
+                zip(refined_segments, speaker_labels)):
+            seg = {
+                "start": round(start_sec, 3),
+                "end": round(end_sec, 3),
+                "start_frame": int(start_sec * 30),
+                "end_frame": int(end_sec * 30),
+                "text": segment_texts[i]["text"],
+                "language": segment_texts[i]["language"],
+                "lang_prob": segment_texts[i]["lang_prob"],
+                "speaker": f"SPEAKER_{int(label)}",
+                "speaker_id": f"SPEAKER_{int(label)}",
+                "quality": float(qualities[i]),
+            }
+            segments.append(seg)

        # 統計
        speaker_stats = {}
-        for seg in result["segments"]:
-            speaker = seg["speaker"]
-            if speaker not in speaker_stats:
-                speaker_stats[speaker] = {"count": 0, "duration": 0}
-            speaker_stats[speaker]["count"] += 1
-            speaker_stats[speaker]["duration"] += seg["duration"]
-        result["speaker_stats"] = speaker_stats
+        for seg in segments:
+            spk = seg["speaker_id"]
+            dur = seg["end"] - seg["start"]
+            if spk not in speaker_stats:
+                speaker_stats[spk] = {"count": 0, "duration": 0}
+            speaker_stats[spk]["count"] += 1
+            speaker_stats[spk]["duration"] += dur
+
+        result = {
+            "language": language or "",
+            "segments": segments,
+            "n_speakers": int(estimated_n_speakers),
+            "speaker_stats": speaker_stats,
+            "total_duration": total_duration,
+            "n_segments": len(segments),
+        }
+
+        # ── Step 6: Store embeddings in Qdrant ──
+        if file_uuid:
+            print("\n[Step 6] Storing embeddings in Qdrant...")
+            t6 = time.time()
+            self._store_speaker_embeddings(segments, embeddings, speaker_labels,
+                                           file_uuid)
+            print(f"  Step 6 time: {time.time() - t6:.2f}s")
+
+        # ── Step 7: High-quality classification ──
+        if file_uuid:
+            print("\n[Step 7] Classifying high-quality embeddings...")
+            t7 = time.time()
+            references = self._classify_high_quality_speakers(
+                segments, embeddings, speaker_labels, file_uuid,
+                wav, sample_rate, quality_threshold
+            )
+            if references:
+                result["references"] = references
+            print(f"  Step 7 time: {time.time() - t7:.2f}s")

        total_time = time.time() - start_time
        result["processing_time"] = round(total_time, 2)
-        result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
-
-        print("\n[SelfASRX-Fixed] Processing completed!")
-        print(f"  Total time: {total_time:.2f}s")
-        print(f"  Realtime factor: {result['realtime_factor']:.2f}x")
-        print(f"  Detected speakers: {estimated_n_speakers}")
+        if total_duration > 0:
+            result["realtime_factor"] = round(total_duration / total_time, 2)

+        # 保存輸出
        if output_path:
-            import json
-            with open(output_path, 'w', encoding='utf-8') as f:
+            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
-            print(f"  Results saved to: {output_path}")
+            print(f"\n[SelfASRX] Saved to: {output_path}")
+
+        print(f"\n[SelfASRX] Done! {len(segments)} segments, "
+              f"{estimated_n_speakers} speakers, "
+              f"{total_time:.2f}s")

-        print("=" * 60)
        return result

+    def resume_from_checkpoint(self, checkpoint_path, audio_path,
+                               output_path=None):
+        """Load the Step 1-3 results from a checkpoint, then run Steps 4-7.
+
+        Returns the result dict, or ``{"error": ..., "segments": []}`` when the
+        checkpoint is incomplete or its segment and text counts disagree.
+        """
+        print(f"\n[SelfASRX] Resuming from checkpoint: {checkpoint_path}")
+        print("=" * 60)
+
+        with open(checkpoint_path, "r", encoding="utf-8") as f:
+            cp = json.load(f)
+
+        if not cp.get("step3_completed"):
+            error_msg = f"Checkpoint step3 not completed (progress: {cp.get('step3_progress', '?')})"
+            print(f"[SelfASRX] {error_msg}")
+            return {"error": error_msg, "segments": []}
+
+        refined_segments = [tuple(s) for s in cp["refined_segments"]]
+        segment_texts = cp["segment_texts"]
+        # Fail early: a count mismatch would otherwise raise IndexError (or
+        # silently ignore texts) only after the expensive embedding steps.
+        if len(segment_texts) != len(refined_segments):
+            error_msg = f"Checkpoint has {len(refined_segments)} segments but {len(segment_texts)} texts"
+            print(f"[SelfASRX] {error_msg}")
+            return {"error": error_msg, "segments": []}
+
+        language = cp.get("language", "")
+        total_duration = cp.get("total_duration", 0)
+        file_uuid = cp.get("file_uuid")
+        max_speakers = cp.get("max_speakers", 10)
+        quality_threshold = cp.get("quality_threshold", 0.85)
+
+        wav, sample_rate = _load_audio(audio_path)
+        print(f"  Loaded checkpoint: {len(refined_segments)} segments, "
+              f"language={language}, duration={total_duration:.2f}s")
+
+        start_time = time.time()
+
+        # ── Step 4: ECAPA-TDNN per refined segment ──
+        print("\n[Step 4] Speaker embedding extraction...")
+        t4 = time.time()
+        audio_segments = [wav[int(s * sample_rate):min(int(e * sample_rate), len(wav))]
+                          for s, e in refined_segments]
+
+        from speaker_encoder import extract_speaker_embeddings_batch, normalize_embeddings
+        embeddings = extract_speaker_embeddings_batch(
+            self.speaker_encoder, audio_segments, sample_rate)
+        embeddings = normalize_embeddings(embeddings)
+        print(f"  Embeddings: {embeddings.shape}")
+        print(f"  Step 4 time: {time.time() - t4:.2f}s")
+
+        # ── Step 5: AgglomerativeClustering ──
+        print("\n[Step 5] Speaker clustering...")
+        t5 = time.time()
+        from speaker_cluster_fixed import robust_speaker_clustering
+        speaker_labels, estimated_n_speakers = robust_speaker_clustering(
+            embeddings, n_speakers=None, max_speakers=max_speakers)
+        print(f"  Speakers: {estimated_n_speakers}")
+        print(f"  Step 5 time: {time.time() - t5:.2f}s")
+
+        # Quality score for every segment embedding.
+        qualities = compute_embedding_quality(embeddings, speaker_labels)
+
+        # Build the output segments.
+        segments = []
+        for (start_sec, end_sec), text, label, quality in zip(
+                refined_segments, segment_texts, speaker_labels, qualities):
+            speaker = f"SPEAKER_{int(label)}"
+            segments.append({
+                "start": round(start_sec, 3),
+                "end": round(end_sec, 3),
+                "start_frame": int(start_sec * 30),
+                "end_frame": int(end_sec * 30),
+                "text": text["text"],
+                "language": text["language"],
+                "lang_prob": text["lang_prob"],
+                "speaker": speaker,
+                "speaker_id": speaker,
+                "quality": float(quality),
+            })
+
+        # Per-speaker segment count and accumulated duration.
+        speaker_stats = {}
+        for seg in segments:
+            stats = speaker_stats.setdefault(seg["speaker_id"], {"count": 0, "duration": 0})
+            stats["count"] += 1
+            stats["duration"] += seg["end"] - seg["start"]
+
+        result = {
+            "language": language or "",
+            "segments": segments,
+            "n_speakers": int(estimated_n_speakers),
+            "speaker_stats": speaker_stats,
+            "total_duration": total_duration,
+            "n_segments": len(segments),
+        }
+
+        # ── Step 6: Store embeddings in Qdrant ──
+        if file_uuid:
+            print("\n[Step 6] Storing embeddings in Qdrant...")
+            t6 = time.time()
+            self._store_speaker_embeddings(segments, embeddings, speaker_labels, file_uuid)
+            print(f"  Step 6 time: {time.time() - t6:.2f}s")
+
+        # ── Step 7: High-quality classification ──
+        if file_uuid:
+            print("\n[Step 7] Classifying high-quality embeddings...")
+            t7 = time.time()
+            references = self._classify_high_quality_speakers(
+                segments, embeddings, speaker_labels, file_uuid,
+                wav, sample_rate, quality_threshold)
+            if references:
+                result["references"] = references
+            print(f"  Step 7 time: {time.time() - t7:.2f}s")
+
+        total_time = time.time() - start_time
+        result["processing_time"] = round(total_time, 2)
+        if total_duration > 0 and total_time > 0:
+            result["realtime_factor"] = round(total_duration / total_time, 2)
+
+        # Save the output.
+        if output_path:
+            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+            with open(output_path, "w", encoding="utf-8") as f:
+                json.dump(result, f, indent=2, ensure_ascii=False)
+            print(f"\n[SelfASRX] Saved to: {output_path}")
+
+        print(f"\n[SelfASRX] Done! {len(segments)} segments, "
+              f"{estimated_n_speakers} speakers, {total_time:.2f}s")
+
+        return result
+
+    # ── Internal helpers ──
+
+    def _vad_scan_segment(self, wav, sample_rate, start_sec, end_sec):
+        """Run the VAD fine-grained scan over a single segment of ``wav``."""
+        from vad import scan_within_segment
+
+        span = (start_sec, end_sec)
+        vad = (self.vad_model, self.vad_utils)
+        return scan_within_segment(wav, sample_rate, *span, *vad)
+
+    def _transcribe_segment(self, wav, sample_rate, start_sec, end_sec):
+        """Transcribe a single segment via ``whisper_local.transcribe_segment``."""
+        from whisper_local import transcribe_segment as run_whisper
+        return run_whisper(wav, sample_rate, start_sec, end_sec, self.whisper)
+
+    def _store_speaker_embeddings(self, segments, embeddings, labels, file_uuid):
+        """Step 6: upsert every segment embedding into Qdrant.
+
+        Each point carries the segment's ``speaker_id``; ``labels`` only takes
+        part in the zip and is otherwise not read.
+
+        Returns the ``_qdrant_upsert`` result, or ``False`` if Qdrant is unavailable.
+        """
+        if not self._ensure_qdrant():
+            return False
+
+        points = [{
+            "id": _hash_point_id(file_uuid, f"{i}"),
+            "vector": emb.tolist(),
+            "payload": {
+                "type": "speaker_embedding",
+                "file_uuid": file_uuid,
+                "speaker_id": seg["speaker_id"], "text": seg["text"],
+                "language": seg["language"],
+                "start_time": seg["start"], "end_time": seg["end"],
+            },
+        } for i, (seg, emb, _label) in enumerate(zip(segments, embeddings, labels))]
+
+        ok = _qdrant_upsert(self.qdrant_url, self.qdrant_api_key,
+                            self.qdrant_collection, points)
+        if ok:
+            print(f"  Stored {len(points)} speaker embeddings to Qdrant")
+        return ok
+
+    def _classify_high_quality_speakers(self, segments, embeddings, labels,
+                                        file_uuid, wav, sample_rate,
+                                        threshold=0.85):
+        """Step 7: per-speaker reference from high-quality embeddings + gender.
+
+        For each speaker label, embeddings with quality >= ``threshold`` are
+        averaged into a re-normalised centroid; gender is classified on that
+        speaker's best segment. Upserts happen only when Qdrant is available.
+
+        Returns:
+            List of ``{"speaker_id", "n_segments", "avg_quality", "gender"}``
+            dicts; empty when no embedding reaches ``threshold``.
+        """
+        labels = np.asarray(labels)  # the boolean masks below need an ndarray
+        qualities = compute_embedding_quality(embeddings, labels)
+        high_mask = qualities >= threshold
+
+        if not np.any(high_mask):
+            print("  No high-quality embeddings found")
+            return []
+
+        # Same availability gate as _store_speaker_embeddings.
+        qdrant_ok = self._ensure_qdrant()
+
+        references = []
+        for label in set(labels):
+            mask = (labels == label) & high_mask
+            if not np.any(mask):
+                continue
+            high_indices = np.flatnonzero(mask)
+            high_segs = [segments[i] for i in high_indices]
+            # Segment with the highest quality among this speaker's good ones.
+            best_seg = segments[high_indices[int(np.argmax(qualities[mask]))]]
+
+            centroid = np.mean(embeddings[mask], axis=0)
+            norm = np.linalg.norm(centroid)
+            if norm > 0:
+                centroid = centroid / norm
+
+            avg_quality = float(np.mean(qualities[mask]))
+            speaker_id = f"SPEAKER_{int(label)}"
+            ref_payload = {
+                "type": "speaker_reference",
+                "file_uuid": file_uuid,
+                "speaker_id": speaker_id,
+                "n_segments": int(np.sum(mask)),
+                "avg_quality": avg_quality,
+                "total_duration": round(sum(s["end"] - s["start"] for s in high_segs), 2),
+                "language": best_seg.get("language", ""),
+                "text_samples": [s["text"] for s in high_segs[:5] if s["text"]],
+                # Defaults, overwritten below when classification succeeds.
+                "gender": "unknown",
+                "gender_conf": 0.0,
+            }
+
+            # Gender classification on the audio of the best segment.
+            if self.gender_classifier is not None:
+                try:
+                    import torch
+                    first = int(best_seg["start"] * sample_rate)
+                    last = min(int(best_seg["end"] * sample_rate), len(wav))
+                    seg_tensor = torch.from_numpy(wav[first:last]).float().unsqueeze(0)
+                    out = self.gender_classifier.classify_batch(seg_tensor)
+                    probs = torch.softmax(out[0], dim=-1).squeeze().cpu().detach().numpy()
+                    if len(probs) >= 2:
+                        idx = int(np.argmax(probs))
+                        # NOTE(review): assumes class index 0 is "male" — confirm.
+                        ref_payload["gender"] = "male" if idx == 0 else "female"
+                        ref_payload["gender_conf"] = float(probs[idx])
+                except Exception as exc:
+                    print(f"[Gender] Classify error: {exc}")
+
+            if qdrant_ok:
+                _qdrant_upsert(self.qdrant_url, self.qdrant_api_key,
+                               self.qdrant_collection, [{
+                                   "id": _hash_point_id(file_uuid, f"ref_{label}"),
+                                   "vector": centroid.tolist(),
+                                   "payload": ref_payload,
+                               }])
+
+            references.append({
+                "speaker_id": speaker_id,
+                "n_segments": ref_payload["n_segments"],
+                "avg_quality": avg_quality,
+                "gender": ref_payload["gender"],
+            })
+            print(f"  Ref: {speaker_id}, gender={ref_payload['gender']}"
+                  f" ({ref_payload['gender_conf']:.2f}), q={avg_quality:.3f}")
+
+        return references
+
+    def _ensure_qdrant(self):
+        """Ensure the Qdrant collection is usable; success is cached in ``_qdrant_ok``."""
+        if self._qdrant_ok:
+            return self._qdrant_ok
+        # Not confirmed yet (or the last attempt failed): try again.
+        self._qdrant_ok = _ensure_speaker_collection(
+            self.qdrant_url, self.qdrant_api_key, self.qdrant_collection)
+        return self._qdrant_ok
+

 def main():
     import argparse
-    
-    parser = argparse.ArgumentParser(description="Self-implemented ASRX (Fixed)")
-    parser.add_argument("audio_path", help="Path to audio file")
+    parser = argparse.ArgumentParser(description="SelfASRX - Hybrid Speaker Diarization")
+    parser.add_argument("audio_path", help="Path to audio file (WAV)")
     parser.add_argument("-o", "--output", help="Output JSON path")
-    parser.add_argument("--min-speech-duration", type=int, default=500)
-    parser.add_argument("--n-speakers", type=int, default=None)
+    parser.add_argument("--file-uuid", help="File UUID for Qdrant storage")
     parser.add_argument("--max-speakers", type=int, default=10)
-    
+    parser.add_argument("--quality-threshold", type=float, default=0.85)
+    parser.add_argument("--resume", help="Checkpoint path to resume from")
+    parser.add_argument("--checkpoint", help="Save checkpoint path after Step 3")
     args = parser.parse_args()
-    
-    if not Path(args.audio_path).exists():
-        print(f"Error: Audio file not found: {args.audio_path}")
-        sys.exit(1)
-    
+
+    # Validate inputs before building the pipeline; both modes read the audio.
+    if not Path(args.audio_path).exists():
+        print(f"Error: Audio file not found: {args.audio_path}")
+        sys.exit(1)
+    if args.resume and not Path(args.resume).exists():
+        print(f"Error: Checkpoint not found: {args.resume}")
+        sys.exit(1)
+
     asrx = SelfASRXFixed()
-    result = asrx.process(
-        args.audio_path,
-        args.output,
-        min_speech_duration_ms=args.min_speech_duration,
-        n_speakers=args.n_speakers,
-        max_speakers=args.max_speakers
-    )
-    
+
+    if args.resume:
+        result = asrx.resume_from_checkpoint(
+            args.resume, args.audio_path, output_path=args.output,
+        )
+    else:
+        result = asrx.process(
+            args.audio_path,
+            output_path=args.output,
+            file_uuid=args.file_uuid,
+            max_speakers=args.max_speakers,
+            quality_threshold=args.quality_threshold,
+            checkpoint_path=args.checkpoint,
+        )
+
     if "error" not in result:
         print("\n[Summary]")
-        print(f"  Audio duration: {result['total_duration']:.2f}s")
-        print(f"  Speech segments: {result['n_speech_segments']}")
-        print(f"  Detected speakers: {result['n_speakers']}")
-        print(f"  Processing time: {result['processing_time']:.2f}s")
-        print(f"  Realtime factor: {result['realtime_factor']:.2f}x")
-        
-        print("\n[Speaker Statistics]")
-        for speaker, stats in result['speaker_stats'].items():
-            pct = stats['duration'] / result['total_duration'] * 100
-            print(f"  {speaker}: {stats['count']} segments, " +
-                  f"{stats['duration']:.2f}s ({pct:.1f}%)")
+        print(f"  Duration: {result['total_duration']:.2f}s")
+        print(f"  Segments: {result['n_segments']}")
+        print(f"  Speakers: {result['n_speakers']}")
+        if "references" in result:
+            for ref in result["references"]:
+                print(f"  {ref['speaker_id']}: gender={ref['gender']}, "
+                      f"quality={ref['avg_quality']:.3f}")


 if __name__ == "__main__":
--- a/scripts/asrx_self/speaker_audio_player.py
+++ b/scripts/asrx_self/speaker_audio_player.py
@@ -1,280 +0,0 @@
-#!/opt/homebrew/bin/python3.11
-"""
-Speaker Audio Player - 說話人語音播放器
-從 ASRX 結果中提取並播放每個說話人的語音片段
-"""
-
-import json
-import argparse
-import subprocess
-import tempfile
-import os
-from pathlib import Path
-from typing import List, Dict
-
-
-def load_asrx_result(result_path: str) -> Dict:
-    """載入 ASRX 結果"""
-    with open(result_path, "r", encoding="utf-8") as f:
-        return json.load(f)
-
-
-def extract_audio_segment(
-    audio_path: str, start_sec: float, end_sec: float, output_path: str
-) -> bool:
-    """
-    使用 ffmpeg 提取音頻片段
-
-    Args:
-        audio_path: 原始音頻路徑
-        start_sec: 開始時間（秒）
-        end_sec: 結束時間（秒）
-        output_path: 輸出路徑
-
-    Returns:
-        bool: 是否成功
-    """
-    duration = end_sec - start_sec
-
-    cmd = [
-        "ffmpeg",
-        "-y",
-        "-i",
-        audio_path,
-        "-ss",
-        str(start_sec),
-        "-t",
-        str(duration),
-        "-acodec",
-        "pcm_s16le",
-        "-ar",
-        "16000",
-        "-ac",
-        "1",
-        output_path,
-    ]
-
-    try:
-        result = subprocess.run(cmd, capture_output=True, text=True)
-        return result.returncode == 0
-    except Exception as e:
-        print(f"Error extracting audio: {e}")
-        return False
-
-
-def play_audio(audio_path: str) -> bool:
-    """
-    播放音頻文件
-
-    使用 macOS 的 afplay 或 Linux 的 aplay
-    """
-    try:
-        # 嘗試使用 afplay (macOS)
-        if os.path.exists("/usr/bin/afplay"):
-            subprocess.run(["afplay", audio_path], check=True)
-        # 嘗試使用 aplay (Linux)
-        elif os.path.exists("/usr/bin/aplay"):
-            subprocess.run(["aplay", audio_path], check=True)
-        else:
-            print(
-                "No audio player found. Please install afplay (macOS) or aplay (Linux)"
-            )
-            return False
-        return True
-    except Exception as e:
-        print(f"Error playing audio: {e}")
-        return False
-
-
-def group_segments_by_speaker(segments: List[Dict]) -> Dict[str, List[Dict]]:
-    """將語音片段按說話人分組"""
-    speaker_segments = {}
-
-    for seg in segments:
-        speaker = seg["speaker"]
-        if speaker not in speaker_segments:
-            speaker_segments[speaker] = []
-        speaker_segments[speaker].append(seg)
-
-    # 按開始時間排序
-    for speaker in speaker_segments:
-        speaker_segments[speaker].sort(key=lambda x: x["start"])
-
-    return speaker_segments
-
-
-def play_speaker_segments(
-    audio_path: str,
-    result_path: str,
-    speaker_id: str = None,
-    limit: int = None,
-    temp_dir: str = None,
-):
-    """
-    播放指定說話人的語音片段
-
-    Args:
-        audio_path: 原始音頻路徑
-        result_path: ASRX 結果 JSON 路徑
-        speaker_id: 說話人 ID（None=播放所有）
-        limit: 最多播放幾個片段（None=全部）
-        temp_dir: 臨時目錄
-    """
-    # 載入結果
-    print(f"[Load] Loading ASRX result: {result_path}")
-    result = load_asrx_result(result_path)
-
-    segments = result.get("segments", [])
-    total_duration = result.get("total_duration", 0)
-
-    print(f"[Info] Total segments: {len(segments)}")
-    print(f"[Info] Total duration: {total_duration / 60:.1f} minutes")
-
-    # 分組
-    speaker_segments = group_segments_by_speaker(segments)
-
-    # 選擇說話人
-    if speaker_id:
-        speakers_to_play = [speaker_id]
-    else:
-        speakers_to_play = sorted(speaker_segments.keys())
-
-    # 創建臨時目錄
-    if temp_dir is None:
-        temp_dir = tempfile.mkdtemp(prefix="speaker_audio_")
-
-    print(f"\n[Info] Temp directory: {temp_dir}")
-    print(f"[Info] Speakers to play: {speakers_to_play}")
-    print("=" * 60)
-
-    # 播放每個說話人的片段
-    for speaker in speakers_to_play:
-        if speaker not in speaker_segments:
-            print(f"\n[Warning] Speaker {speaker} not found!")
-            continue
-
-        segs = speaker_segments[speaker]
-        if limit:
-            segs = segs[:limit]
-
-        print(f"\n▶️  {speaker} ({len(segs)} segments)")
-        print("-" * 60)
-
-        for i, seg in enumerate(segs, 1):
-            start = seg["start"]
-            end = seg["end"]
-            duration = seg["duration"]
-
-            # 提取音頻
-            temp_audio = os.path.join(temp_dir, f"{speaker}_{i:03d}.wav")
-
-            print(
-                f"  [{i:3d}] {start:7.2f}s - {end:7.2f}s ({duration:5.2f}s) ... ",
-                end="",
-                flush=True,
-            )
-
-            if extract_audio_segment(audio_path, start, end, temp_audio):
-                print("✅", end="", flush=True)
-
-                # 播放
-                if play_audio(temp_audio):
-                    print(" ▶️  Played")
-                else:
-                    print(" ❌ Play failed")
-            else:
-                print(" ❌ Extract failed")
-
-        print()
-
-
-def show_speaker_stats(result_path: str):
-    """顯示說話人統計資訊"""
-    result = load_asrx_result(result_path)
-
-    segments = result.get("segments", [])
-    speaker_segments = group_segments_by_speaker(segments)
-
-    print("\n" + "=" * 60)
-    print("說話人統計")
-    print("=" * 60)
-
-    # 按時長排序
-    speaker_stats = []
-    for speaker, segs in speaker_segments.items():
-        total_duration = sum(seg["duration"] for seg in segs)
-        speaker_stats.append((speaker, len(segs), total_duration))
-
-    speaker_stats.sort(key=lambda x: x[2], reverse=True)
-
-    total_duration = result.get("total_duration", 0)
-
-    for speaker, count, duration in speaker_stats:
-        pct = duration / total_duration * 100 if total_duration > 0 else 0
-        print(f"{speaker:12} {count:4} segments  {duration:8.1f}s  ({pct:5.1f}%)")
-
-    print("=" * 60)
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Speaker Audio Player - 播放說話人語音片段",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-Examples:
-  # 顯示說話人統計
-  python3 speaker_audio_player.py --stats result.json
-  
-  # 播放所有說話人的前 3 個片段
-  python3 speaker_audio_player.py audio.wav result.json --limit 3
-  
-  # 播放特定說話人的所有片段
-  python3 speaker_audio_player.py audio.wav result.json --speaker SPEAKER_0
-  
-  # 播放 SPEAKER_1 的前 5 個片段
-  python3 speaker_audio_player.py audio.wav result.json --speaker SPEAKER_1 --limit 5
-        """,
-    )
-
-    parser.add_argument("audio_path", nargs="?", help="原始音頻文件路徑")
-    parser.add_argument("result_path", help="ASRX 結果 JSON 路徑")
-    parser.add_argument("--stats", action="store_true", help="只显示說話人統計")
-    parser.add_argument("--speaker", type=str, help="指定說話人 ID（如 SPEAKER_0）")
-    parser.add_argument(
-        "--limit",
-        type=int,
-        default=None,
-        help="每個說話人最多播放幾個片段（None=全部）",
-    )
-    parser.add_argument("--temp-dir", type=str, default=None, help="臨時目錄路徑")
-
-    args = parser.parse_args()
-
-    if args.stats:
-        show_speaker_stats(args.result_path)
-        return
-
-    if not args.audio_path:
-        print("Error: audio_path is required unless --stats is specified")
-        parser.print_help()
-        return
-
-    if not Path(args.audio_path).exists():
-        print(f"Error: Audio file not found: {args.audio_path}")
-        return
-
-    if not Path(args.result_path).exists():
-        print(f"Error: Result file not found: {args.result_path}")
-        return
-
-    play_speaker_segments(
-        args.audio_path,
-        args.result_path,
-        speaker_id=args.speaker,
-        limit=args.limit,
-        temp_dir=args.temp_dir,
-    )
-
-
-if __name__ == "__main__":
-    main()
--- a/scripts/asrx_self/speaker_classifier.py
+++ b/scripts/asrx_self/speaker_classifier.py
@@ -0,0 +1,65 @@
+"""
+Speaker Classifier - 聲紋品質評估與性別分類
+
+提供品質計算與性別分類功能，作為 main_fixed.py 的輔助模組。
+"""
+
+import numpy as np
+
+
+def compute_embedding_quality(embeddings, labels):
+    """Cosine similarity between each embedding and the centroid of its cluster.
+
+    Args:
+        embeddings: [n_segments, dim] speaker-embedding matrix.
+        labels: [n_segments] cluster labels (ndarray or plain sequence).
+
+    Returns:
+        qualities: [n_segments] cosine similarities in [-1, 1]; 0.0 where the
+            embedding or its cluster centroid has zero norm.
+    """
+    embeddings = np.asarray(embeddings, dtype=float)
+    labels = np.asarray(labels)
+    qualities = np.zeros(len(labels), dtype=float)
+    if qualities.size == 0:
+        return qualities
+
+    # Unit-normalise all rows once; zero rows stay zero instead of dividing by 0.
+    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+    unit = np.divide(embeddings, norms, out=np.zeros_like(embeddings), where=norms > 0)
+
+    for label in np.unique(labels):
+        mask = labels == label
+        centroid = embeddings[mask].mean(axis=0)
+        norm = np.linalg.norm(centroid)
+        if norm > 0:
+            qualities[mask] = unit[mask] @ (centroid / norm)
+    return qualities
+
+
+def classify_gender(audio_wav, sample_rate, classifier):
+    """Classify the speaker's gender from one audio segment (best effort).
+
+    Args:
+        audio_wav: Audio waveform (numpy array).
+        sample_rate: Sampling rate; not read by this function.
+        classifier: SpeechBrain EncoderClassifier (gender-recognition-ecapa)
+
+    Returns:
+        dict: {"gender": "male"|"female"|"unknown", "confidence": float}
+    """
+    default = {"gender": "unknown", "confidence": 0.0}
+    if classifier is None or audio_wav is None or len(audio_wav) == 0:
+        return default
+    try:
+        import torch
+        seg_tensor = torch.from_numpy(audio_wav).float().unsqueeze(0)
+        out = classifier.classify_batch(seg_tensor)
+        probs = torch.softmax(out[0], dim=-1).squeeze().cpu().detach().numpy()
+        if len(probs) >= 2:
+            idx = int(np.argmax(probs))
+            label = "male" if idx == 0 else "female"  # NOTE(review): assumes class 0 is "male" — confirm
+            return {"gender": label, "confidence": float(probs[idx])}
+    except Exception as e:
+        print(f"[Gender] Classify error: {e}")  # report instead of swallowing silently
+    return default
--- a/scripts/asrx_self/speaker_cluster.py
+++ b/scripts/asrx_self/speaker_cluster.py
@@ -1,310 +0,0 @@
-#!/opt/homebrew/bin/python3.11
-"""
-Speaker Clustering - 說話人聚類
-使用譜聚類算法將聲紋嵌入分組
-
-技術來源:
- 譜聚類：Shi & Malik (2000), IEEE TPAMI
- 論文：https://ieeexplore.ieee.org/document/868688
- 應用於說話人分離：Wooters & Huijbregts (2008), ICASSP
-"""
-
-import numpy as np
-from sklearn.cluster import SpectralClustering, AgglomerativeClustering
-from sklearn.metrics.pairwise import cosine_similarity
-
-
-def estimate_n_speakers_eigengap(similarity_matrix, max_speakers=10):
-    """
-    使用特徵值間隙方法估計說話人數量
-
-    技術來源:
-    - 特徵值間隙理論：Lu et al. (2010)
-    - 原理：相似度矩陣的特徵值分佈中，最大間隙對應最佳聚類數
-
-    Args:
-        similarity_matrix: 相似度矩陣 [n, n]
-        max_speakers: 最大說話人數
-
-    Returns:
-        n_speakers: 估計的說話人數量
-    """
-    # 計算特徵值
-    eigenvalues = np.linalg.eigvalsh(similarity_matrix)
-
-    # 降序排列
-    eigenvalues = np.sort(eigenvalues)[::-1]
-
-    # 只考慮前 max_speakers 個特徵值
-    eigenvalues = eigenvalues[:max_speakers]
-
-    # 計算間隙
-    gaps = np.diff(eigenvalues)
-
-    # 找到最大間隙的位置
-    if len(gaps) > 0:
-        n_speakers = np.argmax(np.abs(gaps)) + 1
-    else:
-        n_speakers = 1
-
-    # 限制範圍
-    n_speakers = max(2, min(n_speakers, max_speakers))
-
-    return n_speakers
-
-
-def estimate_n_speakers_silhouette(embeddings, max_speakers=10):
-    """
-    使用輪廓係數估計說話人數量
-
-    Args:
-        embeddings: 嵌入矩陣 [n, d]
-        max_speakers: 最大說話人數
-
-    Returns:
-        n_speakers: 估計的說話人數量
-    """
-    from sklearn.metrics import silhouette_score
-
-    best_score = -1
-    best_n = 2
-
-    for n in range(2, min(max_speakers + 1, len(embeddings))):
-        clustering = AgglomerativeClustering(n_clusters=n)
-        labels = clustering.fit_predict(embeddings)
-
-        if len(np.unique(labels)) > 1:
-            score = silhouette_score(embeddings, labels)
-            if score > best_score:
-                best_score = score
-                best_n = n
-
-    return best_n
-
-
-def spectral_clustering_speaker(
-    similarity_matrix, n_speakers=None, auto_estimate=True, max_speakers=10
-):
-    """
-    使用譜聚類進行說話人分離
-
-    Args:
-        similarity_matrix: 相似度矩陣 [n, n]
-        n_speakers: 說話人數量（可選，如果為 None 則自動估計）
-        auto_estimate: 是否自動估計說話人數量
-        max_speakers: 最大說話人數
-
-    Returns:
-        speaker_labels: 說話人標籤 [n,]
-        n_speakers: 使用的說話人數量
-    """
-    n_segments = len(similarity_matrix)
-
-    # 清洗相似度矩陣
-    similarity_matrix = np.nan_to_num(
-        similarity_matrix, nan=0.5, posinf=1.0, neginf=-1.0
-    )
-
-    # 確保對角線為 1
-    np.fill_diagonal(similarity_matrix, 1.0)
-
-    # 確保值在 [-1, 1] 範圍
-    similarity_matrix = np.clip(similarity_matrix, -1.0, 1.0)
-
-    # 自動估計說話人數量
-    if n_speakers is None and auto_estimate:
-        n_speakers = estimate_n_speakers_eigengap(
-            similarity_matrix, max_speakers=max_speakers
-        )
-        print(f"[Clustering] Estimated n_speakers: {n_speakers}")
-
-    if n_speakers is None:
-        n_speakers = 2  # 預設值
-
-    # 確保 n_speakers 不超過樣本數
-    n_speakers = min(n_speakers, n_segments)
-
-    print(f"[Clustering] Running spectral clustering with {n_speakers} clusters...")
-
-    # 譜聚類
-    try:
-        clustering = SpectralClustering(
-            n_clusters=int(n_speakers),
-            affinity="precomputed",
-            assign_labels="kmeans",
-            random_state=42,
-            n_init=10,
-        )
-
-        speaker_labels = clustering.fit_predict(similarity_matrix)
-
-        print("[Clustering] Spectral clustering completed")
-        print(f"[Clustering] n_speakers: {n_speakers}")
-        print(f"[Clustering] n_segments: {n_segments}")
-
-        return speaker_labels, n_speakers
-
-    except Exception as e:
-        print(f"[Clustering] Spectral clustering failed: {e}")
-        print("[Clustering] Using fallback: 2 speakers")
-        # 簡單分配：前一半是 SPEAKER_0，後一半是 SPEAKER_1
-        speaker_labels = np.array(
-            [0] * (n_segments // 2) + [1] * (n_segments - n_segments // 2)
-        )
-        return speaker_labels, 2
-
-
-def agglomerative_clustering_speaker(
-    embeddings, n_speakers=None, threshold=0.5, max_speakers=10
-):
-    """
-    使用層次聚類進行說話人分離
-
-    Args:
-        embeddings: 嵌入矩陣 [n, d]
-        n_speakers: 說話人數量（可選）
-        threshold: 距離閾值（用於自動決定聚類數）
-        max_speakers: 最大說話人數
-
-    Returns:
-        speaker_labels: 說話人標籤 [n,]
-        n_speakers: 使用的說話人數量
-    """
-    n_segments = len(embeddings)
-
-    if n_speakers is None:
-        # 使用距離閾值自動決定
-        from sklearn.metrics.pairwise import cosine_distances
-
-        distances = cosine_distances(embeddings)
-
-        # 計算平均最近鄰距離
-        avg_distances = []
-        for i in range(min(100, n_segments)):
-            dists = distances[i]
-            dists = np.sort(dists)
-            if len(dists) > 1:
-                avg_distances.append(dists[1])  # 最近鄰（排除自己）
-
-        if avg_distances:
-            avg_dist = np.mean(avg_distances)
-            # 根據平均距離估計聚類數
-            n_speakers = max(2, int(avg_dist / threshold))
-            n_speakers = min(n_speakers, max_speakers)
-        else:
-            n_speakers = 2
-
-    n_speakers = min(n_speakers, n_segments)
-
-    # 層次聚類
-    clustering = AgglomerativeClustering(
-        n_clusters=n_speakers, metric="cosine", linkage="average"
-    )
-
-    speaker_labels = clustering.fit_predict(embeddings)
-
-    print("[Clustering] Agglomerative clustering completed")
-    print(f"[Clustering] n_speakers: {n_speakers}")
-
-    return speaker_labels, n_speakers
-
-
-def smooth_speaker_labels(speaker_labels, window_size=5):
-    """
-    平滑說話人標籤（去除噪聲）
-
-    Args:
-        speaker_labels: 原始說話人標籤
-        window_size: 平滑窗口大小
-
-    Returns:
-        smoothed_labels: 平滑後的標籤
-    """
-    from scipy import stats
-
-    smoothed = np.copy(speaker_labels)
-    half_window = window_size // 2
-
-    for i in range(len(speaker_labels)):
-        start = max(0, i - half_window)
-        end = min(len(speaker_labels), i + half_window + 1)
-
-        window_labels = speaker_labels[start:end]
-        mode_result = stats.mode(window_labels, keepdims=True)
-        smoothed[i] = mode_result.mode[0]
-
-    return smoothed
-
-
-def compute_diarization_purity(speaker_labels, ground_truth_labels=None):
-    """
-    計算說話人分離純度（如果有 ground truth）
-
-    Args:
-        speaker_labels: 預測的說話人標籤
-        ground_truth_labels: 真實的說話人標籤（可選）
-
-    Returns:
-        purity: 純度分數（0-1）
-    """
-    if ground_truth_labels is None:
-        # 沒有 ground truth，使用聚類純度近似
-
-        # 使用餘弦相似度作為距離
-        purity = 0.5  # 預設值
-    else:
-        # 計算純度
-        from sklearn.metrics import adjusted_rand_score
-
-        purity = adjusted_rand_score(ground_truth_labels, speaker_labels)
-
-    return purity
-
-
-if __name__ == "__main__":
-    # 測試聚類算法
-    print("[Test] Testing speaker clustering algorithms")
-
-    # 生成模擬數據
-    np.random.seed(42)
-    n_speakers = 3
-    n_segments_per_speaker = 20
-
-    # 生成 3 個說話人的嵌入
-    embeddings = []
-    for i in range(n_speakers):
-        # 每個說話人有不同的中心
-        center = np.random.randn(192) * 2 + i * 3
-        # 添加噪聲
-        for _ in range(n_segments_per_speaker):
-            emb = center + np.random.randn(192) * 0.5
-            embeddings.append(emb)
-
-    embeddings = np.array(embeddings)
-    print(f"[Test] Generated {len(embeddings)} embeddings for {n_speakers} speakers")
-
-    # 計算相似度矩陣
-    similarity = cosine_similarity(embeddings)
-    print(f"[Test] Similarity matrix shape: {similarity.shape}")
-
-    # 估計說話人數量
-    estimated_n = estimate_n_speakers_eigengap(similarity, max_speakers=10)
-    print(f"[Test] Estimated n_speakers (eigengap): {estimated_n}")
-
-    estimated_n_silhouette = estimate_n_speakers_silhouette(embeddings, max_speakers=10)
-    print(f"[Test] Estimated n_speakers (silhouette): {estimated_n_silhouette}")
-
-    # 譜聚類
-    labels, n_clusters = spectral_clustering_speaker(
-        similarity, n_speakers=None, auto_estimate=True
-    )
-
-    print("\n[Test] Clustering results:")
-    print(f"  True n_speakers: {n_speakers}")
-    print(f"  Estimated n_speakers: {n_clusters}")
-    print(f"  Unique labels: {np.unique(labels)}")
-
-    # 計算每個聚類的大小
-    for label in np.unique(labels):
-        count = np.sum(labels == label)
-        print(f"  Cluster {label}: {count} segments")
--- a/scripts/asrx_self/speaker_player_gui.py
+++ b/scripts/asrx_self/speaker_player_gui.py
@@ -1,431 +0,0 @@
-#!/opt/homebrew/bin/python3.11
-"""
-Speaker Player GUI - 說話人語音播放器（圖形界面）
-使用 tkinter 顯示播放進度和 Speaker ID
-"""
-
-import json
-import subprocess
-import tempfile
-import os
-import threading
-import time
-from pathlib import Path
-
-try:
-    import tkinter as tk
-    from tkinter import ttk, filedialog, messagebox
-
-    HAS_TKINTER = True
-except ImportError:
-    HAS_TKINTER = False
-
-
-class SpeakerPlayerGUI:
-    """說話人語音播放器 GUI"""
-
-    def __init__(self, root):
-        self.root = root
-        self.root.title("🎬 Speaker Audio Player - Face Integration")
-        self.root.geometry("1100x800")
-
-        # 數據
-        self.audio_path = None
-        self.result_path = None
-        self.face_path = None
-        self.result_data = None
-        self.face_data = None
-        self.integrated_data = None
-        self.speaker_segments = {}
-        self.speakers = []
-        self.current_speaker_idx = 0
-        self.is_playing = False
-        self.stop_flag = False
-
-        # 創建界面
-        self.create_widgets()
-
-    def create_widgets(self):
-        """創建界面組件"""
-        # 頂部：文件選擇
-        top_frame = ttk.Frame(self.root, padding="10")
-        top_frame.pack(fill=tk.X)
-
-        ttk.Label(top_frame, text="📁 Audio:").pack(side=tk.LEFT)
-        self.audio_label = ttk.Label(top_frame, text="未選擇", width=50)
-        self.audio_label.pack(side=tk.LEFT, padx=5)
-        ttk.Button(top_frame, text="選擇音頻", command=self.select_audio).pack(
-            side=tk.LEFT, padx=5
-        )
-
-        ttk.Label(top_frame, text="  📊 Result:").pack(side=tk.LEFT, padx=(20, 0))
-        self.result_label = ttk.Label(top_frame, text="未選擇", width=50)
-        self.result_label.pack(side=tk.LEFT, padx=5)
-        ttk.Button(top_frame, text="選擇結果", command=self.select_result).pack(
-            side=tk.LEFT, padx=5
-        )
-
-        # 中間：說話人列表和片段列表
-        mid_frame = ttk.Frame(self.root, padding="10")
-        mid_frame.pack(fill=tk.BOTH, expand=True)
-
-        # 左側：說話人列表
-        left_frame = ttk.LabelFrame(mid_frame, text="📢 說話人列表", padding="10")
-        left_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=False)
-
-        self.speaker_listbox = tk.Listbox(
-            left_frame, width=35, height=20, font=("Arial", 11)
-        )
-        self.speaker_listbox.pack(fill=tk.BOTH, expand=True)
-        self.speaker_listbox.bind("<<ListboxSelect>>", self.on_speaker_select)
-
-        # 右側：片段列表
-        right_frame = ttk.LabelFrame(mid_frame, text="🎵 語音片段", padding="10")
-        right_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=10)
-
-        # 片段列表（带滚动条）
-        list_frame = ttk.Frame(right_frame)
-        list_frame.pack(fill=tk.BOTH, expand=True)
-
-        scrollbar = ttk.Scrollbar(list_frame)
-        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
-
-        self.segment_listbox = tk.Listbox(
-            list_frame,
-            width=50,
-            height=20,
-            font=("Courier", 10),
-            yscrollcommand=scrollbar.set,
-        )
-        self.segment_listbox.pack(fill=tk.BOTH, expand=True)
-        scrollbar.config(command=self.segment_listbox.yview)
-
-        self.segment_listbox.bind("<Double-Button-1>", self.on_segment_double_click)
-
-        # 底部：播放控制和進度
-        bottom_frame = ttk.Frame(self.root, padding="10")
-        bottom_frame.pack(fill=tk.X)
-
-        # 播放控制
-        control_frame = ttk.Frame(bottom_frame)
-        control_frame.pack(fill=tk.X)
-
-        self.play_button = ttk.Button(
-            control_frame, text="▶️ 播放所選", command=self.play_selected, width=15
-        )
-        self.play_button.pack(side=tk.LEFT, padx=5)
-
-        self.stop_button = ttk.Button(
-            control_frame, text="⏹️ 停止", command=self.stop_playing, width=10
-        )
-        self.stop_button.pack(side=tk.LEFT, padx=5)
-        self.stop_button.config(state=tk.DISABLED)
-
-        self.play_all_button = ttk.Button(
-            control_frame, text="▶️▶️ 播放全部", command=self.play_all, width=15
-        )
-        self.play_all_button.pack(side=tk.LEFT, padx=5)
-
-        # 進度條
-        progress_frame = ttk.Frame(bottom_frame)
-        progress_frame.pack(fill=tk.X, pady=(10, 0))
-
-        ttk.Label(progress_frame, text="⏱️ 進度:").pack(side=tk.LEFT)
-        self.progress_bar = ttk.Progressbar(progress_frame, mode="determinate")
-        self.progress_bar.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=10)
-
-        self.progress_label = ttk.Label(progress_frame, text="0:00 / 0:00", width=20)
-        self.progress_label.pack(side=tk.LEFT)
-
-        # 狀態欄
-        self.status_label = ttk.Label(
-            bottom_frame, text="就緒", relief=tk.SUNKEN, anchor=tk.W
-        )
-        self.status_label.pack(fill=tk.X, pady=(10, 0))
-
-    def select_audio(self):
-        """選擇音頻文件"""
-        filename = filedialog.askopenfilename(
-            title="選擇音頻文件",
-            filetypes=[("WAV files", "*.wav"), ("All files", "*.*")],
-        )
-        if filename:
-            self.audio_path = filename
-            self.audio_label.config(text=Path(filename).name)
-            self.check_ready()
-
-    def select_result(self):
-        """選擇結果文件"""
-        filename = filedialog.askopenfilename(
-            title="選擇 ASRX 結果文件",
-            filetypes=[("JSON files", "*.json"), ("All files", "*.*")],
-        )
-        if filename:
-            self.result_path = filename
-            self.result_label.config(text=Path(filename).name)
-            self.load_result()
-            self.check_ready()
-
-    def load_result(self):
-        """載入 ASRX 結果"""
-        try:
-            with open(self.result_path, "r", encoding="utf-8") as f:
-                self.result_data = json.load(f)
-
-            # 分組
-            self.speaker_segments = {}
-            for seg in self.result_data.get("segments", []):
-                speaker = seg["speaker"]
-                if speaker not in self.speaker_segments:
-                    self.speaker_segments[speaker] = []
-                self.speaker_segments[speaker].append(seg)
-
-            # 排序
-            for speaker in self.speaker_segments:
-                self.speaker_segments[speaker].sort(key=lambda x: x["start"])
-
-            # 說話人列表（按時長排序）
-            self.speakers = sorted(
-                self.speaker_segments.keys(),
-                key=lambda s: sum(seg["duration"] for seg in self.speaker_segments[s]),
-                reverse=True,
-            )
-
-            # 更新列表框
-            self.speaker_listbox.delete(0, tk.END)
-            for speaker in self.speakers:
-                segs = self.speaker_segments[speaker]
-                total_dur = sum(seg["duration"] for seg in segs)
-                total_dur_min = total_dur / 60
-                self.speaker_listbox.insert(
-                    tk.END,
-                    f"🔊 {speaker:12} | {len(segs):4d}段 | {total_dur_min:5.1f}分鐘",
-                )
-
-            self.status_label.config(
-                text=f"載入成功：{len(self.speakers)} 個說話人，{len(self.result_data.get('segments', []))} 個片段"
-            )
-
-        except Exception as e:
-            messagebox.showerror("錯誤", f"載入結果文件失敗：{e}")
-            self.result_path = None
-            self.result_label.config(text="載入失敗")
-
-    def check_ready(self):
-        """檢查是否就緒"""
-        if self.audio_path and self.result_path:
-            self.status_label.config(text="✅ 就緒 - 請選擇說話人並播放")
-            self.play_button.config(state=tk.NORMAL)
-            self.play_all_button.config(state=tk.NORMAL)
-        else:
-            self.status_label.config(text="⚠️ 請選擇音頻和結果文件")
-            self.play_button.config(state=tk.DISABLED)
-            self.play_all_button.config(state=tk.DISABLED)
-
-    def on_speaker_select(self, event):
-        """說話人選擇事件"""
-        selection = self.speaker_listbox.curselection()
-        if not selection:
-            return
-
-        self.current_speaker_idx = selection[0]
-        speaker = self.speakers[self.current_speaker_idx]
-
-        # 更新片段列表
-        self.segment_listbox.delete(0, tk.END)
-        for i, seg in enumerate(self.speaker_segments[speaker], 1):
-            start = seg["start"]
-            end = seg["end"]
-            duration = seg["duration"]
-            self.segment_listbox.insert(
-                tk.END,
-                f"[{i:4d}] {speaker:12} | {start:7.2f}s - {end:7.2f}s ({duration:5.2f}s)",
-            )
-
-        self.status_label.config(
-            text=f"選擇：{speaker} - {len(self.speaker_segments[speaker])} 個片段"
-        )
-
-    def on_segment_double_click(self, event):
-        """片段雙擊事件"""
-        self.play_selected()
-
-    def extract_and_play(self, start_sec: float, end_sec: float) -> bool:
-        """提取並播放音頻"""
-        duration = end_sec - start_sec
-        temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-        temp_path = temp_file.name
-        temp_file.close()
-
-        try:
-            # 提取
-            cmd = [
-                "ffmpeg",
-                "-y",
-                "-loglevel",
-                "quiet",
-                "-i",
-                self.audio_path,
-                "-ss",
-                str(start_sec),
-                "-t",
-                str(duration),
-                "-acodec",
-                "pcm_s16le",
-                "-ar",
-                "16000",
-                "-ac",
-                "1",
-                temp_path,
-            ]
-
-            result = subprocess.run(cmd, capture_output=True)
-            if result.returncode != 0:
-                return False
-
-            # 播放
-            if os.path.exists("/usr/bin/afplay"):
-                subprocess.run(["afplay", temp_path], capture_output=True)
-            elif os.path.exists("/usr/bin/aplay"):
-                subprocess.run(["aplay", temp_path], capture_output=True)
-            else:
-                return False
-
-            return True
-        finally:
-            if os.path.exists(temp_path):
-                os.unlink(temp_path)
-
-    def play_segment(self, speaker: str, seg: dict, seg_idx: int, total: int):
-        """播放單個片段"""
-        if self.stop_flag:
-            return False
-
-        start = seg["start"]
-        end = seg["end"]
-        duration = seg["duration"]
-
-        # 更新 UI
-        self.root.after(
-            0,
-            lambda: self.status_label.config(
-                text=f"▶️  {speaker} [{seg_idx}/{total}] {start:.2f}s - {end:.2f}s"
-            ),
-        )
-
-        # 更新進度
-        progress = (seg_idx / total) * 100
-        self.root.after(0, lambda: self.progress_bar.config(value=progress))
-        self.root.after(
-            0, lambda: self.progress_label.config(text=f"{seg_idx}:{total}")
-        )
-
-        # 播放
-        if self.extract_and_play(start, end):
-            return True
-        else:
-            self.root.after(
-                0,
-                lambda: messagebox.showwarning(
-                    "警告", f"播放失敗：{speaker} [{seg_idx}]"
-                ),
-            )
-            return True
-
-    def play_selected(self):
-        """播放所選片段"""
-        selection = self.segment_listbox.curselection()
-        if not selection:
-            # 如果沒選擇，播放第一個
-            if self.speakers:
-                speaker = self.speakers[self.current_speaker_idx]
-                segs = self.speaker_segments[speaker]
-                if segs:
-                    self.play_all()
-            return
-
-        # 播放所選
-        seg_idx = selection[0]
-        speaker = self.speakers[self.current_speaker_idx]
-        seg = self.speaker_segments[speaker][seg_idx]
-
-        self.is_playing = True
-        self.stop_flag = False
-        self.play_button.config(state=tk.DISABLED)
-        self.stop_button.config(state=tk.NORMAL)
-
-        # 在後台線程播放
-        def play_thread():
-            success = self.play_segment(speaker, seg, seg_idx + 1, 1)
-            self.root.after(0, lambda: self.on_play_done())
-
-        thread = threading.Thread(target=play_thread, daemon=True)
-        thread.start()
-
-    def play_all(self):
-        """播放所選說話人的所有片段"""
-        if not self.speakers:
-            return
-
-        speaker = self.speakers[self.current_speaker_idx]
-        segs = self.speaker_segments[speaker]
-
-        if not segs:
-            return
-
-        self.is_playing = True
-        self.stop_flag = False
-        self.play_button.config(state=tk.DISABLED)
-        self.play_all_button.config(state=tk.DISABLED)
-        self.stop_button.config(state=tk.NORMAL)
-
-        # 在後台線程播放
-        def play_thread():
-            for i, seg in enumerate(segs, 1):
-                if self.stop_flag:
-                    break
-                self.play_segment(speaker, seg, i, len(segs))
-                time.sleep(0.3)  # 片段間隔
-
-            self.root.after(0, lambda: self.on_play_done())
-
-        thread = threading.Thread(target=play_thread, daemon=True)
-        thread.start()
-
-    def stop_playing(self):
-        """停止播放"""
-        self.stop_flag = True
-        self.is_playing = False
-        self.on_play_done()
-
-    def on_play_done(self):
-        """播放完成"""
-        self.is_playing = False
-        self.stop_flag = False
-        self.play_button.config(state=tk.NORMAL)
-        self.play_all_button.config(state=tk.NORMAL)
-        self.stop_button.config(state=tk.DISABLED)
-        self.progress_bar.config(value=0)
-        self.progress_label.config(text="0:00 / 0:00")
-
-        if self.stop_flag:
-            self.status_label.config(text="⏹️ 已停止")
-        else:
-            self.status_label.config(text="✅ 播放完成")
-
-
-def main():
-    """主函數"""
-    if not HAS_TKINTER:
-        print("❌ tkinter 未安裝")
-        print("請使用以下命令安裝:")
-        print("  brew install python-tk@3.9")
-        return
-
-    root = tk.Tk()
-    app = SpeakerPlayerGUI(root)
-    root.mainloop()
-
-
-if __name__ == "__main__":
-    main()
--- a/scripts/asrx_self/speaker_player_gui_face.py
+++ b/scripts/asrx_self/speaker_player_gui_face.py
@@ -1,522 +0,0 @@
-#!/opt/homebrew/bin/python3.11
-"""
-Speaker Player GUI - 說話人語音播放器（Face 整合版）
-使用 tkinter 顯示播放進度、Speaker ID 和人臉信息
-"""
-
-import json
-import subprocess
-import tempfile
-import os
-import threading
-import time
-from pathlib import Path
-
-try:
-    import tkinter as tk
-    from tkinter import ttk, filedialog, messagebox
-
-    HAS_TKINTER = True
-except ImportError:
-    HAS_TKINTER = False
-
-
-class SpeakerPlayerGUI:
-    """說話人語音播放器 GUI（Face 整合版）"""
-
-    def __init__(self, root):
-        self.root = root
-        self.root.title("🎬 Speaker Player - Face Integration")
-        self.root.geometry("1200x800")
-
-        # 數據
-        self.audio_path = None
-        self.result_path = None
-        self.face_path = None
-        self.result_data = None
-        self.face_data = None
-        self.integrated_data = None
-        self.speaker_segments = {}
-        self.speakers = []
-        self.current_speaker_idx = 0
-        self.is_playing = False
-        self.stop_flag = False
-
-        # 創建界面
-        self.create_widgets()
-
-    def create_widgets(self):
-        """創建界面組件"""
-        # 頂部：文件選擇
-        top_frame = ttk.Frame(self.root, padding="10")
-        top_frame.pack(fill=tk.X)
-
-        # 第一行：音頻和 ASRX 結果
-        row1_frame = ttk.Frame(top_frame)
-        row1_frame.pack(fill=tk.X)
-
-        ttk.Label(row1_frame, text="📁 Audio:").pack(side=tk.LEFT)
-        self.audio_label = ttk.Label(row1_frame, text="未選擇", width=50)
-        self.audio_label.pack(side=tk.LEFT, padx=5)
-        ttk.Button(row1_frame, text="選擇音頻", command=self.select_audio).pack(
-            side=tk.LEFT, padx=5
-        )
-
-        ttk.Label(row1_frame, text="  📊 ASRX:").pack(side=tk.LEFT, padx=(20, 0))
-        self.result_label = ttk.Label(row1_frame, text="未選擇", width=50)
-        self.result_label.pack(side=tk.LEFT, padx=5)
-        ttk.Button(row1_frame, text="選擇結果", command=self.select_result).pack(
-            side=tk.LEFT, padx=5
-        )
-
-        # 第二行：Face 結果
-        row2_frame = ttk.Frame(top_frame)
-        row2_frame.pack(fill=tk.X, pady=(5, 0))
-
-        ttk.Label(row2_frame, text="👤 Face:").pack(side=tk.LEFT)
-        self.face_label = ttk.Label(row2_frame, text="未選擇 (可選)", width=50)
-        self.face_label.pack(side=tk.LEFT, padx=5)
-        ttk.Button(row2_frame, text="選擇 Face", command=self.select_face).pack(
-            side=tk.LEFT, padx=5
-        )
-        self.integrate_button = ttk.Button(
-            row2_frame,
-            text="🔗 整合 Face",
-            command=self.integrate_face,
-            state=tk.DISABLED,
-        )
-        self.integrate_button.pack(side=tk.LEFT, padx=5)
-
-        # 中間：說話人列表和片段列表
-        mid_frame = ttk.Frame(self.root, padding="10")
-        mid_frame.pack(fill=tk.BOTH, expand=True)
-
-        # 左側：說話人列表（帶 Face 統計）
-        left_frame = ttk.LabelFrame(mid_frame, text="📢 說話人列表", padding="10")
-        left_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=False)
-
-        self.speaker_listbox = tk.Listbox(
-            left_frame, width=45, height=20, font=("Arial", 11)
-        )
-        self.speaker_listbox.pack(fill=tk.BOTH, expand=True)
-        self.speaker_listbox.bind("<<ListboxSelect>>", self.on_speaker_select)
-
-        # 右側：片段列表（帶 Face 信息）
-        right_frame = ttk.LabelFrame(
-            mid_frame, text="🎵 語音片段 + 👥 人臉", padding="10"
-        )
-        right_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=10)
-
-        # 片段列表（带滚动条）
-        list_frame = ttk.Frame(right_frame)
-        list_frame.pack(fill=tk.BOTH, expand=True)
-
-        scrollbar = ttk.Scrollbar(list_frame)
-        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
-
-        self.segment_listbox = tk.Listbox(
-            list_frame,
-            width=65,
-            height=20,
-            font=("Courier", 9),
-            yscrollcommand=scrollbar.set,
-        )
-        self.segment_listbox.pack(fill=tk.BOTH, expand=True)
-        scrollbar.config(command=self.segment_listbox.yview)
-
-        self.segment_listbox.bind("<Double-Button-1>", self.on_segment_double_click)
-
-        # 底部：播放控制和進度
-        bottom_frame = ttk.Frame(self.root, padding="10")
-        bottom_frame.pack(fill=tk.X)
-
-        # 播放控制
-        control_frame = ttk.Frame(bottom_frame)
-        control_frame.pack(fill=tk.X)
-
-        self.play_button = ttk.Button(
-            control_frame, text="▶️ 播放所選", command=self.play_selected, width=15
-        )
-        self.play_button.pack(side=tk.LEFT, padx=5)
-        self.play_button.config(state=tk.DISABLED)
-
-        self.stop_button = ttk.Button(
-            control_frame, text="⏹️ 停止", command=self.stop_playing, width=10
-        )
-        self.stop_button.pack(side=tk.LEFT, padx=5)
-        self.stop_button.config(state=tk.DISABLED)
-
-        self.play_all_button = ttk.Button(
-            control_frame, text="▶️▶️ 播放全部", command=self.play_all, width=15
-        )
-        self.play_all_button.pack(side=tk.LEFT, padx=5)
-        self.play_all_button.config(state=tk.DISABLED)
-
-        # 進度條
-        progress_frame = ttk.Frame(bottom_frame)
-        progress_frame.pack(fill=tk.X, pady=(10, 0))
-
-        ttk.Label(progress_frame, text="⏱️ 進度:").pack(side=tk.LEFT)
-        self.progress_bar = ttk.Progressbar(progress_frame, mode="determinate")
-        self.progress_bar.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=10)
-
-        self.progress_label = ttk.Label(progress_frame, text="0:00 / 0:00", width=20)
-        self.progress_label.pack(side=tk.LEFT)
-
-        # 狀態欄
-        self.status_label = ttk.Label(
-            bottom_frame, text="就緒", relief=tk.SUNKEN, anchor=tk.W
-        )
-        self.status_label.pack(fill=tk.X, pady=(10, 0))
-
-    def select_audio(self):
-        """選擇音頻文件"""
-        filename = filedialog.askopenfilename(
-            title="選擇音頻文件",
-            filetypes=[("WAV files", "*.wav"), ("All files", "*.*")],
-        )
-        if filename:
-            self.audio_path = filename
-            self.audio_label.config(text=Path(filename).name)
-            self.check_ready()
-
-    def select_result(self):
-        """選擇 ASRX 結果文件"""
-        filename = filedialog.askopenfilename(
-            title="選擇 ASRX 結果文件",
-            filetypes=[("JSON files", "*.json"), ("All files", "*.*")],
-        )
-        if filename:
-            self.result_path = filename
-            self.result_label.config(text=Path(filename).name)
-            self.load_result()
-            self.check_ready()
-
-    def select_face(self):
-        """選擇 Face 結果文件"""
-        filename = filedialog.askopenfilename(
-            title="選擇 Face 檢測結果",
-            filetypes=[("JSON files", "*.json"), ("All files", "*.*")],
-        )
-        if filename:
-            self.face_path = filename
-            self.face_label.config(text=Path(filename).name)
-            self.integrate_button.config(state=tk.NORMAL)
-            self.status_label.config(text="✅ Face 已選擇 - 請點擊整合")
-
-    def integrate_face(self):
-        """整合 Face 與 ASRX"""
-        if not self.face_path or not self.result_path:
-            messagebox.showwarning("警告", "請先選擇 Face 和 ASRX 文件")
-            return
-
-        self.status_label.config(text="🔄 整合中...")
-        self.root.update()
-
-        try:
-            # 載入 Face 數據
-            with open(self.face_path, "r", encoding="utf-8") as f:
-                self.face_data = json.load(f)
-
-            # 重新載入 ASRX 數據並整合
-            self.load_result(integrate_with_face=True)
-
-            self.status_label.config(text="✅ Face 整合完成")
-            self.integrate_button.config(state=tk.DISABLED)
-
-        except Exception as e:
-            messagebox.showerror("錯誤", f"整合失敗：{e}")
-            self.status_label.config(text="❌ 整合失敗")
-
-    def load_result(self, integrate_with_face=False):
-        """載入 ASRX 結果"""
-        try:
-            with open(self.result_path, "r", encoding="utf-8") as f:
-                self.result_data = json.load(f)
-
-            # 分組
-            self.speaker_segments = {}
-            for seg in self.result_data.get("segments", []):
-                speaker = seg["speaker"]
-                if speaker not in self.speaker_segments:
-                    self.speaker_segments[speaker] = []
-                self.speaker_segments[speaker].append(seg)
-
-            # 排序
-            for speaker in self.speaker_segments:
-                self.speaker_segments[speaker].sort(key=lambda x: x["start"])
-
-            # 說話人列表（按時長排序）
-            self.speakers = sorted(
-                self.speaker_segments.keys(),
-                key=lambda s: sum(seg["duration"] for seg in self.speaker_segments[s]),
-                reverse=True,
-            )
-
-            # 更新列表框
-            self.speaker_listbox.delete(0, tk.END)
-            for speaker in self.speakers:
-                segs = self.speaker_segments[speaker]
-                total_dur = sum(seg["duration"] for seg in segs)
-                total_dur_min = total_dur / 60
-
-                # 如果有 Face 數據，計算有人臉的片段數
-                face_info = ""
-                if integrate_with_face and self.integrated_data:
-                    speaker_integrated = [
-                        item
-                        for item in self.integrated_data
-                        if item["speaker"] == speaker
-                    ]
-                    with_face = sum(
-                        1 for item in speaker_integrated if item.get("has_face", False)
-                    )
-                    face_info = f" | 👥 {with_face}/{len(segs)}"
-
-                self.speaker_listbox.insert(
-                    tk.END,
-                    f"🔊 {speaker:12} | {len(segs):4d}段 | {total_dur_min:5.1f}分鐘{face_info}",
-                )
-
-            total_segments = len(self.result_data.get("segments", []))
-            self.status_label.config(
-                text=f"載入成功：{len(self.speakers)} 個說話人，{total_segments} 個片段"
-            )
-
-        except Exception as e:
-            messagebox.showerror("錯誤", f"載入結果文件失敗：{e}")
-            self.result_path = None
-            self.result_label.config(text="載入失敗")
-
-    def check_ready(self):
-        """檢查是否就緒"""
-        if self.audio_path and self.result_path:
-            self.status_label.config(text="✅ 就緒 - 請選擇說話人並播放")
-            self.play_button.config(state=tk.NORMAL)
-            self.play_all_button.config(state=tk.NORMAL)
-        else:
-            self.status_label.config(text="⚠️ 請選擇音頻和結果文件")
-            self.play_button.config(state=tk.DISABLED)
-            self.play_all_button.config(state=tk.DISABLED)
-
-    def on_speaker_select(self, event):
-        """說話人選擇事件"""
-        selection = self.speaker_listbox.curselection()
-        if not selection:
-            return
-
-        self.current_speaker_idx = selection[0]
-        speaker = self.speakers[self.current_speaker_idx]
-
-        # 更新片段列表
-        self.segment_listbox.delete(0, tk.END)
-        for i, seg in enumerate(self.speaker_segments[speaker], 1):
-            start = seg["start"]
-            end = seg["end"]
-            duration = seg["duration"]
-
-            # 如果有整合 Face 數據
-            face_info = ""
-            if self.integrated_data:
-                matching = [
-                    item
-                    for item in self.integrated_data
-                    if abs(item["start"] - start) < 0.1 and item["speaker"] == speaker
-                ]
-                if matching and matching[0].get("has_face", False):
-                    face_info = " 👥✅"
-                elif matching:
-                    face_info = " 👥❌"
-
-            self.segment_listbox.insert(
-                tk.END,
-                f"[{i:4d}] {speaker:12} | {start:7.2f}s - {end:7.2f}s ({duration:5.2f}s){face_info}",
-            )
-
-        self.status_label.config(
-            text=f"選擇：{speaker} - {len(self.speaker_segments[speaker])} 個片段"
-        )
-
-    def on_segment_double_click(self, event):
-        """片段雙擊事件"""
-        self.play_selected()
-
-    def extract_and_play(self, start_sec: float, end_sec: float) -> bool:
-        """提取並播放音頻"""
-        duration = end_sec - start_sec
-        temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-        temp_path = temp_file.name
-        temp_file.close()
-
-        try:
-            # 提取
-            cmd = [
-                "ffmpeg",
-                "-y",
-                "-loglevel",
-                "quiet",
-                "-i",
-                self.audio_path,
-                "-ss",
-                str(start_sec),
-                "-t",
-                str(duration),
-                "-acodec",
-                "pcm_s16le",
-                "-ar",
-                "16000",
-                "-ac",
-                "1",
-                temp_path,
-            ]
-
-            result = subprocess.run(cmd, capture_output=True)
-            if result.returncode != 0:
-                return False
-
-            # 播放
-            if os.path.exists("/usr/bin/afplay"):
-                subprocess.run(["afplay", temp_path], capture_output=True)
-            elif os.path.exists("/usr/bin/aplay"):
-                subprocess.run(["aplay", temp_path], capture_output=True)
-            else:
-                return False
-
-            return True
-        finally:
-            if os.path.exists(temp_path):
-                os.unlink(temp_path)
-
-    def play_segment(self, speaker: str, seg: dict, seg_idx: int, total: int):
-        """播放單個片段"""
-        if self.stop_flag:
-            return False
-
-        start = seg["start"]
-        end = seg["end"]
-        duration = seg["duration"]
-
-        # 更新 UI
-        self.root.after(
-            0,
-            lambda: self.status_label.config(
-                text=f"▶️  {speaker} [{seg_idx}/{total}] {start:.2f}s - {end:.2f}s"
-            ),
-        )
-
-        # 更新進度
-        progress = (seg_idx / total) * 100
-        self.root.after(0, lambda: self.progress_bar.config(value=progress))
-        self.root.after(
-            0, lambda: self.progress_label.config(text=f"{seg_idx}:{total}")
-        )
-
-        # 播放
-        if self.extract_and_play(start, end):
-            return True
-        else:
-            self.root.after(
-                0,
-                lambda: messagebox.showwarning(
-                    "警告", f"播放失敗：{speaker} [{seg_idx}]"
-                ),
-            )
-            return True
-
-    def play_selected(self):
-        """播放所選片段"""
-        selection = self.segment_listbox.curselection()
-        if not selection:
-            # 如果沒選擇，播放第一個
-            if self.speakers:
-                speaker = self.speakers[self.current_speaker_idx]
-                segs = self.speaker_segments[speaker]
-                if segs:
-                    self.play_all()
-            return
-
-        # 播放所選
-        seg_idx = selection[0]
-        speaker = self.speakers[self.current_speaker_idx]
-        seg = self.speaker_segments[speaker][seg_idx]
-
-        self.is_playing = True
-        self.stop_flag = False
-        self.play_button.config(state=tk.DISABLED)
-        self.stop_button.config(state=tk.NORMAL)
-
-        # 在後台線程播放
-        def play_thread():
-            success = self.play_segment(speaker, seg, seg_idx + 1, 1)
-            self.root.after(0, lambda: self.on_play_done())
-
-        thread = threading.Thread(target=play_thread, daemon=True)
-        thread.start()
-
-    def play_all(self):
-        """播放所選說話人的所有片段"""
-        if not self.speakers:
-            return
-
-        speaker = self.speakers[self.current_speaker_idx]
-        segs = self.speaker_segments[speaker]
-
-        if not segs:
-            return
-
-        self.is_playing = True
-        self.stop_flag = False
-        self.play_button.config(state=tk.DISABLED)
-        self.play_all_button.config(state=tk.DISABLED)
-        self.stop_button.config(state=tk.NORMAL)
-
-        # 在後台線程播放
-        def play_thread():
-            for i, seg in enumerate(segs, 1):
-                if self.stop_flag:
-                    break
-                self.play_segment(speaker, seg, i, len(segs))
-                time.sleep(0.3)  # 片段間隔
-
-            self.root.after(0, lambda: self.on_play_done())
-
-        thread = threading.Thread(target=play_thread, daemon=True)
-        thread.start()
-
-    def stop_playing(self):
-        """停止播放"""
-        self.stop_flag = True
-        self.is_playing = False
-        self.on_play_done()
-
-    def on_play_done(self):
-        """播放完成"""
-        self.is_playing = False
-        self.stop_flag = False
-        self.play_button.config(state=tk.NORMAL)
-        self.play_all_button.config(state=tk.NORMAL)
-        self.stop_button.config(state=tk.DISABLED)
-        self.progress_bar.config(value=0)
-        self.progress_label.config(text="0:00 / 0:00")
-
-        if self.stop_flag:
-            self.status_label.config(text="⏹️ 已停止")
-        else:
-            self.status_label.config(text="✅ 播放完成")
-
-
-def main():
-    """主函數"""
-    if not HAS_TKINTER:
-        print("❌ tkinter 未安裝")
-        print("請使用以下命令安裝:")
-        print("  brew install python-tk@3.9")
-        return
-
-    root = tk.Tk()
-    app = SpeakerPlayerGUI(root)
-    root.mainloop()
-
-
-if __name__ == "__main__":
-    main()
--- a/scripts/asrx_self/speaker_player_interactive.py
+++ b/scripts/asrx_self/speaker_player_interactive.py
@@ -1,267 +0,0 @@
-#!/opt/homebrew/bin/python3.11
-"""
-Interactive Speaker Audio Player - 交互式說話人語音播放器
-可以選擇播放哪個說話人的哪些片段
-"""
-
-import json
-import subprocess
-import tempfile
-import os
-from pathlib import Path
-from typing import List, Dict
-
-
-def load_asrx_result(result_path: str) -> Dict:
-    """載入 ASRX 結果"""
-    with open(result_path, "r", encoding="utf-8") as f:
-        return json.load(f)
-
-
-def extract_and_play(audio_path: str, start_sec: float, end_sec: float) -> bool:
-    """提取並播放音頻片段"""
-    duration = end_sec - start_sec
-    temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-    temp_path = temp_file.name
-    temp_file.close()
-
-    try:
-        # 提取
-        cmd = [
-            "ffmpeg",
-            "-y",
-            "-loglevel",
-            "quiet",
-            "-i",
-            audio_path,
-            "-ss",
-            str(start_sec),
-            "-t",
-            str(duration),
-            "-acodec",
-            "pcm_s16le",
-            "-ar",
-            "16000",
-            "-ac",
-            "1",
-            temp_path,
-        ]
-
-        result = subprocess.run(cmd, capture_output=True)
-        if result.returncode != 0:
-            return False
-
-        # 播放
-        if os.path.exists("/usr/bin/afplay"):
-            subprocess.run(["afplay", temp_path], capture_output=True)
-        elif os.path.exists("/usr/bin/aplay"):
-            subprocess.run(["aplay", temp_path], capture_output=True)
-        else:
-            print("  ⚠️  No audio player found")
-            return False
-
-        return True
-    finally:
-        if os.path.exists(temp_path):
-            os.unlink(temp_path)
-
-
-def show_menu(speaker_segments: Dict[str, List[Dict]], speaker_id: str):
-    """顯示選單"""
-    segs = speaker_segments[speaker_id]
-    total_duration = sum(seg["duration"] for seg in segs)
-
-    print(f"\n{'=' * 70}")
-    print(f"🔊 {speaker_id}")
-    print(f"{'=' * 70}")
-    print(f"  Segments: {len(segs)}")
-    print(
-        f"  Total duration: {total_duration / 60:.1f} minutes ({total_duration:.1f}s)"
-    )
-    print(f"{'=' * 70}")
-
-    # 顯示前 20 個片段
-    for i, seg in enumerate(segs[:20], 1):
-        start = seg["start"]
-        end = seg["end"]
-        duration = seg["duration"]
-        print(
-            f"  [{i:3d}] {speaker_id:12} | {start:7.2f}s - {end:7.2f}s  ({duration:5.2f}s)"
-        )
-
-    if len(segs) > 20:
-        print(f"  ... and {len(segs) - 20} more segments")
-
-    print(f"\n{'=' * 70}")
-    print("Commands:")
-    print(f"  [1-{min(20, len(segs))}]  Play specific segment")
-    print("  all      Play all segments (may take a while)")
-    print("  first N  Play first N segments")
-    print("  next     Next speaker")
-    print("  prev     Previous speaker")
-    print("  list     List all speakers")
-    print("  quit     Exit")
-    print(f"{'=' * 70}")
-
-
-def interactive_player(audio_path: str, result_path: str):
-    """交互式播放器"""
-    # 載入結果
-    result = load_asrx_result(result_path)
-    segments = result.get("segments", [])
-    total_duration = result.get("total_duration", 0)
-
-    # 分組
-    speaker_segments = {}
-    for seg in segments:
-        speaker = seg["speaker"]
-        if speaker not in speaker_segments:
-            speaker_segments[speaker] = []
-        speaker_segments[speaker].append(seg)
-
-    # 排序
-    for speaker in speaker_segments:
-        speaker_segments[speaker].sort(key=lambda x: x["start"])
-
-    # 說話人列表
-    speakers = sorted(
-        speaker_segments.keys(),
-        key=lambda s: sum(seg["duration"] for seg in speaker_segments[s]),
-        reverse=True,
-    )
-
-    current_speaker_idx = 0
-
-    print("\n🎬 Speaker Audio Player")
-    print(f"📁 Audio: {audio_path}")
-    print(f"📊 Speakers: {len(speakers)}")
-    print(f"{'=' * 70}")
-
-    while True:
-        current_speaker = speakers[current_speaker_idx]
-        show_menu(speaker_segments, current_speaker)
-
-        try:
-            cmd = input(f"\n▶️  {current_speaker} > ").strip().lower()
-        except (EOFError, KeyboardInterrupt):
-            print("\n\nExiting...")
-            break
-
-        if not cmd:
-            continue
-
-        # 播放特定片段
-        if cmd.isdigit():
-            idx = int(cmd) - 1
-            if 0 <= idx < len(speaker_segments[current_speaker]):
-                seg = speaker_segments[current_speaker][idx]
-                print(f"\n  🔊 {current_speaker} - Segment {idx + 1}")
-                print(
-                    f"  ⏱️  {seg['start']:.2f}s - {seg['end']:.2f}s ({seg['duration']:.2f}s)"
-                )
-                print("  ▶️  Playing...", end="", flush=True)
-                if extract_and_play(audio_path, seg["start"], seg["end"]):
-                    print(" ✅ Done")
-                else:
-                    print(" ❌ Failed")
-            else:
-                print(
-                    f"  Invalid segment number (1-{len(speaker_segments[current_speaker])})"
-                )
-
-        # 播放所有
-        elif cmd == "all":
-            print(
-                f"\n  🔊 {current_speaker} - Playing all {len(speaker_segments[current_speaker])} segments..."
-            )
-            print("=" * 70)
-            for i, seg in enumerate(speaker_segments[current_speaker], 1):
-                print(
-                    f"  [{i:3d}/{len(speaker_segments[current_speaker])}] {current_speaker} | "
-                    + f"{seg['start']:7.2f}s - {seg['end']:7.2f}s ({seg['duration']:5.2f}s)",
-                    end="",
-                    flush=True,
-                )
-                if extract_and_play(audio_path, seg["start"], seg["end"]):
-                    print(" ✅")
-                else:
-                    print(" ❌")
-            print("=" * 70)
-
-        # 播放前 N 個
-        elif cmd.startswith("first "):
-            try:
-                n = int(cmd.split()[1])
-                print(f"\n  🔊 {current_speaker} - Playing first {n} segments...")
-                print("=" * 70)
-                for i, seg in enumerate(speaker_segments[current_speaker][:n], 1):
-                    print(
-                        f"  [{i:3d}/{n}] {current_speaker} | "
-                        + f"{seg['start']:7.2f}s - {seg['end']:7.2f}s ({seg['duration']:5.2f}s)",
-                        end="",
-                        flush=True,
-                    )
-                    if extract_and_play(audio_path, seg["start"], seg["end"]):
-                        print(" ✅")
-                    else:
-                        print(" ❌")
-                print("=" * 70)
-            except (IndexError, ValueError):
-                print("  Usage: first N")
-
-        # 下一個說話人
-        elif cmd == "next":
-            current_speaker_idx = (current_speaker_idx + 1) % len(speakers)
-
-        # 上一個說話人
-        elif cmd == "prev":
-            current_speaker_idx = (current_speaker_idx - 1) % len(speakers)
-
-        # 列出所有說話人
-        elif cmd == "list":
-            print(f"\n{'=' * 70}")
-            print("📢 All speakers:")
-            print(f"{'=' * 70}")
-            for i, speaker in enumerate(speakers, 1):
-                segs = speaker_segments[speaker]
-                total_dur = sum(seg["duration"] for seg in segs)
-                pct = total_dur / total_duration * 100 if total_duration > 0 else 0
-                print(
-                    f"  {i:2d}. 🔊 {speaker:12} | {len(segs):4d} segments, "
-                    + f"{total_dur:7.1f}s ({pct:5.1f}%)"
-                )
-            print(f"{'=' * 70}")
-            print(f"  Current: 🔊 {speakers[current_speaker_idx]}")
-            print(f"{'=' * 70}")
-
-        # 退出
-        elif cmd == "quit" or cmd == "exit" or cmd == "q":
-            print("\nExiting...")
-            break
-
-        else:
-            print(f"  Unknown command: {cmd}")
-
-
-def main():
-    import argparse
-
-    parser = argparse.ArgumentParser(description="Interactive Speaker Audio Player")
-    parser.add_argument("audio_path", help="原始音頻文件路徑")
-    parser.add_argument("result_path", help="ASRX 結果 JSON 路徑")
-
-    args = parser.parse_args()
-
-    if not Path(args.audio_path).exists():
-        print(f"Error: Audio file not found: {args.audio_path}")
-        return
-
-    if not Path(args.result_path).exists():
-        print(f"Error: Result file not found: {args.result_path}")
-        return
-
-    interactive_player(args.audio_path, args.result_path)
-
-
-if __name__ == "__main__":
-    main()
--- a/scripts/asrx_self/test_gui_face_player.py
+++ b/scripts/asrx_self/test_gui_face_player.py
@@ -1,164 +0,0 @@
-#!/opt/homebrew/bin/python3.11
-"""
-GUI Face Player 自動化測試腳本
-測試所有功能並生成測試報告
-"""
-
-import json
-import subprocess
-from pathlib import Path
-
-
-def check_file_exists(path, description):
-    """檢查文件是否存在"""
-    exists = Path(path).exists()
-    status = "✅" if exists else "❌"
-    size = Path(path).stat().st_size / 1024 / 1024 if exists else 0
-    print(f"{status} {description}: {path} ({size:.1f} MB)")
-    return exists
-
-
-def check_process_running(pattern):
-    """檢查進程是否運行"""
-    result = subprocess.run(['pgrep', '-f', pattern], capture_output=True, text=True)
-    running = result.returncode == 0
-    status = "✅" if running else "❌"
-    print(f"{status} 進程：{pattern} ({'運行中' if running else '未運行'})")
-    return running
-
-
-def test_json_structure(path, required_keys, description):
-    """測試 JSON 文件結構"""
-    try:
-        with open(path, 'r', encoding='utf-8') as f:
-            data = json.load(f)
-        
-        missing_keys = [key for key in required_keys if key not in data]
-        if missing_keys:
-            print(f"❌ {description}: 缺少鍵 {missing_keys}")
-            return False
-        else:
-            print(f"✅ {description}: 結構正確")
-            return True
-    except Exception as e:
-        print(f"❌ {description}: {e}")
-        return False
-
-
-def test_integration_script():
-    """測試整合腳本"""
-    print("\n" + "="*70)
-    print("測試整合腳本")
-    print("="*70)
-    
-    cmd = [
-        'python3',
-        'integrate_face_asrx_speaker.py',
-        '/tmp/face_long.json',
-        '/tmp/asrx_charade_optimized.json',
-        '--threshold', '3.0',
-        '--stats'
-    ]
-    
-    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
-    
-    # 檢查輸出
-    if '99.8%' in result.stdout:
-        print("✅ 整合腳本：匹配率正確 (99.8%)")
-        return True
-    else:
-        print("❌ 整合腳本：匹配率異常")
-        print(result.stdout)
-        return False
-
-
-def test_gui_startup():
-    """測試 GUI 啟動"""
-    print("\n" + "="*70)
-    print("測試 GUI 啟動")
-    print("="*70)
-    
-    # 檢查進程
-    running = check_process_running('speaker_player_gui_face')
-    
-    if running:
-        print("✅ GUI 進程：正常運行")
-        return True
-    else:
-        print("❌ GUI 進程：未運行")
-        return False
-
-
-def main():
-    """主測試函數"""
-    print("="*70)
-    print("GUI Face Player 自動化測試")
-    print("="*70)
-    
-    # 測試文件
-    print("\n" + "="*70)
-    print("測試文件")
-    print("="*70)
-    
-    files_ok = True
-    files_ok &= check_file_exists('/tmp/charade_audio.wav', '音頻文件')
-    files_ok &= check_file_exists('/tmp/asrx_charade_optimized.json', 'ASRX 結果')
-    files_ok &= check_file_exists('/tmp/face_long.json', 'Face 結果')
-    files_ok &= check_file_exists('/tmp/charade_integrated.json', '整合結果')
-    
-    # 測試 JSON 結構
-    print("\n" + "="*70)
-    print("測試 JSON 結構")
-    print("="*70)
-    
-    json_ok = True
-    json_ok &= test_json_structure(
-        '/tmp/asrx_charade_optimized.json',
-        ['segments', 'n_speakers'],
-        'ASRX 結果'
-    )
-    json_ok &= test_json_structure(
-        '/tmp/face_long.json',
-        ['frames', 'frame_count'],
-        'Face 結果'
-    )
-    json_ok &= test_json_structure(
-        '/tmp/charade_integrated.json',
-        ['integrated_segments', 'speaker_stats'],
-        '整合結果'
-    )
-    
-    # 測試整合腳本
-    integration_ok = test_integration_script()
-    
-    # 測試 GUI
-    gui_ok = test_gui_startup()
-    
-    # 總結
-    print("\n" + "="*70)
-    print("測試總結")
-    print("="*70)
-    
-    all_ok = files_ok and json_ok and integration_ok and gui_ok
-    
-    if all_ok:
-        print("✅ 所有測試通過！")
-    else:
-        print("❌ 部分測試失敗")
-        if not files_ok:
-            print("  - 文件測試失敗")
-        if not json_ok:
-            print("  - JSON 結構測試失敗")
-        if not integration_ok:
-            print("  - 整合腳本測試失敗")
-        if not gui_ok:
-            print("  - GUI 啟動測試失敗")
-    
-    print("\n" + "="*70)
-    
-    return all_ok
-
-
-if __name__ == "__main__":
-    success = main()
-    exit(0 if success else 1)
--- a/scripts/asrx_self/test_long_movie.py
+++ b/scripts/asrx_self/test_long_movie.py
@@ -1,240 +0,0 @@
-#!/opt/homebrew/bin/python3.11
-"""
-長影片（Charade 1963，114 分鐘）完整測試腳本
-"""
-
-import json
-import subprocess
-from pathlib import Path
-from datetime import datetime
-
-
-def print_header(title):
-    """打印標題"""
-    print("\n" + "="*70)
-    print(f" {title}")
-    print("="*70)
-
-
-def test_data_files():
-    """測試數據文件"""
-    print_header("1. 數據文件測試")
-    
-    files = {
-        '音頻文件': '/tmp/charade_audio.wav',
-        'ASRX 結果': '/tmp/asrx_charade_optimized.json',
-        'Face 結果': '/tmp/face_long.json',
-        '整合結果': '/tmp/charade_integrated.json'
-    }
-    
-    all_ok = True
-    for name, path in files.items():
-        exists = Path(path).exists()
-        size = Path(path).stat().st_size / 1024 / 1024 if exists else 0
-        status = "✅" if exists else "❌"
-        print(f"{status} {name}: {size:.1f} MB")
-        all_ok = all_ok and exists
-    
-    return all_ok
-
-
-def test_asrx_results():
-    """測試 ASRX 結果"""
-    print_header("2. ASRX 結果測試")
-    
-    with open('/tmp/asrx_charade_optimized.json', 'r', encoding='utf-8') as f:
-        data = json.load(f)
-    
-    total_duration = data.get('total_duration', 0)
-    n_speakers = data.get('n_speakers', 0)
-    n_segments = data.get('n_speech_segments', 0)
-    
-    print(f"📊 影片時長：{total_duration/60:.1f} 分鐘 ({total_duration:.1f}秒)")
-    print(f" 說話人數量：{n_speakers}")
-    print(f"📊 語音片段：{n_segments}")
-    
-    # 說話人統計
-    print("\n📢 說話人分佈:")
-    speaker_stats = data.get('speaker_stats', {})
-    for speaker, stats in sorted(speaker_stats.items(), key=lambda x: x[1]['duration'], reverse=True):
-        duration = stats.get('duration', 0)
-        count = stats.get('count', 0)
-        pct = duration / total_duration * 100 if total_duration > 0 else 0
-        print(f"   {speaker}: {count} 片段，{duration/60:.1f}分鐘 ({pct:.1f}%)")
-    
-    return n_speakers >= 2 and n_segments > 100
-
-
-def test_face_results():
-    """測試 Face 結果"""
-    print_header("3. Face 結果測試")
-    
-    with open('/tmp/face_long.json', 'r', encoding='utf-8') as f:
-        data = json.load(f)
-    
-    total_frames = data.get('frame_count', 0)
-    detected_frames = data.get('frames', [])
-    fps = data.get('fps', 0)
-    
-    print(f"📊 總數：{total_frames:,}")
-    print(f"📊 檢測到人臉：{len(detected_frames):,}")
-    print(f"📊 FPS: {fps:.2f}")
-    print(f"📊 檢測率：{len(detected_frames)/total_frames*100:.2f}%")
-    
-    return len(detected_frames) > 0
-
-
-def test_integration():
-    """測試整合結果"""
-    print_header("4. Face + ASRX 整合測試")
-    
-    with open('/tmp/charade_integrated.json', 'r', encoding='utf-8') as f:
-        data = json.load(f)
-    
-    segments = data.get('integrated_segments', [])
-    total = len(segments)
-    with_face = sum(1 for seg in segments if seg.get('has_face', False))
-    match_rate = with_face / total * 100 if total > 0 else 0
-    
-    print(f"📊 總片段：{total}")
-    print(f"📊 有人臉：{with_face}")
-    print(f"📊 匹配率：{match_rate:.2f}%")
-    
-    # 說話人匹配統計
-    print("\n📢 說話人匹配詳情:")
-    speaker_stats = data.get('speaker_stats', {})
-    for speaker, stats in sorted(speaker_stats.items()):
-        total_seg = stats.get('total_segments', 0)
-        with_face_seg = stats.get('with_face', 0)
-        rate = with_face_seg / total_seg * 100 if total_seg > 0 else 0
-        status = "✅" if rate >= 99 else "⚠️" if rate >= 50 else "❌"
-        print(f"   {status} {speaker}: {with_face_seg}/{total_seg} ({rate:.1f}%)")
-    
-    return match_rate >= 95
-
-
-def test_gui_process():
-    """測試 GUI 進程"""
-    print_header("5. GUI 進程測試")
-    
-    result = subprocess.run(['pgrep', '-f', 'speaker_player_gui_face'], 
-                          capture_output=True, text=True)
-    running = result.returncode == 0
-    
-    if running:
-        pid = result.stdout.strip()
-        print(f"✅ GUI 進程運行中 (PID: {pid})")
-        
-        # 檢查進程資源使用
-        ps_result = subprocess.run(['ps', 'aux'], capture_output=True, text=True)
-        for line in ps_result.stdout.split('\n'):
-            if 'speaker_player_gui_face' in line and 'grep' not in line:
-                parts = line.split()
-                if len(parts) >= 8:
-                    cpu = parts[2]
-                    mem = parts[3]
-                    print(f"   CPU: {cpu}%, 記憶體：{mem}%")
-    else:
-        print("❌ GUI 進程未運行")
-    
-    return running
-
-
-def test_playback():
-    """測試播放功能（模擬）"""
-    print_header("6. 播放功能測試")
-    
-    # 測試 ffmpeg 是否可用
-    result = subprocess.run(['which', 'ffmpeg'], capture_output=True, text=True)
-    ffmpeg_ok = result.returncode == 0
-    print(f"{'✅' if ffmpeg_ok else '❌'} ffmpeg: {'可用' if ffmpeg_ok else '不可用'}")
-    
-    # 測試 afplay 是否可用
-    result = subprocess.run(['which', 'afplay'], capture_output=True, text=True)
-    afplay_ok = result.returncode == 0
-    print(f"{'✅' if afplay_ok else '❌'} afplay: {'可用' if afplay_ok else '不可用'}")
-    
-    # 測試音頻提取（第一個片段）
-    with open('/tmp/asrx_charade_optimized.json', 'r', encoding='utf-8') as f:
-        asrx_data = json.load(f)
-    
-    first_seg = asrx_data['segments'][0]
-    start = first_seg['start']
-    end = first_seg['end']
-    duration = end - start
-    
-    print("\n🎵 測試提取第一個片段:")
-    print(f"   時間：{start:.2f}s - {end:.2f}s ({duration:.2f}s)")
-    
-    # 實際提取測試
-    temp_file = '/tmp/test_segment.wav'
-    cmd = [
-        'ffmpeg', '-y', '-loglevel', 'quiet',
-        '-i', '/tmp/charade_audio.wav',
-        '-ss', str(start),
-        '-t', str(duration),
-        temp_file
-    ]
-    
-    result = subprocess.run(cmd, capture_output=True)
-    extract_ok = result.returncode == 0 and Path(temp_file).exists()
-    
-    print(f"{'✅' if extract_ok else '❌'} 音頻提取: {'成功' if extract_ok else '失敗'}")
-    
-    if extract_ok:
-        size = Path(temp_file).stat().st_size / 1024
-        print(f"   文件大小：{size:.1f} KB")
-        Path(temp_file).unlink()  # 清理
-    
-    return ffmpeg_ok and afplay_ok and extract_ok
-
-
-def generate_report():
-    """生成測試報告"""
-    print_header("測試報告")
-    
-    tests = [
-        ("數據文件", test_data_files()),
-        ("ASRX 結果", test_asrx_results()),
-        ("Face 結果", test_face_results()),
-        ("整合結果", test_integration()),
-        ("GUI 進程", test_gui_process()),
-        ("播放功能", test_playback())
-    ]
-    
-    passed = sum(1 for _, result in tests if result)
-    total = len(tests)
-    
-    print("\n" + "="*70)
-    print(f" 測試總結：{passed}/{total} 通過")
-    print("="*70)
-    
-    for name, result in tests:
-        status = "✅" if result else "❌"
-        print(f"{status} {name}")
-    
-    if passed == total:
-        print("\n🎉 所有測試通過！")
-    else:
-        print(f"\n⚠️ {total - passed} 個測試失敗")
-    
-    # 保存報告
-    report_path = '/tmp/long_movie_test_report.md'
-    with open(report_path, 'w', encoding='utf-8') as f:
-        f.write("# 長影片測試報告\n\n")
-        f.write(f"**測試時間**: {datetime.now().isoformat()}\n")
-        f.write("**測試影片**: Charade 1963 (114.7 分鐘)\n\n")
-        f.write("## 結果\n\n")
-        f.write(f"**通過**: {passed}/{total}\n\n")
-        for name, result in tests:
-            status = "✅" if result else "❌"
-            f.write(f"- {status} {name}\n")
-    
-    print(f"\n📄 報告已保存：{report_path}")
-    
-    return passed == total
-
-
-if __name__ == "__main__":
-    success = generate_report()
-    exit(0 if success else 1)
--- a/scripts/asrx_self/vad.py
+++ b/scripts/asrx_self/vad.py
@@ -126,6 +126,52 @@ def extract_speech_audio(audio_path, model, utils, output_dir=None):
    return speech_audios, speech_segments


+def scan_within_segment(wav, sample_rate, start_sec, end_sec, model, utils,
+                        min_speech_duration_ms=500, min_silence_duration_ms=300):
+    """
+    Run VAD inside one time window and return the speech sub-segments found.
+
+    Use case: refine a coarse time range reported by whisper by cutting it
+    at the pauses between sentences.
+
+    Args:
+        wav: full audio waveform (numpy array), indexed by sample
+        sample_rate: sampling rate of wav
+        start_sec: window start (seconds); negative values are clamped to 0
+        end_sec: window end (seconds); values past the audio are clamped
+        model: VAD model
+        utils: VAD utility functions (5-tuple, get_speech_timestamps first)
+        min_speech_duration_ms: minimum speech duration
+        min_silence_duration_ms: minimum silence duration
+
+    Returns:
+        sub_segments: [(start_sec, end_sec), ...] on the original timeline;
+        [] when the clamped window holds no samples (VAD is not invoked)
+    """
+    get_speech_timestamps, _, _, _, _ = utils
+
+    # Clamp the window to the waveform: a negative start index would wrap
+    # around and slice from the tail of the audio instead of its head.
+    start_sec = max(0.0, start_sec)
+    start_sample = int(start_sec * sample_rate)
+    end_sample = min(int(end_sec * sample_rate), len(wav))
+    if end_sample <= start_sample:
+        return []
+
+    # Run VAD on the window only
+    speech_ts = get_speech_timestamps(
+        wav[start_sample:end_sample],
+        model,
+        sampling_rate=sample_rate,
+        min_speech_duration_ms=min_speech_duration_ms,
+        min_silence_duration_ms=min_silence_duration_ms,
+        return_seconds=True,
+    )
+
+    # VAD times are relative to the window; shift them back by its start
+    return [(ts["start"] + start_sec, ts["end"] + start_sec) for ts in speech_ts]
+
+
 if __name__ == "__main__":
    # 測試 VAD
    import sys
--- a/scripts/asrx_self/whisper_local.py
+++ b/scripts/asrx_self/whisper_local.py
@@ -0,0 +1,35 @@
+"""
+Whisper Local - uses faster-whisper for per-segment transcription
+"""
+
+import numpy as np
+
+
+def load_model(size="small"):  # size is forwarded as WhisperModel's first argument
+    from faster_whisper import WhisperModel  # imported inside the function, at call time
+    return WhisperModel(size, device="cpu", compute_type="int8")  # CPU device, int8 compute
+
+
+def transcribe_segment(wav, sample_rate, start_sec, end_sec, model):
+    """Transcribe wav between start_sec and end_sec with the given model.
+
+    Returns a dict with "text", "language", "lang_prob" and "segments". The window is
+    clamped to the waveform; an empty window returns empty values without calling
+    the model. NOTE(review): the slice is passed on at sample_rate as-is, with no
+    resampling here - confirm it matches the rate model.transcribe expects.
+    """
+    # Clamp: a negative start index would wrap around and slice from the tail.
+    start_sample = max(0, int(start_sec * sample_rate))
+    end_sample = min(int(end_sec * sample_rate), len(wav))
+    if end_sample <= start_sample:
+        return {"text": "", "language": "", "lang_prob": 0.0, "segments": []}
+
+    segments_generator, info = model.transcribe(wav[start_sample:end_sample], language=None)
+    segs = list(segments_generator)  # consume once; reused for the text and the result
+
+    return {
+        "text": " ".join(seg.text for seg in segs).strip(),
+        "language": info.language if info else "",
+        "lang_prob": info.language_probability if info else 0.0,
+        "segments": segs,
+    }