cleanup: remove dead code and duplicate docs
- Remove session-ses_2f27.md (161KB raw session log)
- Remove 49 ROOT_* duplicate files across REFERENCE/
- Remove 14 duplicate files between REFERENCE/ root and history/
- Remove asr_legacy.rs (dead code, replaced by asr.rs)
- Remove src/core/worker/ (duplicate JobWorker)
- Remove src/core/layers/ (empty directory)
- Remove 4 .bak files in src/
- Remove 7 dead private methods in worker/processor.rs
- Remove backup directory from git tracking
This commit is contained in:
@@ -130,12 +130,12 @@ def main():
|
||||
integrated = match_face_with_speaker_v3(face_data, asrx_data, args.threshold)
|
||||
|
||||
# 分析
|
||||
print(f"\n[Analyze] Analyzing speaker-face correspondence...")
|
||||
print("\n[Analyze] Analyzing speaker-face correspondence...")
|
||||
speaker_stats = analyze_speaker_face(integrated)
|
||||
|
||||
# 顯示統計
|
||||
print(f"\n{'='*70}")
|
||||
print(f"說話人 - 人臉對應統計")
|
||||
print("說話人 - 人臉對應統計")
|
||||
print(f"{'='*70}")
|
||||
|
||||
total_segments = len(integrated)
|
||||
|
||||
@@ -16,7 +16,6 @@ Self-implemented ASRX - 自實作說話人分離系統
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
# 導入自定義模組
|
||||
@@ -182,7 +181,7 @@ class SelfASRX:
|
||||
result["processing_time"] = round(total_time, 2)
|
||||
result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
|
||||
|
||||
print(f"\n[SelfASRX] Processing completed!")
|
||||
print("\n[SelfASRX] Processing completed!")
|
||||
print(f" Total time: {total_time:.2f}s")
|
||||
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
|
||||
print(f" Detected speakers: {estimated_n_speakers}")
|
||||
@@ -249,14 +248,14 @@ def main():
|
||||
|
||||
# 顯示結果摘要
|
||||
if "error" not in result:
|
||||
print(f"\n[Summary]")
|
||||
print("\n[Summary]")
|
||||
print(f" Audio duration: {result['total_duration']:.2f}s")
|
||||
print(f" Speech segments: {result['n_speech_segments']}")
|
||||
print(f" Detected speakers: {result['n_speakers']}")
|
||||
print(f" Processing time: {result['processing_time']:.2f}s")
|
||||
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
|
||||
|
||||
print(f"\n[Speaker Statistics]")
|
||||
print("\n[Speaker Statistics]")
|
||||
for speaker, stats in result["speaker_stats"].items():
|
||||
pct = stats["duration"] / result["total_duration"] * 100
|
||||
print(
|
||||
|
||||
@@ -134,7 +134,7 @@ class SelfASRXFixed:
|
||||
result["processing_time"] = round(total_time, 2)
|
||||
result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
|
||||
|
||||
print(f"\n[SelfASRX-Fixed] Processing completed!")
|
||||
print("\n[SelfASRX-Fixed] Processing completed!")
|
||||
print(f" Total time: {total_time:.2f}s")
|
||||
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
|
||||
print(f" Detected speakers: {estimated_n_speakers}")
|
||||
@@ -154,6 +154,117 @@ class SelfASRXFixed:
|
||||
return result
|
||||
|
||||
|
||||
def process_with_segments(self, audio_path, asr_segments, output_path=None):
    """
    Run speaker diarization using ASR segment boundaries instead of VAD.

    Each ASR segment is cut out of the audio, embedded with
    ``self.speaker_encoder``, and the embeddings are clustered into speakers.

    Args:
        audio_path: Path to the audio file (WAV).
        asr_segments: List of ASR segments; each item must provide
            ``"start"`` and ``"end"`` in seconds.
        output_path: Optional path of a JSON file to write the result to.

    Returns:
        dict: On success, contains ``audio_path``, ``total_duration``,
        ``n_speech_segments``, ``n_speakers``, ``segments``, ``embeddings``,
        ``speaker_stats``, ``processing_time`` and ``realtime_factor``.
        ``segments[i]`` and ``embeddings[i]`` always describe the same clip.
        When no usable segment exists, a dict with an ``"error"`` key and an
        empty ``"segments"`` list is returned instead.
    """
    start_time = time.time()
    print(f"\n[SelfASRX-Fixed] Processing with {len(asr_segments)} ASR segments: {audio_path}")
    print("=" * 60)

    # Load the complete audio file.
    import soundfile as sf
    wav, sample_rate = sf.read(audio_path)
    if len(wav.shape) > 1:
        wav = np.mean(wav, axis=1)  # down-mix to mono
    print(f"  Audio loaded: {len(wav)/sample_rate:.2f}s, {sample_rate}Hz")

    # ASR segment boundaries replace the VAD step.
    speech_segments = [(s["start"], s["end"]) for s in asr_segments]
    print(f"  Speech segments from ASR: {len(speech_segments)}")

    if len(speech_segments) == 0:
        print("[SelfASRX-Fixed] No ASR segments provided!")
        return {"error": "No ASR segments", "segments": []}

    # Cut out the audio of every segment.  `kept_segments` is built in
    # lockstep with `audio_segments` so that the cluster labels computed
    # below are paired with the time span they were actually derived from.
    # (Previously, skipped segments stayed in `speech_segments`, shifting
    # every later label onto the wrong segment.)
    n_samples = len(wav)
    kept_segments = []
    audio_segments = []
    for start_sec, end_sec in speech_segments:
        # Clamp so a negative start cannot index from the end of the array.
        start_sample = max(0, int(start_sec * sample_rate))
        end_sample = min(int(end_sec * sample_rate), n_samples)
        if start_sample >= n_samples or end_sample <= start_sample:
            # Segment lies outside the audio or contains no samples.
            continue
        kept_segments.append((start_sec, end_sec))
        audio_segments.append(wav[start_sample:end_sample])

    print(f"  Audio segments extracted: {len(audio_segments)}")

    if not audio_segments:
        # Nothing to embed or cluster; mirror the "no segments" early exit.
        print("[SelfASRX-Fixed] No ASR segments provided!")
        return {"error": "No ASR segments", "segments": []}

    # Batch-extract speaker embeddings.
    print("\n[Step 2] Speaker embedding extraction...")
    step2_start = time.time()
    embeddings = extract_speaker_embeddings_batch(
        self.speaker_encoder, audio_segments, sample_rate
    )
    embeddings = normalize_embeddings(embeddings)
    step2_time = time.time() - step2_start
    print(f"  Embedding shape: {embeddings.shape}")
    print(f"  Embedding time: {step2_time:.2f}s")

    # Cluster the embeddings into speakers.
    print("\n[Step 3] Robust speaker clustering...")
    step3_start = time.time()
    speaker_labels, estimated_n_speakers = robust_speaker_clustering(
        embeddings, n_speakers=None, max_speakers=10
    )
    step3_time = time.time() - step3_start
    print(f"  Clustering time: {step3_time:.2f}s")

    # Assemble the output.  `n_speech_segments` counts the segments that
    # were actually diarized, so it matches len(result["segments"]).
    result = {
        "audio_path": str(audio_path),
        "total_duration": len(wav) / sample_rate,
        "n_speech_segments": len(kept_segments),
        "n_speakers": int(estimated_n_speakers),
        "segments": [
            {
                "index": i,
                "start": round(start, 3),
                "end": round(end, 3),
                "duration": round(end - start, 3),
                "speaker": f"SPEAKER_{int(label)}",
            }
            for i, ((start, end), label) in enumerate(zip(kept_segments, speaker_labels))
        ],
    }

    # Attach the speaker embedding of every segment (the original note
    # describes these as 192-D vectors).
    result["embeddings"] = [emb.tolist() for emb in embeddings]

    # Per-speaker statistics: number of segments and total spoken duration.
    speaker_stats = {}
    for seg in result["segments"]:
        stats = speaker_stats.setdefault(seg["speaker"], {"count": 0, "duration": 0})
        stats["count"] += 1
        stats["duration"] += seg["duration"]
    result["speaker_stats"] = speaker_stats

    total_time = time.time() - start_time
    result["processing_time"] = round(total_time, 2)
    result["realtime_factor"] = round(result["total_duration"] / total_time, 2)

    print("\n[SelfASRX-Fixed] Processing completed!")
    print(f"  Total time: {total_time:.2f}s")
    print(f"  Realtime factor: {result['realtime_factor']:.2f}x")
    print(f"  Detected speakers: {estimated_n_speakers}")

    if output_path:
        import json
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        print(f"  Results saved to: {output_path}")

    print("=" * 60)
    return result
|
||||
|
||||
|
||||
def main():
|
||||
import argparse
|
||||
|
||||
@@ -180,14 +291,14 @@ def main():
|
||||
)
|
||||
|
||||
if "error" not in result:
|
||||
print(f"\n[Summary]")
|
||||
print("\n[Summary]")
|
||||
print(f" Audio duration: {result['total_duration']:.2f}s")
|
||||
print(f" Speech segments: {result['n_speech_segments']}")
|
||||
print(f" Detected speakers: {result['n_speakers']}")
|
||||
print(f" Processing time: {result['processing_time']:.2f}s")
|
||||
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
|
||||
|
||||
print(f"\n[Speaker Statistics]")
|
||||
print("\n[Speaker Statistics]")
|
||||
for speaker, stats in result['speaker_stats'].items():
|
||||
pct = stats['duration'] / result['total_duration'] * 100
|
||||
print(f" {speaker}: {stats['count']} segments, " +
|
||||
|
||||
@@ -138,7 +138,7 @@ def spectral_clustering_speaker(
|
||||
|
||||
speaker_labels = clustering.fit_predict(similarity_matrix)
|
||||
|
||||
print(f"[Clustering] Spectral clustering completed")
|
||||
print("[Clustering] Spectral clustering completed")
|
||||
print(f"[Clustering] n_speakers: {n_speakers}")
|
||||
print(f"[Clustering] n_segments: {n_segments}")
|
||||
|
||||
@@ -146,7 +146,7 @@ def spectral_clustering_speaker(
|
||||
|
||||
except Exception as e:
|
||||
print(f"[Clustering] Spectral clustering failed: {e}")
|
||||
print(f"[Clustering] Using fallback: 2 speakers")
|
||||
print("[Clustering] Using fallback: 2 speakers")
|
||||
# 簡單分配:前一半是 SPEAKER_0,後一半是 SPEAKER_1
|
||||
speaker_labels = np.array(
|
||||
[0] * (n_segments // 2) + [1] * (n_segments - n_segments // 2)
|
||||
@@ -203,7 +203,7 @@ def agglomerative_clustering_speaker(
|
||||
|
||||
speaker_labels = clustering.fit_predict(embeddings)
|
||||
|
||||
print(f"[Clustering] Agglomerative clustering completed")
|
||||
print("[Clustering] Agglomerative clustering completed")
|
||||
print(f"[Clustering] n_speakers: {n_speakers}")
|
||||
|
||||
return speaker_labels, n_speakers
|
||||
@@ -249,7 +249,6 @@ def compute_diarization_purity(speaker_labels, ground_truth_labels=None):
|
||||
"""
|
||||
if ground_truth_labels is None:
|
||||
# 沒有 ground truth,使用聚類純度近似
|
||||
from sklearn.metrics import silhouette_score
|
||||
|
||||
# 使用餘弦相似度作為距離
|
||||
purity = 0.5 # 預設值
|
||||
@@ -300,7 +299,7 @@ if __name__ == "__main__":
|
||||
similarity, n_speakers=None, auto_estimate=True
|
||||
)
|
||||
|
||||
print(f"\n[Test] Clustering results:")
|
||||
print("\n[Test] Clustering results:")
|
||||
print(f" True n_speakers: {n_speakers}")
|
||||
print(f" Estimated n_speakers: {n_clusters}")
|
||||
print(f" Unique labels: {np.unique(labels)}")
|
||||
|
||||
@@ -6,7 +6,6 @@ Speaker Clustering - Fixed Version
|
||||
|
||||
import numpy as np
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
|
||||
def robust_speaker_clustering(embeddings, n_speakers=None, max_speakers=10):
|
||||
@@ -57,7 +56,7 @@ def robust_speaker_clustering(embeddings, n_speakers=None, max_speakers=10):
|
||||
|
||||
# 統計每個聚類的大小
|
||||
unique, counts = np.unique(speaker_labels, return_counts=True)
|
||||
print(f"[Clustering] Cluster sizes:")
|
||||
print("[Clustering] Cluster sizes:")
|
||||
for label, count in zip(unique, counts):
|
||||
print(f" SPEAKER_{label}: {count} segments ({count/n_segments*100:.1f}%)")
|
||||
|
||||
@@ -148,6 +147,6 @@ if __name__ == "__main__":
|
||||
# 測試聚類
|
||||
labels, n_clusters = robust_speaker_clustering(embeddings)
|
||||
|
||||
print(f"\nResult:")
|
||||
print("\nResult:")
|
||||
print(f" True n_speakers: {n_speakers}")
|
||||
print(f" Estimated n_speakers: {n_clusters}")
|
||||
|
||||
@@ -33,8 +33,8 @@ def load_speaker_encoder(model_name="speechbrain/spkrec-ecapa-voxceleb"):
|
||||
)
|
||||
|
||||
# 獲取模型資訊
|
||||
print(f"[SpeakerEncoder] Model loaded successfully")
|
||||
print(f"[SpeakerEncoder] Embedding dimension: 192")
|
||||
print("[SpeakerEncoder] Model loaded successfully")
|
||||
print("[SpeakerEncoder] Embedding dimension: 192")
|
||||
|
||||
return classifier
|
||||
|
||||
@@ -187,5 +187,5 @@ if __name__ == "__main__":
|
||||
print(f"[Test] Embedding std: {embedding.std():.4f}")
|
||||
|
||||
# 顯示部分嵌入值
|
||||
print(f"\n[Test] First 10 embedding values:")
|
||||
print("\n[Test] First 10 embedding values:")
|
||||
print(f" {embedding[:10]}")
|
||||
|
||||
@@ -11,7 +11,6 @@ import os
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
|
||||
try:
|
||||
import tkinter as tk
|
||||
|
||||
@@ -11,7 +11,6 @@ import os
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
|
||||
try:
|
||||
import tkinter as tk
|
||||
@@ -203,7 +202,7 @@ class SpeakerPlayerGUI:
|
||||
self.face_path = filename
|
||||
self.face_label.config(text=Path(filename).name)
|
||||
self.integrate_button.config(state=tk.NORMAL)
|
||||
self.status_label.config(text=f"✅ Face 已選擇 - 請點擊整合")
|
||||
self.status_label.config(text="✅ Face 已選擇 - 請點擊整合")
|
||||
|
||||
def integrate_face(self):
|
||||
"""整合 Face 與 ASRX"""
|
||||
|
||||
@@ -93,14 +93,14 @@ def show_menu(speaker_segments: Dict[str, List[Dict]], speaker_id: str):
|
||||
print(f" ... and {len(segs) - 20} more segments")
|
||||
|
||||
print(f"\n{'=' * 70}")
|
||||
print(f"Commands:")
|
||||
print("Commands:")
|
||||
print(f" [1-{min(20, len(segs))}] Play specific segment")
|
||||
print(f" all Play all segments (may take a while)")
|
||||
print(f" first N Play first N segments")
|
||||
print(f" next Next speaker")
|
||||
print(f" prev Previous speaker")
|
||||
print(f" list List all speakers")
|
||||
print(f" quit Exit")
|
||||
print(" all Play all segments (may take a while)")
|
||||
print(" first N Play first N segments")
|
||||
print(" next Next speaker")
|
||||
print(" prev Previous speaker")
|
||||
print(" list List all speakers")
|
||||
print(" quit Exit")
|
||||
print(f"{'=' * 70}")
|
||||
|
||||
|
||||
@@ -132,7 +132,7 @@ def interactive_player(audio_path: str, result_path: str):
|
||||
|
||||
current_speaker_idx = 0
|
||||
|
||||
print(f"\n🎬 Speaker Audio Player")
|
||||
print("\n🎬 Speaker Audio Player")
|
||||
print(f"📁 Audio: {audio_path}")
|
||||
print(f"📊 Speakers: {len(speakers)}")
|
||||
print(f"{'=' * 70}")
|
||||
@@ -159,7 +159,7 @@ def interactive_player(audio_path: str, result_path: str):
|
||||
print(
|
||||
f" ⏱️ {seg['start']:.2f}s - {seg['end']:.2f}s ({seg['duration']:.2f}s)"
|
||||
)
|
||||
print(f" ▶️ Playing...", end="", flush=True)
|
||||
print(" ▶️ Playing...", end="", flush=True)
|
||||
if extract_and_play(audio_path, seg["start"], seg["end"]):
|
||||
print(" ✅ Done")
|
||||
else:
|
||||
@@ -220,7 +220,7 @@ def interactive_player(audio_path: str, result_path: str):
|
||||
# 列出所有說話人
|
||||
elif cmd == "list":
|
||||
print(f"\n{'=' * 70}")
|
||||
print(f"📢 All speakers:")
|
||||
print("📢 All speakers:")
|
||||
print(f"{'=' * 70}")
|
||||
for i, speaker in enumerate(speakers, 1):
|
||||
segs = speaker_segments[speaker]
|
||||
|
||||
@@ -6,8 +6,6 @@ GUI Face Player 自動化測試腳本
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import time
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
@@ -55,7 +54,7 @@ def test_asrx_results():
|
||||
print(f"📊 語音片段:{n_segments}")
|
||||
|
||||
# 說話人統計
|
||||
print(f"\n📢 說話人分佈:")
|
||||
print("\n📢 說話人分佈:")
|
||||
speaker_stats = data.get('speaker_stats', {})
|
||||
for speaker, stats in sorted(speaker_stats.items(), key=lambda x: x[1]['duration'], reverse=True):
|
||||
duration = stats.get('duration', 0)
|
||||
@@ -102,7 +101,7 @@ def test_integration():
|
||||
print(f"📊 匹配率:{match_rate:.2f}%")
|
||||
|
||||
# 說話人匹配統計
|
||||
print(f"\n📢 說話人匹配詳情:")
|
||||
print("\n📢 說話人匹配詳情:")
|
||||
speaker_stats = data.get('speaker_stats', {})
|
||||
for speaker, stats in sorted(speaker_stats.items()):
|
||||
total_seg = stats.get('total_segments', 0)
|
||||
@@ -164,7 +163,7 @@ def test_playback():
|
||||
end = first_seg['end']
|
||||
duration = end - start
|
||||
|
||||
print(f"\n🎵 測試提取第一個片段:")
|
||||
print("\n🎵 測試提取第一個片段:")
|
||||
print(f" 時間:{start:.2f}s - {end:.2f}s ({duration:.2f}s)")
|
||||
|
||||
# 實際提取測試
|
||||
@@ -222,10 +221,10 @@ def generate_report():
|
||||
# 保存報告
|
||||
report_path = '/tmp/long_movie_test_report.md'
|
||||
with open(report_path, 'w', encoding='utf-8') as f:
|
||||
f.write(f"# 長影片測試報告\n\n")
|
||||
f.write("# 長影片測試報告\n\n")
|
||||
f.write(f"**測試時間**: {datetime.now().isoformat()}\n")
|
||||
f.write(f"**測試影片**: Charade 1963 (114.7 分鐘)\n\n")
|
||||
f.write(f"## 結果\n\n")
|
||||
f.write("**測試影片**: Charade 1963 (114.7 分鐘)\n\n")
|
||||
f.write("## 結果\n\n")
|
||||
f.write(f"**通過**: {passed}/{total}\n\n")
|
||||
for name, result in tests:
|
||||
status = "✅" if result else "❌"
|
||||
|
||||
@@ -9,7 +9,6 @@ VAD (Voice Activity Detection) - 語音活動檢測
|
||||
"""
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
|
||||
def load_vad_model():
|
||||
@@ -143,7 +142,7 @@ if __name__ == "__main__":
|
||||
print(f"[VAD] Processing: {audio_path}")
|
||||
segments, wav, sr = extract_speech_segments(audio_path, model, utils)
|
||||
|
||||
print(f"\n[VAD] Results:")
|
||||
print("\n[VAD] Results:")
|
||||
print(f" Sample rate: {sr} Hz")
|
||||
print(f" Speech segments: {len(segments)}")
|
||||
print(f" Total duration: {len(wav) / sr:.2f}s")
|
||||
@@ -153,7 +152,7 @@ if __name__ == "__main__":
|
||||
f" Total speech: {total_speech:.2f}s ({total_speech / (len(wav) / sr) * 100:.1f}%)"
|
||||
)
|
||||
|
||||
print(f"\n[VAD] Segments:")
|
||||
print("\n[VAD] Segments:")
|
||||
for i, (start, end) in enumerate(segments[:10]):
|
||||
print(f" {i + 1:3d}. {start:6.2f}s - {end:6.2f}s ({end - start:5.2f}s)")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user