cleanup: remove dead code and duplicate docs

- Remove session-ses_2f27.md (161KB raw session log)
- Remove 49 ROOT_* duplicate files across REFERENCE/
- Remove 14 duplicate files between REFERENCE/ root and history/
- Remove asr_legacy.rs (dead code, replaced by asr.rs)
- Remove src/core/worker/ (duplicate JobWorker)
- Remove src/core/layers/ (empty directory)
- Remove 4 .bak files in src/
- Remove 7 dead private methods in worker/processor.rs
- Remove backup directory from git tracking
This commit is contained in:
Warren
2026-05-04 01:31:21 +08:00
parent ee81e343ce
commit e75c4d6f07
3270 changed files with 35190 additions and 53367 deletions

View File

@@ -130,12 +130,12 @@ def main():
integrated = match_face_with_speaker_v3(face_data, asrx_data, args.threshold)
# 分析
print(f"\n[Analyze] Analyzing speaker-face correspondence...")
print("\n[Analyze] Analyzing speaker-face correspondence...")
speaker_stats = analyze_speaker_face(integrated)
# 顯示統計
print(f"\n{'='*70}")
print(f"說話人 - 人臉對應統計")
print("說話人 - 人臉對應統計")
print(f"{'='*70}")
total_segments = len(integrated)

View File

@@ -16,7 +16,6 @@ Self-implemented ASRX - 自實作說話人分離系統
import sys
import json
import time
import numpy as np
from pathlib import Path
# 導入自定義模組
@@ -182,7 +181,7 @@ class SelfASRX:
result["processing_time"] = round(total_time, 2)
result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
print(f"\n[SelfASRX] Processing completed!")
print("\n[SelfASRX] Processing completed!")
print(f" Total time: {total_time:.2f}s")
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
print(f" Detected speakers: {estimated_n_speakers}")
@@ -249,14 +248,14 @@ def main():
# 顯示結果摘要
if "error" not in result:
print(f"\n[Summary]")
print("\n[Summary]")
print(f" Audio duration: {result['total_duration']:.2f}s")
print(f" Speech segments: {result['n_speech_segments']}")
print(f" Detected speakers: {result['n_speakers']}")
print(f" Processing time: {result['processing_time']:.2f}s")
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
print(f"\n[Speaker Statistics]")
print("\n[Speaker Statistics]")
for speaker, stats in result["speaker_stats"].items():
pct = stats["duration"] / result["total_duration"] * 100
print(

View File

@@ -134,7 +134,7 @@ class SelfASRXFixed:
result["processing_time"] = round(total_time, 2)
result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
print(f"\n[SelfASRX-Fixed] Processing completed!")
print("\n[SelfASRX-Fixed] Processing completed!")
print(f" Total time: {total_time:.2f}s")
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
print(f" Detected speakers: {estimated_n_speakers}")
@@ -154,6 +154,117 @@ class SelfASRXFixed:
return result
def process_with_segments(self, audio_path, asr_segments, output_path=None):
    """
    Run speaker diarization on ASR segment boundaries instead of the VAD step.

    Args:
        audio_path: Path to the audio file (WAV), read with ``soundfile``.
        asr_segments: List of ASR segments; each item must provide
            ``"start"`` and ``"end"`` in seconds (they are multiplied by
            the sample rate to obtain sample indices).
        output_path: Optional path of a JSON file to write the result to.

    Returns:
        dict: On success, a dict with the keys ``audio_path``,
        ``total_duration``, ``n_speech_segments``, ``n_speakers``,
        ``segments``, ``embeddings``, ``speaker_stats``,
        ``processing_time`` and ``realtime_factor``. ``segments`` and
        ``embeddings`` are index-aligned and only cover ASR segments that
        start inside the audio. When there is nothing to process, a dict
        with an ``"error"`` message and an empty ``"segments"`` list.
    """
    start_time = time.time()

    print(f"\n[SelfASRX-Fixed] Processing with {len(asr_segments)} ASR segments: {audio_path}")
    print("=" * 60)

    # Nothing to diarize: bail out before paying for any audio I/O.
    if len(asr_segments) == 0:
        print(" Speech segments from ASR: 0")
        print("[SelfASRX-Fixed] No ASR segments provided!")
        return {"error": "No ASR segments", "segments": []}

    # Load the complete audio file.
    import soundfile as sf
    wav, sample_rate = sf.read(audio_path)
    if len(wav.shape) > 1:
        wav = np.mean(wav, axis=1)  # down-mix to mono

    print(f" Audio loaded: {len(wav)/sample_rate:.2f}s, {sample_rate}Hz")

    # ASR segment boundaries take the place of VAD output.
    requested_segments = [(s["start"], s["end"]) for s in asr_segments]
    print(f" Speech segments from ASR: {len(requested_segments)}")

    # Cut the audio for each segment. Segments that start beyond the end of
    # the audio are dropped from BOTH lists so that every kept time range
    # stays paired with its own audio slice, embedding and speaker label.
    speech_segments = []
    audio_segments = []
    for start_sec, end_sec in requested_segments:
        start_sample = int(start_sec * sample_rate)
        end_sample = int(end_sec * sample_rate)
        if start_sample >= len(wav):
            continue
        speech_segments.append((start_sec, end_sec))
        audio_segments.append(wav[start_sample:min(end_sample, len(wav))])

    print(f" Audio segments extracted: {len(audio_segments)}")

    # Every segment lay outside the audio: nothing to embed or cluster.
    if not audio_segments:
        print("[SelfASRX-Fixed] No ASR segment overlaps the audio!")
        return {"error": "No ASR segments within audio duration", "segments": []}

    # Batch-extract speaker embeddings.
    print("\n[Step 2] Speaker embedding extraction...")
    step2_start = time.time()

    embeddings = extract_speaker_embeddings_batch(
        self.speaker_encoder, audio_segments, sample_rate
    )
    embeddings = normalize_embeddings(embeddings)

    step2_time = time.time() - step2_start
    print(f" Embedding shape: {embeddings.shape}")
    print(f" Embedding time: {step2_time:.2f}s")

    # Cluster the embeddings; the number of speakers is estimated.
    print("\n[Step 3] Robust speaker clustering...")
    step3_start = time.time()

    speaker_labels, estimated_n_speakers = robust_speaker_clustering(
        embeddings, n_speakers=None, max_speakers=10
    )

    step3_time = time.time() - step3_start
    print(f" Clustering time: {step3_time:.2f}s")

    # Assemble the output.
    result = {
        "audio_path": str(audio_path),
        "total_duration": len(wav) / sample_rate,
        "n_speech_segments": len(speech_segments),
        "n_speakers": int(estimated_n_speakers),
        "segments": []
    }

    for i, ((start, end), label) in enumerate(zip(speech_segments, speaker_labels)):
        result["segments"].append({
            "index": i,
            "start": round(start, 3),
            "end": round(end, 3),
            "duration": round(end - start, 3),
            "speaker": f"SPEAKER_{int(label)}"
        })

    # One speaker embedding per segment, in the same order as "segments"
    # (the original note describes them as 192-D — TODO confirm).
    result["embeddings"] = [emb.tolist() for emb in embeddings]

    # Per-speaker segment count and accumulated duration.
    speaker_stats = {}
    for seg in result["segments"]:
        stats = speaker_stats.setdefault(seg["speaker"], {"count": 0, "duration": 0})
        stats["count"] += 1
        stats["duration"] += seg["duration"]

    result["speaker_stats"] = speaker_stats

    total_time = time.time() - start_time
    result["processing_time"] = round(total_time, 2)
    result["realtime_factor"] = round(result["total_duration"] / total_time, 2)

    print("\n[SelfASRX-Fixed] Processing completed!")
    print(f" Total time: {total_time:.2f}s")
    print(f" Realtime factor: {result['realtime_factor']:.2f}x")
    print(f" Detected speakers: {estimated_n_speakers}")

    if output_path:
        import json
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        print(f" Results saved to: {output_path}")

    print("=" * 60)

    return result
def main():
import argparse
@@ -180,14 +291,14 @@ def main():
)
if "error" not in result:
print(f"\n[Summary]")
print("\n[Summary]")
print(f" Audio duration: {result['total_duration']:.2f}s")
print(f" Speech segments: {result['n_speech_segments']}")
print(f" Detected speakers: {result['n_speakers']}")
print(f" Processing time: {result['processing_time']:.2f}s")
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
print(f"\n[Speaker Statistics]")
print("\n[Speaker Statistics]")
for speaker, stats in result['speaker_stats'].items():
pct = stats['duration'] / result['total_duration'] * 100
print(f" {speaker}: {stats['count']} segments, " +

View File

@@ -138,7 +138,7 @@ def spectral_clustering_speaker(
speaker_labels = clustering.fit_predict(similarity_matrix)
print(f"[Clustering] Spectral clustering completed")
print("[Clustering] Spectral clustering completed")
print(f"[Clustering] n_speakers: {n_speakers}")
print(f"[Clustering] n_segments: {n_segments}")
@@ -146,7 +146,7 @@ def spectral_clustering_speaker(
except Exception as e:
print(f"[Clustering] Spectral clustering failed: {e}")
print(f"[Clustering] Using fallback: 2 speakers")
print("[Clustering] Using fallback: 2 speakers")
# 簡單分配:前一半是 SPEAKER_0後一半是 SPEAKER_1
speaker_labels = np.array(
[0] * (n_segments // 2) + [1] * (n_segments - n_segments // 2)
@@ -203,7 +203,7 @@ def agglomerative_clustering_speaker(
speaker_labels = clustering.fit_predict(embeddings)
print(f"[Clustering] Agglomerative clustering completed")
print("[Clustering] Agglomerative clustering completed")
print(f"[Clustering] n_speakers: {n_speakers}")
return speaker_labels, n_speakers
@@ -249,7 +249,6 @@ def compute_diarization_purity(speaker_labels, ground_truth_labels=None):
"""
if ground_truth_labels is None:
# 沒有 ground truth使用聚類純度近似
from sklearn.metrics import silhouette_score
# 使用餘弦相似度作為距離
purity = 0.5 # 預設值
@@ -300,7 +299,7 @@ if __name__ == "__main__":
similarity, n_speakers=None, auto_estimate=True
)
print(f"\n[Test] Clustering results:")
print("\n[Test] Clustering results:")
print(f" True n_speakers: {n_speakers}")
print(f" Estimated n_speakers: {n_clusters}")
print(f" Unique labels: {np.unique(labels)}")

View File

@@ -6,7 +6,6 @@ Speaker Clustering - Fixed Version
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
def robust_speaker_clustering(embeddings, n_speakers=None, max_speakers=10):
@@ -57,7 +56,7 @@ def robust_speaker_clustering(embeddings, n_speakers=None, max_speakers=10):
# 統計每個聚類的大小
unique, counts = np.unique(speaker_labels, return_counts=True)
print(f"[Clustering] Cluster sizes:")
print("[Clustering] Cluster sizes:")
for label, count in zip(unique, counts):
print(f" SPEAKER_{label}: {count} segments ({count/n_segments*100:.1f}%)")
@@ -148,6 +147,6 @@ if __name__ == "__main__":
# 測試聚類
labels, n_clusters = robust_speaker_clustering(embeddings)
print(f"\nResult:")
print("\nResult:")
print(f" True n_speakers: {n_speakers}")
print(f" Estimated n_speakers: {n_clusters}")

View File

@@ -33,8 +33,8 @@ def load_speaker_encoder(model_name="speechbrain/spkrec-ecapa-voxceleb"):
)
# 獲取模型資訊
print(f"[SpeakerEncoder] Model loaded successfully")
print(f"[SpeakerEncoder] Embedding dimension: 192")
print("[SpeakerEncoder] Model loaded successfully")
print("[SpeakerEncoder] Embedding dimension: 192")
return classifier
@@ -187,5 +187,5 @@ if __name__ == "__main__":
print(f"[Test] Embedding std: {embedding.std():.4f}")
# 顯示部分嵌入值
print(f"\n[Test] First 10 embedding values:")
print("\n[Test] First 10 embedding values:")
print(f" {embedding[:10]}")

View File

@@ -11,7 +11,6 @@ import os
import threading
import time
from pathlib import Path
from typing import List, Dict
try:
import tkinter as tk

View File

@@ -11,7 +11,6 @@ import os
import threading
import time
from pathlib import Path
from typing import List, Dict
try:
import tkinter as tk
@@ -203,7 +202,7 @@ class SpeakerPlayerGUI:
self.face_path = filename
self.face_label.config(text=Path(filename).name)
self.integrate_button.config(state=tk.NORMAL)
self.status_label.config(text=f"✅ Face 已選擇 - 請點擊整合")
self.status_label.config(text="✅ Face 已選擇 - 請點擊整合")
def integrate_face(self):
"""整合 Face 與 ASRX"""

View File

@@ -93,14 +93,14 @@ def show_menu(speaker_segments: Dict[str, List[Dict]], speaker_id: str):
print(f" ... and {len(segs) - 20} more segments")
print(f"\n{'=' * 70}")
print(f"Commands:")
print("Commands:")
print(f" [1-{min(20, len(segs))}] Play specific segment")
print(f" all Play all segments (may take a while)")
print(f" first N Play first N segments")
print(f" next Next speaker")
print(f" prev Previous speaker")
print(f" list List all speakers")
print(f" quit Exit")
print(" all Play all segments (may take a while)")
print(" first N Play first N segments")
print(" next Next speaker")
print(" prev Previous speaker")
print(" list List all speakers")
print(" quit Exit")
print(f"{'=' * 70}")
@@ -132,7 +132,7 @@ def interactive_player(audio_path: str, result_path: str):
current_speaker_idx = 0
print(f"\n🎬 Speaker Audio Player")
print("\n🎬 Speaker Audio Player")
print(f"📁 Audio: {audio_path}")
print(f"📊 Speakers: {len(speakers)}")
print(f"{'=' * 70}")
@@ -159,7 +159,7 @@ def interactive_player(audio_path: str, result_path: str):
print(
f" ⏱️ {seg['start']:.2f}s - {seg['end']:.2f}s ({seg['duration']:.2f}s)"
)
print(f" ▶️ Playing...", end="", flush=True)
print(" ▶️ Playing...", end="", flush=True)
if extract_and_play(audio_path, seg["start"], seg["end"]):
print(" ✅ Done")
else:
@@ -220,7 +220,7 @@ def interactive_player(audio_path: str, result_path: str):
# 列出所有說話人
elif cmd == "list":
print(f"\n{'=' * 70}")
print(f"📢 All speakers:")
print("📢 All speakers:")
print(f"{'=' * 70}")
for i, speaker in enumerate(speakers, 1):
segs = speaker_segments[speaker]

View File

@@ -6,8 +6,6 @@ GUI Face Player 自動化測試腳本
import json
import subprocess
import time
import os
from pathlib import Path

View File

@@ -5,7 +5,6 @@
import json
import subprocess
import time
from pathlib import Path
from datetime import datetime
@@ -55,7 +54,7 @@ def test_asrx_results():
print(f"📊 語音片段:{n_segments}")
# 說話人統計
print(f"\n📢 說話人分佈:")
print("\n📢 說話人分佈:")
speaker_stats = data.get('speaker_stats', {})
for speaker, stats in sorted(speaker_stats.items(), key=lambda x: x[1]['duration'], reverse=True):
duration = stats.get('duration', 0)
@@ -102,7 +101,7 @@ def test_integration():
print(f"📊 匹配率:{match_rate:.2f}%")
# 說話人匹配統計
print(f"\n📢 說話人匹配詳情:")
print("\n📢 說話人匹配詳情:")
speaker_stats = data.get('speaker_stats', {})
for speaker, stats in sorted(speaker_stats.items()):
total_seg = stats.get('total_segments', 0)
@@ -164,7 +163,7 @@ def test_playback():
end = first_seg['end']
duration = end - start
print(f"\n🎵 測試提取第一個片段:")
print("\n🎵 測試提取第一個片段:")
print(f" 時間:{start:.2f}s - {end:.2f}s ({duration:.2f}s)")
# 實際提取測試
@@ -222,10 +221,10 @@ def generate_report():
# 保存報告
report_path = '/tmp/long_movie_test_report.md'
with open(report_path, 'w', encoding='utf-8') as f:
f.write(f"# 長影片測試報告\n\n")
f.write("# 長影片測試報告\n\n")
f.write(f"**測試時間**: {datetime.now().isoformat()}\n")
f.write(f"**測試影片**: Charade 1963 (114.7 分鐘)\n\n")
f.write(f"## 結果\n\n")
f.write("**測試影片**: Charade 1963 (114.7 分鐘)\n\n")
f.write("## 結果\n\n")
f.write(f"**通過**: {passed}/{total}\n\n")
for name, result in tests:
status = "" if result else ""

View File

@@ -9,7 +9,6 @@ VAD (Voice Activity Detection) - 語音活動檢測
"""
import torch
import numpy as np
def load_vad_model():
@@ -143,7 +142,7 @@ if __name__ == "__main__":
print(f"[VAD] Processing: {audio_path}")
segments, wav, sr = extract_speech_segments(audio_path, model, utils)
print(f"\n[VAD] Results:")
print("\n[VAD] Results:")
print(f" Sample rate: {sr} Hz")
print(f" Speech segments: {len(segments)}")
print(f" Total duration: {len(wav) / sr:.2f}s")
@@ -153,7 +152,7 @@ if __name__ == "__main__":
f" Total speech: {total_speech:.2f}s ({total_speech / (len(wav) / sr) * 100:.1f}%)"
)
print(f"\n[VAD] Segments:")
print("\n[VAD] Segments:")
for i, (start, end) in enumerate(segments[:10]):
print(f" {i + 1:3d}. {start:6.2f}s - {end:6.2f}s ({end - start:5.2f}s)")