cleanup: remove dead code and duplicate docs

- Remove session-ses_2f27.md (161KB raw session log)
- Remove 49 ROOT_* duplicate files across REFERENCE/
- Remove 14 duplicate files between REFERENCE/ root and history/
- Remove asr_legacy.rs (dead code, replaced by asr.rs)
- Remove src/core/worker/ (duplicate JobWorker)
- Remove src/core/layers/ (empty directory)
- Remove 4 .bak files in src/
- Remove 7 dead private methods in worker/processor.rs
- Remove backup directory from git tracking
This commit is contained in:
Warren
2026-05-04 01:31:21 +08:00
parent ee81e343ce
commit e75c4d6f07
3270 changed files with 35190 additions and 53367 deletions

View File

@@ -134,7 +134,7 @@ class SelfASRXFixed:
result["processing_time"] = round(total_time, 2)
result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
print(f"\n[SelfASRX-Fixed] Processing completed!")
print("\n[SelfASRX-Fixed] Processing completed!")
print(f" Total time: {total_time:.2f}s")
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
print(f" Detected speakers: {estimated_n_speakers}")
@@ -154,6 +154,117 @@ class SelfASRXFixed:
return result
def process_with_segments(self, audio_path, asr_segments, output_path=None):
    """
    Run speaker diarization using ASR segment boundaries instead of the VAD step.

    Each ASR segment is cut out of the audio, embedded with
    ``self.speaker_encoder``, and the embeddings are clustered into speakers.

    Args:
        audio_path: Path to the audio file (WAV), read with ``soundfile``.
        asr_segments: Sequence of ASR segments; each item must provide
            ``"start"`` and ``"end"`` (seconds).
        output_path: Optional path; when given, the result is also written
            there as UTF-8 JSON.

    Returns:
        dict: On success, contains ``audio_path``, ``total_duration``,
        ``n_speech_segments``, ``n_speakers``, ``segments``, ``embeddings``,
        ``speaker_stats``, ``processing_time`` and ``realtime_factor``.
        ``segments[k]`` and ``embeddings[k]`` always describe the same
        segment; ``segments[k]["index"]`` is that segment's position in
        ``asr_segments``. ASR segments that start beyond the end of the audio
        are omitted from both lists. When there is nothing to process, a dict
        with an ``"error"`` key and an empty ``"segments"`` list is returned.

    Raises:
        KeyError: If an ASR segment lacks ``"start"`` or ``"end"``.
    """
    start_time = time.time()

    print(f"\n[SelfASRX-Fixed] Processing with {len(asr_segments)} ASR segments: {audio_path}")
    print("=" * 60)

    # Validate before decoding the audio: with no segments there is no work,
    # so avoid reading a potentially large file.
    if len(asr_segments) == 0:
        print("[SelfASRX-Fixed] No ASR segments provided!")
        return {"error": "No ASR segments", "segments": []}

    # Load the complete audio.
    import soundfile as sf
    wav, sample_rate = sf.read(audio_path)
    if len(wav.shape) > 1:
        wav = np.mean(wav, axis=1)  # down-mix to mono
    print(f" Audio loaded: {len(wav)/sample_rate:.2f}s, {sample_rate}Hz")

    # The ASR segments take the place of VAD output.
    print(f" Speech segments from ASR: {len(asr_segments)}")

    # Cut out the audio for each segment. Segments starting past the end of
    # the audio are skipped; the bookkeeping lists below are only appended to
    # together, so boundaries, audio, embeddings and labels stay aligned.
    n_samples = len(wav)
    kept_indices = []
    speech_segments = []
    audio_segments = []
    for asr_index, seg in enumerate(asr_segments):
        start_sec, end_sec = seg["start"], seg["end"]
        start_sample = int(start_sec * sample_rate)
        end_sample = int(end_sec * sample_rate)
        if start_sample >= n_samples:
            continue
        kept_indices.append(asr_index)
        speech_segments.append((start_sec, end_sec))
        audio_segments.append(wav[start_sample:min(end_sample, n_samples)])
    print(f" Audio segments extracted: {len(audio_segments)}")

    # Every segment was out of range: nothing to embed or cluster.
    if not audio_segments:
        print("[SelfASRX-Fixed] No ASR segments within audio range!")
        return {"error": "No ASR segments within audio range", "segments": []}

    # Batch speaker-embedding extraction.
    print("\n[Step 2] Speaker embedding extraction...")
    step2_start = time.time()
    embeddings = extract_speaker_embeddings_batch(
        self.speaker_encoder, audio_segments, sample_rate
    )
    embeddings = normalize_embeddings(embeddings)
    step2_time = time.time() - step2_start
    print(f" Embedding shape: {embeddings.shape}")
    print(f" Embedding time: {step2_time:.2f}s")

    # Clustering (speaker count is estimated, capped at 10).
    print("\n[Step 3] Robust speaker clustering...")
    step3_start = time.time()
    speaker_labels, estimated_n_speakers = robust_speaker_clustering(
        embeddings, n_speakers=None, max_speakers=10
    )
    step3_time = time.time() - step3_start
    print(f" Clustering time: {step3_time:.2f}s")

    # Build the output structure.
    result = {
        "audio_path": str(audio_path),
        "total_duration": len(wav) / sample_rate,
        "n_speech_segments": len(speech_segments),
        "n_speakers": int(estimated_n_speakers),
        "segments": []
    }

    for asr_index, (start, end), label in zip(kept_indices, speech_segments, speaker_labels):
        result["segments"].append({
            "index": asr_index,
            "start": round(start, 3),
            "end": round(end, 3),
            "duration": round(end - start, 3),
            "speaker": f"SPEAKER_{int(label)}"
        })

    # One speaker embedding per output segment, in the same order
    # (192-D according to the original author's note).
    result["embeddings"] = [emb.tolist() for emb in embeddings]

    # Per-speaker statistics: segment count and summed (rounded) duration.
    speaker_stats = {}
    for seg in result["segments"]:
        speaker = seg["speaker"]
        if speaker not in speaker_stats:
            speaker_stats[speaker] = {"count": 0, "duration": 0}
        speaker_stats[speaker]["count"] += 1
        speaker_stats[speaker]["duration"] += seg["duration"]
    result["speaker_stats"] = speaker_stats

    total_time = time.time() - start_time
    result["processing_time"] = round(total_time, 2)
    result["realtime_factor"] = round(result["total_duration"] / total_time, 2)

    print("\n[SelfASRX-Fixed] Processing completed!")
    print(f" Total time: {total_time:.2f}s")
    print(f" Realtime factor: {result['realtime_factor']:.2f}x")
    print(f" Detected speakers: {estimated_n_speakers}")

    if output_path:
        import json
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        print(f" Results saved to: {output_path}")

    print("=" * 60)

    return result
def main():
import argparse
@@ -180,14 +291,14 @@ def main():
)
if "error" not in result:
print(f"\n[Summary]")
print("\n[Summary]")
print(f" Audio duration: {result['total_duration']:.2f}s")
print(f" Speech segments: {result['n_speech_segments']}")
print(f" Detected speakers: {result['n_speakers']}")
print(f" Processing time: {result['processing_time']:.2f}s")
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
print(f"\n[Speaker Statistics]")
print("\n[Speaker Statistics]")
for speaker, stats in result['speaker_stats'].items():
pct = stats['duration'] / result['total_duration'] * 100
print(f" {speaker}: {stats['count']} segments, " +