cleanup: remove dead code and duplicate docs

- Remove session-ses_2f27.md (161KB raw session log)
- Remove 49 ROOT_* duplicate files across REFERENCE/
- Remove 14 duplicate files between REFERENCE/ root and history/
- Remove asr_legacy.rs (dead code, replaced by asr.rs)
- Remove src/core/worker/ (duplicate JobWorker)
- Remove src/core/layers/ (empty directory)
- Remove 4 .bak files in src/
- Remove 7 dead private methods in worker/processor.rs
- Remove backup directory from git tracking
2026-05-04 01:31:21 +08:00
parent ee81e343ce
commit e75c4d6f07
3270 changed files with 35190 additions and 53367 deletions
--- a/scripts/asrx_self/main_fixed.py
+++ b/scripts/asrx_self/main_fixed.py
@@ -134,7 +134,7 @@ class SelfASRXFixed:
        result["processing_time"] = round(total_time, 2)
        result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
        
-        print(f"\n[SelfASRX-Fixed] Processing completed!")
+        print("\n[SelfASRX-Fixed] Processing completed!")
        print(f"  Total time: {total_time:.2f}s")
        print(f"  Realtime factor: {result['realtime_factor']:.2f}x")
        print(f"  Detected speakers: {estimated_n_speakers}")
@@ -154,6 +154,117 @@ class SelfASRXFixed:
        return result


+    def process_with_segments(self, audio_path, asr_segments, output_path=None):
+        """Run speaker diarization on ASR segment boundaries instead of VAD.
+
+        Args:
+            audio_path: Path to the audio file (WAV).
+            asr_segments: List of ASR segments, each with "start"/"end" in seconds.
+            output_path: Optional path of a JSON file to write the result to.
+
+        Returns:
+            Result dict with "segments", "embeddings" (same order as "segments")
+            and "speaker_stats", or a dict with an "error" key when no segment
+            is usable. Segments with no samples inside the audio are dropped;
+            "index" is the segment's position in asr_segments.
+        """
+        start_time = time.time()
+        print(f"\n[SelfASRX-Fixed] Processing with {len(asr_segments)} ASR segments: {audio_path}")
+        print("=" * 60)
+
+        # Load the full audio.
+        import soundfile as sf
+        wav, sample_rate = sf.read(audio_path)
+        if len(wav.shape) > 1:
+            wav = np.mean(wav, axis=1)  # down-mix to mono
+        print(f"  Audio loaded: {len(wav)/sample_rate:.2f}s, {sample_rate}Hz")
+
+        print(f"  Speech segments from ASR: {len(asr_segments)}")
+        if not asr_segments:
+            print("[SelfASRX-Fixed] No ASR segments provided!")
+            return {"error": "No ASR segments", "segments": []}
+
+        # Collect each kept segment together with its audio slice so a skipped
+        # segment cannot shift the labels of the segments that follow it.
+        speech_segments = []  # (index in asr_segments, start sec, end sec)
+        audio_segments = []
+        for asr_index, seg in enumerate(asr_segments):
+            start_sample = max(int(seg["start"] * sample_rate), 0)
+            end_sample = min(int(seg["end"] * sample_rate), len(wav))
+            if start_sample >= end_sample:
+                continue  # lies outside the audio or contains no samples
+            speech_segments.append((asr_index, seg["start"], seg["end"]))
+            audio_segments.append(wav[start_sample:end_sample])
+        print(f"  Audio segments extracted: {len(audio_segments)}")
+        if not audio_segments:
+            print("[SelfASRX-Fixed] No usable ASR segments!")
+            return {"error": "No usable ASR segments", "segments": []}
+
+        # Batch speaker-embedding extraction.
+        print("\n[Step 2] Speaker embedding extraction...")
+        step2_start = time.time()
+        embeddings = extract_speaker_embeddings_batch(
+            self.speaker_encoder, audio_segments, sample_rate
+        )
+        embeddings = normalize_embeddings(embeddings)
+        step2_time = time.time() - step2_start
+        print(f"  Embedding shape: {embeddings.shape}")
+        print(f"  Embedding time: {step2_time:.2f}s")
+
+        # Clustering.
+        print("\n[Step 3] Robust speaker clustering...")
+        step3_start = time.time()
+        speaker_labels, estimated_n_speakers = robust_speaker_clustering(
+            embeddings, n_speakers=None, max_speakers=10
+        )
+        step3_time = time.time() - step3_start
+        print(f"  Clustering time: {step3_time:.2f}s")
+
+        # Build the output.
+        result = {
+            "audio_path": str(audio_path),
+            "total_duration": len(wav) / sample_rate,
+            "n_speech_segments": len(speech_segments),
+            "n_speakers": int(estimated_n_speakers),
+            "segments": []
+        }
+        for (asr_index, start, end), label in zip(speech_segments, speaker_labels):
+            result["segments"].append({
+                "index": asr_index,
+                "start": round(start, 3),
+                "end": round(end, 3),
+                "duration": round(end - start, 3),
+                "speaker": f"SPEAKER_{int(label)}"
+            })
+
+        # One speaker embedding per kept segment, in the same order as "segments".
+        result["embeddings"] = [emb.tolist() for emb in embeddings]
+
+        # Per-speaker statistics.
+        speaker_stats = {}
+        for seg in result["segments"]:
+            stats = speaker_stats.setdefault(seg["speaker"], {"count": 0, "duration": 0})
+            stats["count"] += 1
+            stats["duration"] += seg["duration"]
+        result["speaker_stats"] = speaker_stats
+
+        total_time = time.time() - start_time
+        result["processing_time"] = round(total_time, 2)
+        result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
+
+        print("\n[SelfASRX-Fixed] Processing completed!")
+        print(f"  Total time: {total_time:.2f}s")
+        print(f"  Realtime factor: {result['realtime_factor']:.2f}x")
+        print(f"  Detected speakers: {estimated_n_speakers}")
+        if output_path:
+            import json
+            with open(output_path, 'w', encoding='utf-8') as f:
+                json.dump(result, f, indent=2, ensure_ascii=False)
+            print(f"  Results saved to: {output_path}")
+        print("=" * 60)
+        return result
+
+
 def main():
    import argparse
    
@@ -180,14 +291,14 @@ def main():
    )
    
    if "error" not in result:
-        print(f"\n[Summary]")
+        print("\n[Summary]")
        print(f"  Audio duration: {result['total_duration']:.2f}s")
        print(f"  Speech segments: {result['n_speech_segments']}")
        print(f"  Detected speakers: {result['n_speakers']}")
        print(f"  Processing time: {result['processing_time']:.2f}s")
        print(f"  Realtime factor: {result['realtime_factor']:.2f}x")
        
-        print(f"\n[Speaker Statistics]")
+        print("\n[Speaker Statistics]")
        for speaker, stats in result['speaker_stats'].items():
            pct = stats['duration'] / result['total_duration'] * 100
            print(f"  {speaker}: {stats['count']} segments, " +