feat: ASRX hybrid pipeline, identity history, worker fixes, checkpoint system

This commit is contained in:
Accusys
2026-06-02 07:13:23 +08:00
parent e3066c3f49
commit e1572907ae
198 changed files with 43705 additions and 8910 deletions

View File

@@ -126,6 +126,52 @@ def extract_speech_audio(audio_path, model, utils, output_dir=None):
return speech_audios, speech_segments
def scan_within_segment(wav, sample_rate, start_sec, end_sec, model, utils,
                        min_speech_duration_ms=500, min_silence_duration_ms=300):
    """Run VAD inside one time window of ``wav`` and return its sub-segments.

    Intended use: a coarse time span (e.g. one produced by whisper) is split
    further at the pauses between sentences.

    Args:
        wav: Full audio waveform, sliceable along its first axis and
            supporting ``len()`` (described by the original author as a
            numpy array).
        sample_rate: Sampling rate of ``wav`` in Hz.
        start_sec: Window start, in seconds on the original timeline.
        end_sec: Window end, in seconds on the original timeline.
        model: VAD model, forwarded untouched to ``get_speech_timestamps``.
        utils: VAD helper collection; only its first entry
            (``get_speech_timestamps``) is used.
        min_speech_duration_ms: Minimum speech duration forwarded to the VAD.
        min_silence_duration_ms: Minimum silence duration forwarded to the VAD.

    Returns:
        A list of ``(start_sec, end_sec)`` tuples on the original timeline.
        The list is empty when the window contains no samples (empty,
        inverted, or entirely outside the waveform).
    """
    # Index instead of unpacking exactly five names, so a helper collection
    # of a different length does not raise a ValueError here.
    get_speech_timestamps = utils[0]

    # Clamp the window to the waveform. Without this a negative bound would be
    # interpreted by slicing as "count from the end" and scan the wrong audio.
    total_samples = len(wav)
    start_sample = max(int(start_sec * sample_rate), 0)
    end_sample = min(max(int(end_sec * sample_rate), 0), total_samples)
    if end_sample <= start_sample:
        # Nothing to scan; skip the VAD call instead of feeding it empty audio.
        return []

    # Offset used to map window-relative times back to the original timeline.
    # It follows the clamped start so a negative start_sec maps to 0.
    offset_sec = start_sec if start_sec > 0 else 0.0

    segment_wav = wav[start_sample:end_sample]

    # return_seconds=True: the timestamps are treated as seconds relative to
    # the start of segment_wav.
    speech_ts = get_speech_timestamps(
        segment_wav,
        model,
        sampling_rate=sample_rate,
        min_speech_duration_ms=min_speech_duration_ms,
        min_silence_duration_ms=min_silence_duration_ms,
        return_seconds=True,
    )

    # Shift back onto the original timeline.
    return [
        (ts["start"] + offset_sec, ts["end"] + offset_sec)
        for ts in speech_ts
    ]
if __name__ == "__main__":
# 測試 VAD
import sys