feat: ASRX hybrid pipeline, identity history, worker fixes, checkpoint system
This commit is contained in:
@@ -126,6 +126,52 @@ def extract_speech_audio(audio_path, model, utils, output_dir=None):
|
||||
return speech_audios, speech_segments
|
||||
|
||||
|
||||
def scan_within_segment(wav, sample_rate, start_sec, end_sec, model, utils,
                        min_speech_duration_ms=500, min_silence_duration_ms=300):
    """
    Run a VAD scan inside one time window and split it into sub-segments.

    Intended use: given a coarse time span (e.g. one produced by whisper),
    use the pauses between sentences to cut it into finer pieces.

    Args:
        wav: Full audio waveform (1-D sequence of samples, e.g. a numpy array).
        sample_rate: Sampling rate of ``wav`` in Hz.
        start_sec: Start of the window to scan, in seconds.
        end_sec: End of the window to scan, in seconds.
        model: VAD model, passed through to ``get_speech_timestamps``.
        utils: VAD utility tuple; its first element must be the
            ``get_speech_timestamps`` callable.
        min_speech_duration_ms: Minimum speech duration forwarded to the VAD.
        min_silence_duration_ms: Minimum silence duration forwarded to the VAD.

    Returns:
        List of ``(start_sec, end_sec)`` tuples on the original timeline.
        Empty when the window does not overlap the waveform.
    """
    # Only the first helper is needed; indexing (rather than unpacking a
    # fixed-size tuple) tolerates utility tuples of a different length.
    get_speech_timestamps = utils[0]

    # Clamp the window to the waveform. A negative start index would
    # otherwise be interpreted as "count from the end" by the slice.
    start_sample = max(0, int(start_sec * sample_rate))
    end_sample = min(len(wav), int(end_sec * sample_rate))
    if end_sample <= start_sample:
        # Nothing to scan; do not hand an empty buffer to the VAD.
        return []

    segment_wav = wav[start_sample:end_sample]

    # Run the VAD on the extracted window only.
    speech_ts = get_speech_timestamps(
        segment_wav,
        model,
        sampling_rate=sample_rate,
        min_speech_duration_ms=min_speech_duration_ms,
        min_silence_duration_ms=min_silence_duration_ms,
        return_seconds=True,
    )

    # Map window-relative times back to the original timeline, using the
    # time of the sample where the slice really starts (after clamping and
    # truncation to a whole sample).
    offset_sec = start_sample / sample_rate
    return [
        (ts["start"] + offset_sec, ts["end"] + offset_sec)
        for ts in speech_ts
    ]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 測試 VAD
|
||||
import sys
|
||||
|
||||
Reference in New Issue
Block a user