feat: media API (video/bbox/thumbnail), UUID unification, dot matrix text, portal fixes, API dictionary V1.3
This commit is contained in:
@@ -18,6 +18,7 @@ Configuration:
|
||||
import sys
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import argparse
|
||||
import signal
|
||||
import subprocess
|
||||
@@ -179,7 +180,11 @@ def run_asr(video_path, output_path, uuid: str = ""):
|
||||
if publisher:
|
||||
publisher.info("asr", "Loading Whisper model...")
|
||||
|
||||
sys.stderr.write(f"[ASR] Loading Whisper model {MODEL_SIZE}...\n")
|
||||
sys.stderr.flush()
|
||||
model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")
|
||||
sys.stderr.write(f"[ASR] Model loaded\n")
|
||||
sys.stderr.flush()
|
||||
|
||||
if publisher:
|
||||
publisher.info("asr", f"Transcribing: {video_path}")
|
||||
@@ -189,10 +194,13 @@ def run_asr(video_path, output_path, uuid: str = ""):
|
||||
|
||||
if cut_scenes:
|
||||
# 分段處理:對每個場景萃取音訊並轉錄
|
||||
sys.stderr.write(f"[ASR] Starting segmented transcription for {len(cut_scenes)} scenes\n")
|
||||
sys.stderr.flush()
|
||||
import subprocess
|
||||
import tempfile
|
||||
import json
|
||||
temp_dir = tempfile.mkdtemp(prefix="asr_cut_")
|
||||
sys.stderr.write(f"[ASR] Temp dir: {temp_dir}\n")
|
||||
sys.stderr.flush()
|
||||
transcript_language = None
|
||||
|
||||
# 建立 scene lookup: 給定時間點,找是哪個 scene
|
||||
@@ -204,29 +212,79 @@ def run_asr(video_path, output_path, uuid: str = ""):
|
||||
|
||||
# 逐段處理,每段結果即時寫入 .asr.tmp
|
||||
tmp_path = output_path + ".tmp"
|
||||
err_path = output_path + ".err"
|
||||
all_segments = []
|
||||
|
||||
# Resume: 若 executor 將 .tmp rename 成 .err,先救回
|
||||
if not os.path.exists(tmp_path) and os.path.exists(err_path) and os.path.getsize(err_path) > 10:
|
||||
try:
|
||||
os.rename(err_path, tmp_path)
|
||||
sys.stderr.write(f"[ASR] Recovered .err → .tmp for resume ({os.path.getsize(tmp_path)} bytes)\n")
|
||||
sys.stderr.flush()
|
||||
except Exception as e:
|
||||
sys.stderr.write(f"[ASR] Failed to recover .err: {e}\n")
|
||||
sys.stderr.flush()
|
||||
|
||||
# Resume: 若已有 .asr.tmp,載入已完成的 segments 並跳過已處理的 scenes
|
||||
resume_from_scene = 0
|
||||
if os.path.exists(tmp_path) and os.path.getsize(tmp_path) > 10:
|
||||
try:
|
||||
with open(tmp_path) as f:
|
||||
existing = json.load(f)
|
||||
all_segments = existing.get("segments", [])
|
||||
if all_segments:
|
||||
# 找出最後一個 segment 的 end_time,決定 resume 起點
|
||||
last_end = max(s.get("end", 0) for s in all_segments)
|
||||
# 找出最後完成的 scene_idx(場景 end_time > last_end)
|
||||
for i, (st, et) in enumerate(cut_scenes):
|
||||
if et > last_end:
|
||||
resume_from_scene = i
|
||||
break
|
||||
else:
|
||||
resume_from_scene = len(cut_scenes) # 全部完成
|
||||
# 繼承 language
|
||||
if existing.get("language"):
|
||||
transcript_language = existing["language"]
|
||||
sys.stderr.write(f"[ASR] Resume from scene {resume_from_scene}/{len(cut_scenes)} "
|
||||
f"(last segment end={last_end:.1f}s, {len(all_segments)} existing segments)\n")
|
||||
sys.stderr.flush()
|
||||
except Exception as e:
|
||||
sys.stderr.write(f"[ASR] Failed to load tmp for resume: {e}, starting fresh\n")
|
||||
sys.stderr.flush()
|
||||
all_segments = []
|
||||
|
||||
for idx, (start_t, end_t) in enumerate(cut_scenes):
|
||||
if idx < resume_from_scene:
|
||||
continue # 跳過已處理的 scenes
|
||||
seg_wav = os.path.join(temp_dir, f"seg_{idx:04d}.wav")
|
||||
sys.stderr.write(f"[ASR] Scene {idx}: {start_t:.1f}-{end_t:.1f}s\n")
|
||||
sys.stderr.flush()
|
||||
# 用 ffmpeg 萃取出該段音訊
|
||||
t0 = time.time()
|
||||
cmd = ["ffmpeg", "-y", "-v", "quiet", "-i", video_path,
|
||||
"-ss", str(start_t), "-to", str(end_t),
|
||||
"-ar", "16000", "-ac", "1", seg_wav]
|
||||
subprocess.run(cmd, check=False, capture_output=True)
|
||||
sys.stderr.write(f"[ASR] Scene {idx}: ffmpeg took {time.time()-t0:.1f}s\n")
|
||||
sys.stderr.flush()
|
||||
|
||||
if not os.path.exists(seg_wav) or os.path.getsize(seg_wav) < 100:
|
||||
continue # 跳過空音訊
|
||||
sys.stderr.write(f"[ASR] Scene {idx}: empty audio, skipping\n")
|
||||
sys.stderr.flush()
|
||||
continue
|
||||
|
||||
try:
|
||||
t1 = time.time()
|
||||
seg_result, seg_info = model.transcribe(
|
||||
seg_wav, beam_size=5,
|
||||
vad_filter=True,
|
||||
vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200),
|
||||
)
|
||||
if transcript_language is None:
|
||||
transcript_language = seg_info.language
|
||||
sys.stderr.write(f"[ASR] Scene {idx}: transcribe took {time.time()-t1:.1f}s, language={seg_info.language}\n")
|
||||
sys.stderr.flush()
|
||||
|
||||
scene_segments = []
|
||||
seg_language = seg_info.language if seg_info else transcript_language
|
||||
for segment in seg_result:
|
||||
seg_start = start_t + segment.start
|
||||
seg_end = start_t + segment.end
|
||||
@@ -236,6 +294,7 @@ def run_asr(video_path, output_path, uuid: str = ""):
|
||||
"end": seg_end,
|
||||
"text": segment.text.strip(),
|
||||
"scene_number": scene_idx + 1,
|
||||
"language": seg_language,
|
||||
})
|
||||
total_segments += 1
|
||||
|
||||
|
||||
Reference in New Issue
Block a user