feat: media API (video/bbox/thumbnail), UUID unification, dot matrix text, portal fixes, API dictionary V1.3

This commit is contained in:
Warren
2026-05-06 13:34:49 +08:00
parent e75c4d6f07
commit 74b6182eba
197 changed files with 17511 additions and 8759 deletions

View File

@@ -18,6 +18,7 @@ Configuration:
import sys
import json
import os
import time
import argparse
import signal
import subprocess
@@ -179,7 +180,11 @@ def run_asr(video_path, output_path, uuid: str = ""):
if publisher:
publisher.info("asr", "Loading Whisper model...")
sys.stderr.write(f"[ASR] Loading Whisper model {MODEL_SIZE}...\n")
sys.stderr.flush()
model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")
sys.stderr.write(f"[ASR] Model loaded\n")
sys.stderr.flush()
if publisher:
publisher.info("asr", f"Transcribing: {video_path}")
@@ -189,10 +194,13 @@ def run_asr(video_path, output_path, uuid: str = ""):
if cut_scenes:
# 分段處理:對每個場景萃取音訊並轉錄
sys.stderr.write(f"[ASR] Starting segmented transcription for {len(cut_scenes)} scenes\n")
sys.stderr.flush()
import subprocess
import tempfile
import json
temp_dir = tempfile.mkdtemp(prefix="asr_cut_")
sys.stderr.write(f"[ASR] Temp dir: {temp_dir}\n")
sys.stderr.flush()
transcript_language = None
# 建立 scene lookup: 給定時間點,找是哪個 scene
@@ -204,29 +212,79 @@ def run_asr(video_path, output_path, uuid: str = ""):
# 逐段處理,每段結果即時寫入 .asr.tmp
tmp_path = output_path + ".tmp"
err_path = output_path + ".err"
all_segments = []
# Resume: if the executor renamed .tmp to .err, recover it first
if not os.path.exists(tmp_path) and os.path.exists(err_path) and os.path.getsize(err_path) > 10:
try:
os.rename(err_path, tmp_path)
sys.stderr.write(f"[ASR] Recovered .err → .tmp for resume ({os.path.getsize(tmp_path)} bytes)\n")
sys.stderr.flush()
except Exception as e:
sys.stderr.write(f"[ASR] Failed to recover .err: {e}\n")
sys.stderr.flush()
# Resume: if a .asr.tmp file already exists, load its completed segments and skip the scenes already processed
resume_from_scene = 0
if os.path.exists(tmp_path) and os.path.getsize(tmp_path) > 10:
try:
with open(tmp_path) as f:
existing = json.load(f)
all_segments = existing.get("segments", [])
if all_segments:
# Take the latest segment end time to decide where to resume
last_end = max(s.get("end", 0) for s in all_segments)
# Resume at the first scene whose end time is later than last_end
for i, (st, et) in enumerate(cut_scenes):
if et > last_end:
resume_from_scene = i
break
else:
resume_from_scene = len(cut_scenes) # 全部完成
# 繼承 language
if existing.get("language"):
transcript_language = existing["language"]
sys.stderr.write(f"[ASR] Resume from scene {resume_from_scene}/{len(cut_scenes)} "
f"(last segment end={last_end:.1f}s, {len(all_segments)} existing segments)\n")
sys.stderr.flush()
except Exception as e:
sys.stderr.write(f"[ASR] Failed to load tmp for resume: {e}, starting fresh\n")
sys.stderr.flush()
all_segments = []
for idx, (start_t, end_t) in enumerate(cut_scenes):
if idx < resume_from_scene:
continue # 跳過已處理的 scenes
seg_wav = os.path.join(temp_dir, f"seg_{idx:04d}.wav")
sys.stderr.write(f"[ASR] Scene {idx}: {start_t:.1f}-{end_t:.1f}s\n")
sys.stderr.flush()
# 用 ffmpeg 萃取出該段音訊
t0 = time.time()
cmd = ["ffmpeg", "-y", "-v", "quiet", "-i", video_path,
"-ss", str(start_t), "-to", str(end_t),
"-ar", "16000", "-ac", "1", seg_wav]
subprocess.run(cmd, check=False, capture_output=True)
sys.stderr.write(f"[ASR] Scene {idx}: ffmpeg took {time.time()-t0:.1f}s\n")
sys.stderr.flush()
if not os.path.exists(seg_wav) or os.path.getsize(seg_wav) < 100:
continue # 跳過空音訊
sys.stderr.write(f"[ASR] Scene {idx}: empty audio, skipping\n")
sys.stderr.flush()
continue
try:
t1 = time.time()
seg_result, seg_info = model.transcribe(
seg_wav, beam_size=5,
vad_filter=True,
vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200),
)
if transcript_language is None:
transcript_language = seg_info.language
sys.stderr.write(f"[ASR] Scene {idx}: transcribe took {time.time()-t1:.1f}s, language={seg_info.language}\n")
sys.stderr.flush()
scene_segments = []
seg_language = seg_info.language if seg_info else transcript_language
for segment in seg_result:
seg_start = start_t + segment.start
seg_end = start_t + segment.end
@@ -236,6 +294,7 @@ def run_asr(video_path, output_path, uuid: str = ""):
"end": seg_end,
"text": segment.text.strip(),
"scene_number": scene_idx + 1,
"language": seg_language,
})
total_segments += 1