feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
--- a/scripts/asr_processor.py
+++ b/scripts/asr_processor.py
@@ -1,12 +1,36 @@
 #!/opt/homebrew/bin/python3.11
+"""
+ASR Processor - faster-whisper small model (Production)
+
+Version: 2.1
+Model: small (int8 quantization, CPU)
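+Reason: the small model gives the best balance between accuracy and speed;
+        tests show at least small is needed for multilingual and Taiwanese-accented Mandarin speech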
+
+Configuration:
+- Model: faster-whisper/small
+- Device: CPU (MPS not supported by faster_whisper)
+- Compute: int8
+- Beam size: 5
+- VAD filter: enabled (min_silence=500ms, speech_pad=200ms)
+- Audio fallback: ffmpeg extraction for PyAV-incompatible streams (v2.1)
+"""
 import sys
 import json
 import os
+import time
 import argparse
 import signal
 import subprocess
+import tempfile
+from datetime import datetime
 from faster_whisper import WhisperModel

+PROCESSOR_VERSION = "2.1"  # same value as "Version" in the module docstring
+MODEL_SIZE = "small"  # faster-whisper model size named in the module docstring
+DEVICE = "cpu"  # module docstring: MPS is not supported by faster_whisper
+COMPUTE_TYPE = "int8"  # int8 quantization, per the module docstring
+
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 from redis_publisher import RedisPublisher

@@ -40,6 +64,84 @@ def has_audio_stream(video_path):
        return True


+def extract_audio_with_ffmpeg(video_path):
+    """Extract a video's audio track to a temporary WAV file using ffmpeg.
+
+    Returns the 16 kHz mono PCM WAV path (caller deletes it), or None on failure.
+    """
+    # mkstemp creates the file itself, so the name cannot be raced or guessed.
+    fd, wav_path = tempfile.mkstemp(suffix=".wav", prefix="asr_audio_")
+    os.close(fd)  # ffmpeg reopens the path; "-y" lets it overwrite the stub
+    cmd = [
+        "ffmpeg", "-y", "-i", video_path, "-vn",
+        "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", wav_path,
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        sys.stderr.write(f"ASR: ffmpeg extraction failed: {result.stderr}\n")
+        sys.stderr.flush()
+        try:
+            os.remove(wav_path)  # do not leak the temp file on failure
+        except OSError:
+            pass
+        return None
+    return wav_path
+
+
+def transcribe_with_fallback(model, video_path, publisher=None):
+    """Transcribe a video, retrying on an ffmpeg-extracted WAV if PyAV fails.
+
+    The path is first passed to ``model.transcribe`` directly. If that raises
+    an error whose message looks like a PyAV decode failure, the audio is
+    extracted with ffmpeg and the temporary WAV is transcribed instead.
+    ``publisher`` is optional and only receives progress messages.
+
+    Returns the ``(segments, info)`` result of ``model.transcribe``. Raises
+    RuntimeError (chained to the decode error) if ffmpeg extraction fails;
+    errors that do not look like decode failures propagate unchanged.
+    """
+    # One shared option set keeps the direct and fallback passes identical.
+    options = dict(
+        beam_size=5,
+        vad_filter=True,
+        vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200),
+    )
+
+    try:
+        if publisher:
+            publisher.info("asr", "Direct transcription attempt...")
+        return model.transcribe(video_path, **options)
+    except Exception as e:
+        # Decode failures are recognised purely by keywords in the message.
+        message = str(e).lower()
+        markers = ("av.error", "avcodec", "decode", "packet")
+        if not any(marker in message for marker in markers):
+            raise  # not a decode problem: propagate unchanged
+
+        if publisher:
+            publisher.info("asr", "PyAV decode failed, falling back to ffmpeg extraction...")
+        sys.stderr.write("ASR: PyAV decode error detected, falling back to ffmpeg extraction\n")
+        sys.stderr.flush()
+
+        wav_path = extract_audio_with_ffmpeg(video_path)
+        if wav_path is None:
+            raise RuntimeError("Failed to extract audio with ffmpeg") from e
+
+        try:
+            if publisher:
+                publisher.info("asr", "Transcribing extracted WAV audio...")
+            segments, info = model.transcribe(wav_path, **options)
+            return segments, info
+        finally:
+            # Best-effort removal of the scratch WAV.
+            # NOTE(review): assumes transcribe() has finished reading the file
+            # by the time it returns -- confirm against faster-whisper.
+            try:
+                os.remove(wav_path)
+            except OSError:
+                pass
+
+
 def run_asr(video_path, output_path, uuid: str = ""):
    # Set up signal handlers
    signal.signal(signal.SIGTERM, signal_handler)
@@ -72,13 +174,8 @@ def run_asr(video_path, output_path, uuid: str = ""):
    if publisher:
        publisher.info("asr", f"Transcribing: {video_path}")

-    # Transcribe with VAD filter for better accuracy
-    segments, info = model.transcribe(
-        video_path,
-        beam_size=5,
-        vad_filter=True,
-        vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200),
-    )
+    # Transcribe with VAD filter for better accuracy, with PyAV fallback
+    segments, info = transcribe_with_fallback(model, video_path, publisher)

    if publisher:
        publisher.info("asr", f"ASR_LANGUAGE:{info.language}")