feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
--- a/scripts/asr_processor_base.py
+++ b/scripts/asr_processor_base.py
@@ -0,0 +1,119 @@
+#!/opt/homebrew/bin/python3.11
+import sys
+import json
+import os
+import argparse
+import signal
+import subprocess
+from faster_whisper import WhisperModel
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from redis_publisher import RedisPublisher
+
+
+def signal_handler(signum, frame):
+    """Announce the received signal on stdout and exit with status 1.
+
+    Args:
+        signum: Number of the signal that was delivered.
+        frame: Stack frame passed by the signal machinery (unused).
+
+    Raises:
+        SystemExit: Always, with exit code 1.
+    """
+    notice = f"ASR: Received signal {signum}, exiting..."
+    print(notice)
+    raise SystemExit(1)
+
+
+def has_audio_stream(video_path):
+    """Ask ffprobe whether *video_path* contains an audio stream.
+
+    Returns:
+        True when ffprobe prints at least one non-blank entry for the
+        selected streams, False when it prints nothing or exits with a
+        non-zero status. If launching ffprobe raises FileNotFoundError, a
+        warning is printed and True is returned so the caller proceeds.
+    """
+    probe_args = ["ffprobe", "-v", "error"]
+    probe_args += ["-select_streams", "a"]
+    probe_args += ["-show_entries", "stream=codec_type"]
+    probe_args += ["-of", "csv=p=0"]
+    probe_args.append(video_path)
+
+    try:
+        probe = subprocess.run(
+            probe_args, capture_output=True, text=True, check=True
+        )
+    except subprocess.CalledProcessError:
+        # ffprobe ran but reported failure: treated as "no audio".
+        return False
+    except FileNotFoundError:
+        print("WARNING: ffprobe not found, assuming audio exists")
+        return True
+    else:
+        return probe.stdout.strip() != ""
+
+
+def run_asr(video_path, output_path, uuid: str = ""):
+    """Transcribe *video_path* with the Whisper "base" model and write JSON.
+
+    The JSON written to *output_path* has the keys ``language``,
+    ``language_probability`` and ``segments``; each segment holds ``start``,
+    ``end`` and the stripped ``text``. When no audio stream is detected an
+    empty result is written instead. In both cases the function finishes by
+    calling ``sys.exit(0)`` rather than returning.
+
+    Args:
+        video_path: Path of the media file to transcribe.
+        output_path: Destination path of the JSON result.
+        uuid: When non-empty, a ``RedisPublisher`` is created from it and
+            receives status messages; when empty, nothing is published.
+    """
+    # Route SIGTERM and SIGINT to signal_handler.
+    for signo in (signal.SIGTERM, signal.SIGINT):
+        signal.signal(signo, signal_handler)
+
+    publisher = None
+    if uuid:
+        publisher = RedisPublisher(uuid)
+    if publisher:
+        publisher.info("asr", "ASR_START")
+
+    audio_present = has_audio_stream(video_path)
+    if not audio_present:
+        # Nothing to transcribe: write an empty result and stop here.
+        if publisher:
+            publisher.info("asr", "No audio stream detected, skipping transcription")
+        empty_output = {"language": "", "language_probability": 0.0, "segments": []}
+        with open(output_path, "w") as out_file:
+            json.dump(empty_output, out_file, indent=2)
+        if publisher:
+            publisher.complete("asr", "0 segments (no audio)")
+        sys.stderr.write("ASR: No audio stream, skipping transcription\n")
+        sys.stderr.flush()
+        sys.exit(0)
+
+    if publisher:
+        publisher.info("asr", "Loading Whisper model...")
+
+    # Use base model with CPU (MPS not supported by faster_whisper)
+    whisper = WhisperModel("base", device="cpu", compute_type="int8")
+
+    if publisher:
+        publisher.info("asr", f"Transcribing: {video_path}")
+
+    segment_stream, info = whisper.transcribe(video_path, beam_size=5)
+
+    if publisher:
+        publisher.info("asr", f"ASR_LANGUAGE:{info.language}")
+
+    results = []
+    for total_segments, piece in enumerate(segment_stream, start=1):
+        entry = {"start": piece.start, "end": piece.end, "text": piece.text.strip()}
+        results.append(entry)
+        # Publish a progress message once every 100 segments.
+        if total_segments % 100 == 0 and publisher:
+            publisher.progress(
+                "asr", total_segments, 0, f"Segment {total_segments}"
+            )
+
+    final_output = {
+        "language": info.language,
+        "language_probability": info.language_probability,
+        "segments": results,
+    }
+    with open(output_path, "w") as out_file:
+        json.dump(final_output, out_file, indent=2)
+
+    if publisher:
+        publisher.complete("asr", f"{len(results)} segments")
+
+    sys.stderr.write(
+        f"ASR: Transcription complete, {len(results)} segments written to {output_path}\n"
+    )
+    sys.stderr.flush()
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    # Usage: asr_processor_base.py <video_path> <output_path> [--uuid/-u UUID]
+    cli = argparse.ArgumentParser(description="ASR Transcription (base model)")
+    cli.add_argument("video_path", help="Path to video file")
+    cli.add_argument("output_path", help="Output JSON path")
+    cli.add_argument("--uuid", "-u", default="", help="UUID for Redis progress")
+    options = cli.parse_args()
+
+    # run_asr ends the process itself via sys.exit().
+    run_asr(options.video_path, options.output_path, options.uuid)