feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
--- a/scripts/asr_processor_debug.py
+++ b/scripts/asr_processor_debug.py
@@ -0,0 +1,722 @@
+#!/opt/homebrew/bin/python3.11
+"""
+ASR Processor with chunked transcription for large files and resource monitoring.
+Maintains backward compatibility with existing API.
+"""
+
+import sys
+import json
+import os
+import argparse
+import signal
+import subprocess
+import tempfile
+import time
+import shutil
+from typing import List, Dict, Any, Optional, Tuple
+
+# Try to import psutil for resource monitoring
+PSUTIL_AVAILABLE = False
+psutil = None
+try:
+    import psutil
+
+    PSUTIL_AVAILABLE = True
+except ImportError:
+    sys.stderr.write("WARNING: psutil not available, resource monitoring disabled\n")
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from redis_publisher import RedisPublisher  # noqa: E402
+
+
+def save_checkpoint(
+    checkpoint_path: str,
+    segments: List[Dict[str, Any]],
+    language: Optional[str],
+    language_prob: Optional[float],
+    processed_chunks: List[int],
+    total_chunks: int,
+) -> None:
+    """Save transcription checkpoint to resume later."""
+    checkpoint_data = {
+        "segments": segments,
+        "language": language or "",
+        "language_probability": language_prob or 0.0,
+        "processed_chunks": processed_chunks,
+        "total_chunks": total_chunks,
+        "timestamp": time.time(),
+    }
+    try:
+        with open(checkpoint_path, "w") as f:
+            json.dump(checkpoint_data, f, indent=2, default=str)
+    except Exception as e:
+        sys.stderr.write(f"ASR: Failed to save checkpoint: {e}\n")
+
+
+def load_checkpoint(checkpoint_path: str) -> Optional[Dict[str, Any]]:
+    """Load transcription checkpoint if exists."""
+    try:
+        with open(checkpoint_path, "r") as f:
+            return json.load(f)
+    except Exception:
+        return None
+
+
+def check_health() -> Dict[str, Any]:
+    """Check health of ASR processor dependencies."""
+    health = {
+        "status": "healthy",
+        "checks": {},
+        "timestamp": time.time(),
+    }
+
+    # Check ffmpeg
+    try:
+        result = subprocess.run(["ffmpeg", "-version"], capture_output=True, text=True)
+        health["checks"]["ffmpeg"] = {
+            "available": result.returncode == 0,
+            "version": result.stdout.split("\n")[0].split(" ")[2]
+            if result.stdout
+            else "unknown",
+        }
+    except Exception as e:
+        health["checks"]["ffmpeg"] = {"available": False, "error": str(e)}
+
+    # Check ffprobe
+    try:
+        result = subprocess.run(["ffprobe", "-version"], capture_output=True, text=True)
+        health["checks"]["ffprobe"] = {
+            "available": result.returncode == 0,
+            "version": result.stdout.split("\n")[0].split(" ")[2]
+            if result.stdout
+            else "unknown",
+        }
+    except Exception as e:
+        health["checks"]["ffprobe"] = {"available": False, "error": str(e)}
+
+    # Check faster_whisper import
+    try:
+        import faster_whisper
+
+        health["checks"]["faster_whisper"] = {
+            "available": True,
+            "version": getattr(faster_whisper, "__version__", "unknown"),
+        }
+    except ImportError as e:
+        health["checks"]["faster_whisper"] = {"available": False, "error": str(e)}
+        health["status"] = "unhealthy"
+
+    # Check psutil import
+    try:
+        import psutil
+
+        health["checks"]["psutil"] = {
+            "available": True,
+            "version": getattr(psutil, "__version__", "unknown"),
+        }
+    except ImportError:
+        health["checks"]["psutil"] = {
+            "available": False,
+            "warning": "resource monitoring disabled",
+        }
+
+    # Determine overall status
+    if not health["checks"].get("ffmpeg", {}).get("available", False) or not health[
+        "checks"
+    ].get("ffprobe", {}).get("available", False):
+        health["status"] = "unhealthy"
+
+    return health
+
+
+def signal_handler(signum, frame):
+    sys.stderr.write(f"ASR: Received signal {signum}, exiting...\n")
+    sys.exit(1)
+
+
+def has_audio_stream(video_path: str) -> bool:
+    """Check if video file has audio stream using ffprobe."""
+    try:
+        cmd = [
+            "ffprobe",
+            "-v",
+            "error",
+            "-select_streams",
+            "a",
+            "-show_entries",
+            "stream=codec_type",
+            "-of",
+            "csv=p=0",
+            video_path,
+        ]
+        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+        return bool(result.stdout.strip())
+    except subprocess.CalledProcessError:
+        return False
+    except FileNotFoundError:
+        sys.stderr.write("WARNING: ffprobe not found, assuming audio exists\n")
+        return True
+
+
+def get_media_duration(media_path: str) -> float:
+    """Get media duration in seconds using ffprobe."""
+    cmd = [
+        "ffprobe",
+        "-v",
+        "error",
+        "-show_entries",
+        "format=duration",
+        "-of",
+        "csv=p=0",
+        media_path,
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    try:
+        return float(result.stdout.strip())
+    except (ValueError, AttributeError):
+        return 0.0
+
+
+def extract_audio(video_path: str, audio_path: str) -> bool:
+    """Extract audio from video to WAV format."""
+    cmd = [
+        "ffmpeg",
+        "-i",
+        video_path,
+        "-acodec",
+        "pcm_s16le",
+        "-ar",
+        "16000",
+        "-ac",
+        "1",
+        "-y",
+        audio_path,
+    ]
+    result = subprocess.run(cmd, capture_output=True)
+    return result.returncode == 0 and os.path.exists(audio_path)
+
+
+def extract_chunk(
+    audio_path: str, start: float, duration: float, output_path: str
+) -> bool:
+    """Extract a chunk of audio using ffmpeg."""
+    cmd = [
+        "ffmpeg",
+        "-i",
+        audio_path,
+        "-ss",
+        str(start),
+        "-t",
+        str(duration),
+        "-acodec",
+        "pcm_s16le",
+        "-ar",
+        "16000",
+        "-ac",
+        "1",
+        "-y",
+        output_path,
+    ]
+    result = subprocess.run(cmd, capture_output=True)
+    success = (
+        result.returncode == 0
+        and os.path.exists(output_path)
+        and os.path.getsize(output_path) > 0
+    )
+    sys.stderr.write(
+        f"ASR_DEBUG: extract_chunk: start={start}, duration={duration}, success={success}, returncode={result.returncode}\n"
+    )
+    sys.stderr.flush()
+    return success
+
+
+def monitor_resources(pid: int, interval: float = 0.1) -> Dict[str, Any]:
+    """Monitor CPU and memory usage for a process."""
+    if not PSUTIL_AVAILABLE or psutil is None:
+        return {"cpu_percent": 0.0, "memory_mb": 0.0, "available": False}
+
+    try:
+        process = psutil.Process(pid)
+        cpu_percent = process.cpu_percent(interval=interval)
+        memory_info = process.memory_info()
+        memory_mb = memory_info.rss / (1024 * 1024)
+        return {
+            "cpu_percent": cpu_percent,
+            "memory_mb": memory_mb,
+            "available": True,
+            "pid": pid,
+        }
+    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
+        return {"cpu_percent": 0.0, "memory_mb": 0.0, "available": False}
+
+
+def transcribe_direct(
+    model, audio_path: str, publisher: Optional[RedisPublisher] = None
+) -> Tuple[List[Dict[str, Any]], Any]:
+    """Transcribe audio directly (non-chunked)."""
+    if publisher:
+        publisher.info("asr", "Transcribing audio directly...")
+
+    start_time = time.time()
+    segments, info = model.transcribe(audio_path, beam_size=5)
+
+    results = []
+    total_segments = 0
+    for segment in segments:
+        results.append(
+            {"start": segment.start, "end": segment.end, "text": segment.text.strip()}
+        )
+        total_segments += 1
+        if total_segments % 100 == 0 and publisher:
+            publisher.progress("asr", total_segments, 0, f"Segment {total_segments}")
+
+    elapsed = time.time() - start_time
+    if publisher:
+        publisher.info(
+            "asr", f"Direct transcription: {len(results)} segments in {elapsed:.1f}s"
+        )
+
+    return results, info
+
+
+def transcribe_chunk(
+    model,
+    chunk_path: str,
+    chunk_start: float,
+    chunk_idx: int,
+    total_chunks: int,
+    publisher: Optional[RedisPublisher] = None,
+) -> Tuple[List[Dict[str, Any]], Any]:
+    """Transcribe a single audio chunk."""
+    if publisher:
+        publisher.info("asr", f"Transcribing chunk {chunk_idx + 1}/{total_chunks}")
+
+    sys.stderr.write(
+        f"ASR_DEBUG: transcribe_chunk: chunk_idx={chunk_idx}, path={chunk_path}, size={os.path.getsize(chunk_path) if os.path.exists(chunk_path) else 0}\n"
+    )
+    sys.stderr.flush()
+
+    start_time = time.time()
+    segments, info = model.transcribe(chunk_path, beam_size=5)
+    sys.stderr.write(
+        "ASR_DEBUG: transcribe_chunk: transcription completed, got segments\n"
+    )
+    sys.stderr.flush()
+
+    results = []
+    for segment in segments:
+        results.append(
+            {
+                "start": segment.start + chunk_start,
+                "end": segment.end + chunk_start,
+                "text": segment.text.strip(),
+            }
+        )
+
+    elapsed = time.time() - start_time
+    if publisher:
+        publisher.info(
+            "asr",
+            f"Chunk {chunk_idx + 1}/{total_chunks}: {len(results)} segments in {elapsed:.1f}s",
+        )
+
+    return results, info
+
+
+def run_asr(
+    video_path: str,
+    output_path: str,
+    uuid: str = "",
+    chunk_duration: int = 600,  # 10 minutes default
+    max_direct_duration: int = 1200,  # 20 minutes: use direct transcription for shorter files (safe limit)
+    model_size: str = "tiny",
+    compute_type: str = "int8",
+    monitor_interval: int = 60,
+) -> None:
+    # Set up signal handlers
+    signal.signal(signal.SIGTERM, signal_handler)
+    signal.signal(signal.SIGINT, signal_handler)
+
+    publisher = RedisPublisher(uuid) if uuid else None
+    if publisher:
+        publisher.info("asr", "ASR_START")
+    sys.stderr.write("ASR_DEBUG: Audio stream check...\n")
+
+    # Check for audio stream
+    if not has_audio_stream(video_path):
+        if publisher:
+            publisher.info("asr", "No audio stream detected, skipping transcription")
+        output = {
+            "processor_name": "asr",
+            "processor_version": "2.0.0",
+            "contract_version": "1.0",
+            "language": None,
+            "language_probability": None,
+            "segments": [],
+        }
+        with open(output_path, "w") as f:
+            json.dump(output, f, indent=2)
+        if publisher:
+            publisher.complete("asr", "0 segments (no audio)")
+        sys.stderr.write("ASR: No audio stream, skipping transcription\n")
+        sys.stderr.flush()
+        sys.exit(0)
+
+    # Create temporary directory
+    sys.stderr.write("ASR_DEBUG: Creating temporary directory...\n")
+    temp_dir = tempfile.mkdtemp(prefix="asr_")
+    sys.stderr.write(f"ASR_DEBUG: temp_dir={temp_dir}\n")
+    audio_path = os.path.join(temp_dir, "audio.wav")
+
+    if publisher:
+        publisher.info("asr", "Extracting audio from video...")
+    sys.stderr.write("ASR_DEBUG: Extracting audio...\n")
+
+    # Extract audio
+    if not extract_audio(video_path, audio_path):
+        if publisher:
+            publisher.error("asr", "Failed to extract audio")
+        sys.stderr.write("ASR: Failed to extract audio\n")
+        sys.stderr.flush()
+        # Clean up
+        shutil.rmtree(temp_dir, ignore_errors=True)
+        sys.exit(1)
+
+    sys.stderr.write("ASR_DEBUG: Audio extraction successful, getting duration...\n")
+    # Get audio duration
+    try:
+        total_duration = get_media_duration(audio_path)
+    except Exception as e:
+        if publisher:
+            publisher.error("asr", f"Failed to get audio duration: {e}")
+        sys.stderr.write(f"ASR: Failed to get audio duration: {e}\n")
+        sys.stderr.flush()
+        shutil.rmtree(temp_dir, ignore_errors=True)
+        sys.exit(1)
+
+    if publisher:
+        publisher.info(
+            "asr",
+            f"Audio duration: {total_duration:.1f}s ({total_duration / 3600:.1f} hrs)",
+        )
+
+    sys.stderr.write("ASR_DEBUG: Loading Whisper model...\n")
+    # Load Whisper model
+    if publisher:
+        publisher.info(
+            "asr", f"Loading Whisper model ({model_size}, {compute_type})..."
+        )
+
+    try:
+        from faster_whisper import WhisperModel
+
+        model = WhisperModel(model_size, device="cpu", compute_type=compute_type)
+    except Exception as e:
+        if publisher:
+            publisher.error("asr", f"Failed to load Whisper model: {e}")
+        sys.stderr.write(f"ASR: Failed to load Whisper model: {e}\n")
+        sys.stderr.flush()
+        shutil.rmtree(temp_dir, ignore_errors=True)
+        sys.exit(1)
+
+    if publisher:
+        publisher.info("asr", "Whisper model loaded successfully")
+    sys.stderr.write("ASR_DEBUG: Whisper model loaded.\n")
+
+    # Decide whether to use chunked or direct transcription
+    use_chunked = total_duration > max_direct_duration
+    sys.stderr.write(
+        f"ASR_DEBUG: total_duration={total_duration:.1f}s, max_direct_duration={max_direct_duration}s, use_chunked={use_chunked}\n"
+    )
+
+    all_segments = []
+    language = None
+    language_prob = None
+    chunks = []  # Initialize chunks variable
+
+    if not use_chunked:
+        sys.stderr.write("ASR_DEBUG: Starting direct transcription...\n")
+        # Direct transcription for shorter audio
+        if publisher:
+            publisher.info(
+                "asr", f"Using direct transcription (duration ≤ {max_direct_duration}s)"
+            )
+
+        try:
+            segments, info = transcribe_direct(model, audio_path, publisher)
+            all_segments.extend(segments)
+            language = info.language
+            language_prob = info.language_probability
+        except Exception as e:
+            if publisher:
+                publisher.error("asr", f"Direct transcription failed: {e}")
+            sys.stderr.write(f"ASR: Direct transcription failed: {e}\n")
+            sys.stderr.flush()
+            # Fall back to chunked approach
+            use_chunked = True
+            if publisher:
+                publisher.info("asr", "Falling back to chunked transcription")
+
+    if use_chunked:
+        # Chunked transcription for long audio
+        sys.stderr.write("ASR_DEBUG: Starting chunked transcription...\n")
+        if publisher:
+            publisher.info(
+                "asr", f"Using chunked transcription ({chunk_duration}s chunks)"
+            )
+
+        # Calculate chunks
+        chunks = []
+        start = 0.0
+        chunk_idx = 0
+        while start < total_duration:
+            chunk_end = min(start + chunk_duration, total_duration)
+            chunks.append(
+                {
+                    "start": start,
+                    "end": chunk_end,
+                    "duration": chunk_end - start,
+                    "idx": chunk_idx,
+                }
+            )
+            start = chunk_end
+            chunk_idx += 1
+
+        if publisher:
+            publisher.info("asr", f"Split into {len(chunks)} chunks")
+
+        sys.stderr.write(f"ASR_DEBUG: Calculated {len(chunks)} chunks\n")
+        chunk_temp_dir = os.path.join(temp_dir, "chunks")
+        os.makedirs(chunk_temp_dir, exist_ok=True)
+        sys.stderr.write("ASR_DEBUG: Created chunk directory\n")
+
+        last_resource_report = time.time()
+
+        sys.stderr.write(f"ASR_DEBUG: Starting loop over {len(chunks)} chunks\n")
+        for i, chunk in enumerate(chunks):
+            sys.stderr.write(
+                f"ASR_DEBUG: Loop iteration {i}, chunk start={chunk['start']:.1f}\n"
+            )
+            sys.stderr.flush()
+            chunk_path = os.path.join(chunk_temp_dir, f"chunk_{i:04d}.wav")
+
+            if publisher and os.environ.get("MOMENTRY_DISABLE_REDIS") != "1":
+                sys.stderr.write("ASR_DEBUG: Before publisher.progress\n")
+                sys.stderr.flush()
+                publisher.progress(
+                    "asr", i, len(chunks), f"Processing chunk {i + 1}/{len(chunks)}"
+                )
+                sys.stderr.write("ASR_DEBUG: After publisher.progress\n")
+                sys.stderr.flush()
+            elif publisher:
+                sys.stderr.write(
+                    "ASR_DEBUG: Redis disabled, skipping publisher.progress\n"
+                )
+                sys.stderr.flush()
+
+            # Extract chunk
+            if not extract_chunk(
+                audio_path, chunk["start"], chunk["duration"], chunk_path
+            ):
+                if publisher:
+                    publisher.warning("asr", f"Failed to extract chunk {i}, skipping")
+                continue
+
+            # Resource monitoring (sample every monitor_interval seconds)
+            current_time = time.time()
+            if (
+                PSUTIL_AVAILABLE
+                and publisher
+                and (current_time - last_resource_report) >= monitor_interval
+            ):
+                resources = monitor_resources(os.getpid())
+                if resources["available"]:
+                    publisher.info(
+                        "asr",
+                        f"Resource usage: CPU {resources['cpu_percent']:.1f}%, "
+                        f"Memory {resources['memory_mb']:.1f}MB",
+                    )
+                last_resource_report = current_time
+
+            # Transcribe chunk with retry logic
+            sys.stderr.write(
+                f"ASR_DEBUG: Starting transcription for chunk {i}, retry loop\n"
+            )
+            sys.stderr.flush()
+            max_retries = 3
+            transcribed = False
+            last_error = None
+
+            for retry in range(max_retries):
+                try:
+                    segments, info = transcribe_chunk(
+                        model, chunk_path, chunk["start"], i, len(chunks), publisher
+                    )
+                    all_segments.extend(segments)
+
+                    if language is None:
+                        language = info.language
+                        language_prob = info.language_probability
+                        if publisher:
+                            publisher.info(
+                                "asr",
+                                f"Detected language: {language} (prob {language_prob:.2f})",
+                            )
+
+                    transcribed = True
+                    break  # Success, exit retry loop
+
+                except Exception as e:
+                    last_error = e
+                    if publisher:
+                        publisher.warning(
+                            "asr",
+                            f"Error transcribing chunk {i} (attempt {retry + 1}/{max_retries}): {e}",
+                        )
+                    sys.stderr.write(
+                        f"ASR: Error transcribing chunk {i} (attempt {retry + 1}/{max_retries}): {e}\n"
+                    )
+                    sys.stderr.flush()
+
+                    if retry < max_retries - 1:
+                        # Wait before retry (exponential backoff)
+                        wait_time = 2**retry  # 1, 2, 4 seconds
+                        if publisher:
+                            publisher.info("asr", f"Retrying in {wait_time}s...")
+                        time.sleep(wait_time)
+                    else:
+                        # Final attempt failed
+                        if publisher:
+                            publisher.error(
+                                "asr",
+                                f"Failed to transcribe chunk {i} after {max_retries} attempts: {last_error}",
+                            )
+                        sys.stderr.write(
+                            f"ASR: Failed to transcribe chunk {i} after {max_retries} attempts: {last_error}\n"
+                        )
+                        sys.stderr.flush()
+                        # Continue with next chunk (skip this one)
+
+            # Clean up chunk file
+            sys.stderr.write(
+                f"ASR_DEBUG: Finished processing chunk {i}, transcribed={transcribed}\n"
+            )
+            sys.stderr.flush()
+            try:
+                os.unlink(chunk_path)
+            except Exception:
+                pass
+
+    # Clean up temporary directory
+    try:
+        shutil.rmtree(temp_dir, ignore_errors=True)
+    except Exception:
+        pass
+
+    # Sort segments by start time
+    all_segments.sort(key=lambda x: x["start"])
+
+    # Prepare output (maintain same format as original)
+    output = {
+        "processor_name": "asr",
+        "processor_version": "2.0.0",
+        "contract_version": "1.0",
+        "language": language if language is not None else None,
+        "language_probability": language_prob if language_prob is not None else None,
+        "segments": all_segments,
+    }
+
+    # Add metadata for chunked processing (optional)
+    if use_chunked:
+        output["processing_mode"] = "chunked"
+        output["chunk_count"] = len(chunks) if "chunks" in locals() else 0
+        output["chunk_duration"] = chunk_duration
+    else:
+        output["processing_mode"] = "direct"
+
+    # Write output
+    with open(output_path, "w") as f:
+        json.dump(output, f, indent=2)
+
+    if publisher:
+        publisher.complete(
+            "asr",
+            f"{len(all_segments)} segments ({'chunked' if use_chunked else 'direct'} mode)",
+        )
+
+    sys.stderr.write(
+        f"ASR: Transcription complete, {len(all_segments)} segments written to {output_path}\n"
+    )
+    sys.stderr.flush()
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="ASR Transcription with chunked processing"
+    )
+    parser.add_argument("video_path", nargs="?", help="Path to video file")
+    parser.add_argument("output_path", nargs="?", help="Output JSON path")
+    parser.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
+    parser.add_argument("--version", action="version", version="2.0.0")
+    parser.add_argument(
+        "--check-health", action="store_true", help="Check dependencies and exit"
+    )
+
+    # Hidden arguments for configuration (can be set via environment variables)
+    parser.add_argument(
+        "--chunk-duration", type=int, default=600, help=argparse.SUPPRESS
+    )  # 10 minutes default
+    parser.add_argument(
+        "--max-direct-duration", type=int, default=1200, help=argparse.SUPPRESS
+    )  # 20 minutes (safe limit based on testing)
+    parser.add_argument("--model-size", default="tiny", help=argparse.SUPPRESS)
+    parser.add_argument("--compute-type", default="int8", help=argparse.SUPPRESS)
+    parser.add_argument(
+        "--monitor-interval", type=int, default=60, help=argparse.SUPPRESS
+    )
+
+    args = parser.parse_args()
+
+    # Handle health check
+    if args.check_health:
+        health = check_health()
+        print(json.dumps(health, indent=2))
+        sys.exit(0 if health["status"] == "healthy" else 1)
+
+    # Validate required arguments when not doing health check
+    if args.video_path is None or args.output_path is None:
+        parser.error(
+            "video_path and output_path are required when not using --check-health"
+        )
+
+    # Allow environment variable overrides
+    chunk_duration_str = os.environ.get("MOMENTRY_ASR_CHUNK_DURATION")
+    if chunk_duration_str is not None:
+        chunk_duration = int(chunk_duration_str)
+    else:
+        chunk_duration = args.chunk_duration
+
+    max_direct_duration_str = os.environ.get("MOMENTRY_ASR_MAX_DIRECT_DURATION")
+    if max_direct_duration_str is not None:
+        max_direct_duration = int(max_direct_duration_str)
+    else:
+        max_direct_duration = args.max_direct_duration
+
+    model_size = os.environ.get("MOMENTRY_ASR_MODEL_SIZE")
+    if model_size is None:
+        model_size = args.model_size
+
+    compute_type = os.environ.get("MOMENTRY_ASR_COMPUTE_TYPE")
+    if compute_type is None:
+        compute_type = args.compute_type
+
+    run_asr(
+        args.video_path,
+        args.output_path,
+        args.uuid,
+        chunk_duration,
+        max_direct_duration,
+        model_size,
+        compute_type,
+    )