feat: ASRX hybrid pipeline, identity history, worker fixes, checkpoint system

2026-06-02 07:13:23 +08:00
parent e3066c3f49
commit e1572907ae
198 changed files with 43705 additions and 8910 deletions
--- a/scripts/asrx_processor_simplified.py
+++ b/scripts/asrx_processor_simplified.py
@@ -1,177 +0,0 @@
-#!/opt/homebrew/bin/python3.11
-"""
-ASRX 處理器 - 簡化版
-先做轉錄，說話人分離可選
-修復 PyTorch 2.6 兼容性問題
-"""
-
-# Fix for PyTorch 2.6+ compatibility - MUST be set before importing torch
-import os
-os.environ["TORCH_FORCE_WEIGHTS_ONLY_LOAD"] = "0"
-
-import sys
-import json
-import argparse
-import signal
-import subprocess
-
-sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-from redis_publisher import RedisPublisher
-
-
-def signal_handler(signum, frame):
-    print(f"ASRX: Received signal {signum}, exiting...")
-    sys.exit(1)
-
-
-def has_audio_stream(video_path):
-    """Check if video file has audio stream using ffprobe."""
-    try:
-        cmd = [
-            "ffprobe",
-            "-v",
-            "error",
-            "-select_streams",
-            "a",
-            "-show_entries",
-            "stream=codec_type",
-            "-of",
-            "csv=p=0",
-            video_path,
-        ]
-        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
-        return bool(result.stdout.strip())
-    except subprocess.CalledProcessError:
-        return False
-    except FileNotFoundError:
-        print("WARNING: ffprobe not found, assuming audio exists")
-        return True
-
-
-def process_asrx(video_path: str, output_path: str, uuid: str = "", skip_diarization: bool = True):
-    """
-    Process video for speaker diarization using whisperx
-    
-    Args:
-        video_path: Path to video file
-        output_path: Path to output JSON
-        uuid: UUID for Redis progress
-        skip_diarization: Skip speaker diarization (only transcription)
-    """
-    
-    signal.signal(signal.SIGTERM, signal_handler)
-    signal.signal(signal.SIGINT, signal_handler)
-
-    publisher = RedisPublisher(uuid) if uuid else None
-    if publisher:
-        publisher.info("asrx", "ASRX_START")
-
-    try:
-        import whisperx
-        import torch
-    except ImportError as e:
-        if publisher:
-            publisher.error("asrx", f"Missing dependency: {e}")
-        result = {"language": None, "segments": []}
-        if publisher:
-            publisher.complete("asrx", "0 segments")
-        with open(output_path, "w") as f:
-            json.dump(result, f, indent=2)
-        sys.exit(1)
-
-    # Check for audio stream
-    if not has_audio_stream(video_path):
-        if publisher:
-            publisher.info("asrx", "No audio stream detected, skipping transcription")
-        output = {"language": "", "language_probability": 0.0, "segments": []}
-        with open(output_path, "w") as f:
-            json.dump(output, f, indent=2)
-        if publisher:
-            publisher.complete("asrx", "0 segments (no audio)")
-        sys.stderr.write("ASRX: No audio stream, skipping transcription\n")
-        sys.stderr.flush()
-        sys.exit(0)
-
-    if publisher:
-        publisher.info("asrx", "ASRX_LOADING_MODEL")
-
-    try:
-        # Load model
-        if publisher:
-            publisher.info("asrx", "Loading whisperx base model (this may take a while)...")
-        
-        model = whisperx.load_model("base", device="cpu", compute_type="int8")
-        
-        if publisher:
-            publisher.info("asrx", "ASRX_TRANSCRIBING")
-        
-        # Transcribe with language detection
-        result = model.transcribe(video_path)
-        
-        if publisher:
-            publisher.info("asrx", f"ASRX_LANGUAGE:{result.get('language', 'unknown')}")
-        
-        # Build output (without diarization for now)
-        segments = []
-        for seg in result.get("segments", []):
-            text = seg.get("text", "").strip()
-            if text:
-                segments.append(
-                    {
-                        "start": seg.get("start", 0.0),
-                        "end": seg.get("end", 0.0),
-                        "text": text,
-                        "speaker_id": None,  # Will be added when diarization is enabled
-                    }
-                )
-        
-        output_result = {
-            "language": result.get("language"),
-            "language_probability": result.get("language_probability", 0),
-            "segments": segments,
-            "diarization_enabled": not skip_diarization
-        }
-        
-        if publisher:
-            publisher.complete("asrx", f"{len(segments)} segments")
-        
-        with open(output_path, "w") as f:
-            json.dump(output_result, f, indent=2, ensure_ascii=False)
-        
-        sys.stderr.write(
-            f"ASRX: Transcription complete, {len(segments)} segments written to {output_path}\n"
-        )
-        sys.stderr.flush()
-        sys.exit(0)
-        
-    except Exception as e:
-        if publisher:
-            publisher.error("asrx", f"Error: {e}")
-        import traceback
-        traceback.print_exc()
-        result = {"language": None, "segments": [], "error": str(e)}
-        if publisher:
-            publisher.complete("asrx", "0 segments (error)")
-        with open(output_path, "w") as f:
-            json.dump(result, f, indent=2)
-        sys.exit(1)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="ASRX Speaker Diarization (Simplified)")
-    parser.add_argument("video_path", help="Path to video file")
-    parser.add_argument("output_path", help="Output JSON path")
-    parser.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
-    parser.add_argument(
-        "--skip-diarization",
-        action="store_true",
-        help="Skip speaker diarization (only transcription)"
-    )
-    args = parser.parse_args()
-
-    process_asrx(
-        args.video_path,
-        args.output_path,
-        args.uuid,
-        args.skip_diarization
-    )