momentry_core/scripts/asrx_processor_custom.py

#!/opt/homebrew/bin/python3.11
"""
ASRX Processor - Custom Implementation Wrapper
Uses SpeechBrain ECAPA-TDNN (no HuggingFace token required)
"""

import sys
import json
import argparse
import os
from pathlib import Path

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(
    0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "asrx_self")
)

from redis_publisher import RedisPublisher


def _write_result(output_path: str, payload: dict) -> None:
    """Write *payload* to *output_path* as indented JSON.

    Missing parent directories are created first, so a finished run is not
    lost merely because the destination folder does not exist yet.
    """
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)


def _finish_empty(output_path: str, publisher) -> dict:
    """Write an empty result, publish completion (if publishing), and return it."""
    empty_result = {"language": None, "segments": []}
    _write_result(output_path, empty_result)
    if publisher:
        publisher.complete("asrx", "0 segments")
    return empty_result


def process_asrx_custom(video_path: str, output_path: str, uuid: str = "") -> dict:
    """Run speaker diarization on a video/audio file and save the result as JSON.

    Args:
        video_path: Path to the video/audio file handed to ``SelfASRXFixed.process``.
        output_path: Path of the JSON file to write. Parent directories are
            created when missing.
        uuid: When non-empty, progress, errors and completion are published
            through ``RedisPublisher(uuid)``; when empty nothing is published.

    Returns:
        The dict that was written to ``output_path``. It always has the keys
        ``"language"`` (always ``None``) and ``"segments"`` (a list of dicts
        with ``start``, ``end``, ``text`` and ``speaker_id``), plus
        ``"speaker_stats"`` when the processor supplied it. If the processor
        reports an ``"error"`` key or any exception is raised, an empty result
        (``{"language": None, "segments": []}``) is written and returned
        instead of propagating the failure.
    """
    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("asrx", "ASRX_START")

    try:
        # Imported inside the try so that an import failure is handled like
        # any other processing error (empty result written, error published).
        from asrx_self.main_fixed import SelfASRXFixed

        if publisher:
            publisher.info("asrx", "ASRX_LOADING_MODEL")

        # Initialize custom ASRX processor
        asrx = SelfASRXFixed()

        if publisher:
            publisher.info("asrx", "ASRX_TRANSCRIBING")

        # Process video/audio. output_path=None because this wrapper writes
        # its own output format below.
        result = asrx.process(
            video_path,
            output_path=None,
            min_speech_duration_ms=500,
            max_speakers=10,
        )

        if "error" in result:
            if publisher:
                publisher.error("asrx", result["error"])
            return _finish_empty(output_path, publisher)

        # Convert to the format expected by the consumer (the original
        # author's note says this is the Rust side).
        output_result = {
            "language": None,  # Custom implementation doesn't detect language
            "segments": [
                {
                    "start": seg["start"],
                    "end": seg["end"],
                    "text": "",  # Will be filled by matching with ASR later
                    "speaker_id": seg["speaker"],
                }
                for seg in result["segments"]
            ],
        }

        # Add speaker_stats as optional metadata
        if "speaker_stats" in result:
            output_result["speaker_stats"] = result["speaker_stats"]

        segment_count = len(output_result["segments"])

        if publisher:
            publisher.info("asrx", f"ASRX_COMPLETE:{segment_count}")

        # Save output
        _write_result(output_path, output_result)

        if publisher:
            publisher.complete("asrx", f"{segment_count} segments")

        print(f"[ASRX-Custom] Saved {segment_count} segments to {output_path}")

        return output_result

    except Exception as e:
        # Top-level boundary: report the failure, then fall back to an empty
        # result so the caller still gets a well-formed output file.
        if publisher:
            publisher.error("asrx", str(e))

        import traceback

        traceback.print_exc()

        return _finish_empty(output_path, publisher)


if __name__ == "__main__":
    # Command-line entry point: parse arguments, verify the input file
    # exists, run the diarization wrapper, then print a short summary.
    cli = argparse.ArgumentParser(
        description="ASRX Processor (Custom Implementation)"
    )
    cli.add_argument("video_path", help="Path to video/audio file")
    cli.add_argument("output_path", help="Path to output JSON file")
    cli.add_argument("--uuid", default="", help="UUID for Redis publishing")
    options = cli.parse_args()

    input_found = Path(options.video_path).exists()
    if not input_found:
        print(f"Error: Video file not found: {options.video_path}")
        sys.exit(1)

    summary = process_asrx_custom(
        options.video_path,
        options.output_path,
        options.uuid,
    )

    print("\n[Summary]")
    print(f"  Total segments: {len(summary['segments'])}")
    # speaker_stats is optional metadata; report it only when present.
    if "speaker_stats" in summary:
        per_speaker = summary["speaker_stats"]
        print(f"  Detected speakers: {len(per_speaker)}")
        for name, info in per_speaker.items():
            print(f"    {name}: {info['count']} segments")