Files
momentry_core/scripts/asrx_processor_custom.py
Warren 8f05a7c188 feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00

142 lines
4.1 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
ASRX Processor - Custom Implementation Wrapper
Uses SpeechBrain ECAPA-TDNN (no HuggingFace token required)
"""
import sys
import json
import argparse
import os
from pathlib import Path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(
0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "asrx_self")
)
from redis_publisher import RedisPublisher
def _write_result(output_path: str, payload: dict) -> None:
    """Serialize *payload* as indented JSON to *output_path*.

    The parent directory is created first so that a missing directory does
    not prevent the result file (including the empty fallback) from being
    written.
    """
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)


def _write_empty_result(output_path: str, publisher) -> dict:
    """Write the empty fallback result, publish completion, and return it."""
    empty_result = {"language": None, "segments": []}
    _write_result(output_path, empty_result)
    if publisher:
        publisher.complete("asrx", "0 segments")
    return empty_result


def process_asrx_custom(video_path: str, output_path: str, uuid: str = ""):
    """Process video for speaker diarization using the custom implementation.

    Runs ``SelfASRXFixed.process`` on *video_path*, converts its segments to
    the Rust-expected format and saves them as JSON at *output_path*.

    Args:
        video_path: Path to the video/audio file to process.
        output_path: Path of the JSON file to write. Its parent directory is
            created when it does not exist.
        uuid: When non-empty, progress/error events are published through
            ``RedisPublisher(uuid)``; when empty nothing is published.

    Returns:
        The dict that was written to *output_path*: ``language`` (always
        ``None``), ``segments`` (dicts with ``start``, ``end``, ``text``,
        ``speaker_id``) and, when the processor supplied it,
        ``speaker_stats``. If the processor reports an ``"error"`` or any
        exception is raised while processing, an empty result
        (``{"language": None, "segments": []}``) is written and returned
        instead.
    """
    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("asrx", "ASRX_START")

    try:
        # Imported lazily so that a failing import is handled by the
        # fallback below instead of breaking module import.
        from asrx_self.main_fixed import SelfASRXFixed

        if publisher:
            publisher.info("asrx", "ASRX_LOADING_MODEL")

        # Initialize custom ASRX processor
        asrx = SelfASRXFixed()

        if publisher:
            publisher.info("asrx", "ASRX_TRANSCRIBING")

        # Process video/audio
        result = asrx.process(
            video_path,
            output_path=None,  # We'll save our own format
            min_speech_duration_ms=500,
            max_speakers=10,
        )

        if "error" in result:
            if publisher:
                publisher.error("asrx", result["error"])
            return _write_empty_result(output_path, publisher)

        # Convert to Rust-expected format
        output_result = {
            "language": None,  # Custom implementation doesn't detect language
            "segments": [
                {
                    "start": seg["start"],
                    "end": seg["end"],
                    "text": "",  # Will be filled by matching with ASR later
                    "speaker_id": seg["speaker"],
                }
                for seg in result["segments"]
            ],
        }

        # Add speaker_stats as optional metadata
        if "speaker_stats" in result:
            output_result["speaker_stats"] = result["speaker_stats"]

        segment_count = len(output_result["segments"])
        if publisher:
            publisher.info("asrx", f"ASRX_COMPLETE:{segment_count}")

        # Save output
        _write_result(output_path, output_result)

        if publisher:
            publisher.complete("asrx", f"{segment_count} segments")
        print(f"[ASRX-Custom] Saved {segment_count} segments to {output_path}")
        return output_result
    except Exception as e:
        # Best-effort boundary: report the failure, then still leave an
        # empty result file behind.
        if publisher:
            publisher.error("asrx", str(e))
        import traceback

        traceback.print_exc()
        return _write_empty_result(output_path, publisher)
def _run_cli() -> None:
    """Command-line entry point.

    Parses ``video_path``, ``output_path`` and the optional ``--uuid``,
    exits with status 1 when the input file does not exist, otherwise runs
    ``process_asrx_custom`` and prints a short summary of the result.
    """
    cli = argparse.ArgumentParser(
        description="ASRX Processor (Custom Implementation)"
    )
    cli.add_argument("video_path", help="Path to video/audio file")
    cli.add_argument("output_path", help="Path to output JSON file")
    cli.add_argument("--uuid", help="UUID for Redis publishing", default="")
    opts = cli.parse_args()

    # Refuse to start when the input file is missing.
    if not Path(opts.video_path).exists():
        print(f"Error: Video file not found: {opts.video_path}")
        sys.exit(1)

    outcome = process_asrx_custom(opts.video_path, opts.output_path, opts.uuid)

    print("\n[Summary]")
    print(f" Total segments: {len(outcome['segments'])}")

    # Speaker statistics are optional metadata; nothing more to print
    # when they are absent.
    if "speaker_stats" not in outcome:
        return

    per_speaker = outcome["speaker_stats"]
    print(f" Detected speakers: {len(per_speaker)}")
    for speaker, stats in per_speaker.items():
        print(f" {speaker}: {stats['count']} segments")


if __name__ == "__main__":
    _run_cli()