#!/opt/homebrew/bin/python3.11
"""
ASR Processor with chunked transcription for large files and resource monitoring.
Maintains backward compatibility with existing API.
"""

import sys
import json
import os
import argparse
import signal
import subprocess
import tempfile
import time
import shutil
from typing import List, Dict, Any, NoReturn, Optional, Tuple

# Try to import psutil for resource monitoring
PSUTIL_AVAILABLE = False
psutil = None
try:
    import psutil

    PSUTIL_AVAILABLE = True
except ImportError:
    sys.stderr.write("WARNING: psutil not available, resource monitoring disabled\n")

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher  # noqa: E402


def save_checkpoint(
    checkpoint_path: str,
    segments: List[Dict[str, Any]],
    language: Optional[str],
    language_prob: Optional[float],
    processed_chunks: List[int],
    total_chunks: int,
) -> None:
    """Save transcription checkpoint to resume later.

    Best-effort: a failure to write is reported on stderr and otherwise
    ignored, so checkpointing never aborts a transcription.

    Args:
        checkpoint_path: Destination JSON file (overwritten).
        segments: Segment dicts transcribed so far.
        language: Detected language; stored as "" when None.
        language_prob: Detection probability; stored as 0.0 when None.
        processed_chunks: Indices of the chunks already processed.
        total_chunks: Total number of chunks planned.
    """
    checkpoint_data = {
        "segments": segments,
        "language": language or "",
        "language_probability": language_prob or 0.0,
        "processed_chunks": processed_chunks,
        "total_chunks": total_chunks,
        "timestamp": time.time(),
    }
    try:
        with open(checkpoint_path, "w", encoding="utf-8") as f:
            # default=str stringifies any value json cannot encode natively
            json.dump(checkpoint_data, f, indent=2, default=str)
    except Exception as e:
        sys.stderr.write(f"ASR: Failed to save checkpoint: {e}\n")


def load_checkpoint(checkpoint_path: str) -> Optional[Dict[str, Any]]:
    """Load transcription checkpoint if exists.

    Returns:
        The parsed checkpoint, or None when the file is missing, unreadable
        or not valid JSON.
    """
    try:
        with open(checkpoint_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError
        return None


def _tool_health(tool: str) -> Dict[str, Any]:
    """Run ``<tool> -version`` and report availability and version."""
    try:
        result = subprocess.run([tool, "-version"], capture_output=True, text=True)
    except Exception as e:
        # A health probe must never raise; report the tool as unavailable.
        return {"available": False, "error": str(e)}

    # The version is taken as the third space-separated token of the first
    # output line. An unexpected banner yields "unknown" instead of marking
    # an otherwise working tool as unavailable.
    tokens = result.stdout.split("\n")[0].split(" ") if result.stdout else []
    version = tokens[2] if len(tokens) > 2 else "unknown"
    return {"available": result.returncode == 0, "version": version}


def check_health() -> Dict[str, Any]:
    """Check health of ASR processor dependencies.

    Returns:
        A dict with "status" ("healthy" or "unhealthy"), per-dependency
        "checks" and a "timestamp". ffmpeg, ffprobe and faster_whisper are
        required; a missing psutil only produces a warning entry.
    """
    health = {
        "status": "healthy",
        "checks": {},
        "timestamp": time.time(),
    }

    # Check ffmpeg / ffprobe
    for tool in ("ffmpeg", "ffprobe"):
        health["checks"][tool] = _tool_health(tool)

    # Check faster_whisper import
    try:
        import faster_whisper

        health["checks"]["faster_whisper"] = {
            "available": True,
            "version": getattr(faster_whisper, "__version__", "unknown"),
        }
    except ImportError as e:
        health["checks"]["faster_whisper"] = {"available": False, "error": str(e)}
        health["status"] = "unhealthy"

    # Check psutil import (optional dependency)
    try:
        import psutil

        health["checks"]["psutil"] = {
            "available": True,
            "version": getattr(psutil, "__version__", "unknown"),
        }
    except ImportError:
        health["checks"]["psutil"] = {
            "available": False,
            "warning": "resource monitoring disabled",
        }

    # Determine overall status
    if not all(
        health["checks"][tool].get("available", False)
        for tool in ("ffmpeg", "ffprobe")
    ):
        health["status"] = "unhealthy"

    return health


def signal_handler(signum, frame):
    """Log the received signal and exit with status 1.

    Exiting through sys.exit raises SystemExit, so ``finally`` blocks in the
    interrupted code still run.
    """
    sys.stderr.write(f"ASR: Received signal {signum}, exiting...\n")
    sys.exit(1)


def has_audio_stream(video_path: str) -> bool:
    """Check if video file has audio stream using ffprobe.

    Returns:
        True when ffprobe lists at least one audio stream, False when it
        lists none or exits with an error. When ffprobe is not installed the
        function warns and returns True.
    """
    try:
        cmd = [
            "ffprobe",
            "-v",
            "error",
            "-select_streams",
            "a",
            "-show_entries",
            "stream=codec_type",
            "-of",
            "csv=p=0",
            video_path,
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        return bool(result.stdout.strip())
    except subprocess.CalledProcessError:
        return False
    except FileNotFoundError:
        sys.stderr.write("WARNING: ffprobe not found, assuming audio exists\n")
        return True


def get_media_duration(media_path: str) -> float:
    """Get media duration in seconds using ffprobe.

    Returns:
        The duration, or 0.0 when ffprobe's output cannot be parsed as a
        float.
    """
    cmd = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "csv=p=0",
        media_path,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    try:
        return float(result.stdout.strip())
    except (ValueError, AttributeError):
        return 0.0


def extract_audio(video_path: str, audio_path: str) -> bool:
    """Extract audio from video to WAV format.

    The output is 16 kHz, mono, signed 16-bit PCM.

    Returns:
        True when ffmpeg exited with 0 and the output file exists.
    """
    cmd = [
        "ffmpeg",
        "-i",
        video_path,
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        "-y",
        audio_path,
    ]
    result = subprocess.run(cmd, capture_output=True)
    return result.returncode == 0 and os.path.exists(audio_path)


def extract_chunk(
    audio_path: str, start: float, duration: float, output_path: str
) -> bool:
    """Extract a chunk of audio using ffmpeg.

    Args:
        audio_path: Source audio file.
        start: Chunk start offset in seconds.
        duration: Chunk length in seconds.
        output_path: Destination WAV file (overwritten).

    Returns:
        True when ffmpeg exited with 0 and produced a non-empty file.
    """
    cmd = [
        "ffmpeg",
        "-i",
        audio_path,
        "-ss",
        str(start),
        "-t",
        str(duration),
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        "-y",
        output_path,
    ]
    result = subprocess.run(cmd, capture_output=True)
    success = (
        result.returncode == 0
        and os.path.exists(output_path)
        and os.path.getsize(output_path) > 0
    )
    sys.stderr.write(
        f"ASR_DEBUG: extract_chunk: start={start}, duration={duration}, success={success}, returncode={result.returncode}\n"
    )
    sys.stderr.flush()
    return success


def monitor_resources(pid: int, interval: float = 0.1) -> Dict[str, Any]:
    """Monitor CPU and memory usage for a process.

    Args:
        pid: Process to sample.
        interval: Value passed to psutil's ``cpu_percent(interval=...)``.

    Returns:
        A dict with "cpu_percent", "memory_mb" and "available". "available"
        is False (with zeroed figures) when psutil is missing or the process
        cannot be inspected; on success the dict also carries "pid".
    """
    unavailable = {"cpu_percent": 0.0, "memory_mb": 0.0, "available": False}
    if not PSUTIL_AVAILABLE or psutil is None:
        return unavailable

    try:
        process = psutil.Process(pid)
        cpu_percent = process.cpu_percent(interval=interval)
        memory_mb = process.memory_info().rss / (1024 * 1024)
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return unavailable

    return {
        "cpu_percent": cpu_percent,
        "memory_mb": memory_mb,
        "available": True,
        "pid": pid,
    }


def transcribe_direct(
    model, audio_path: str, publisher: Optional[RedisPublisher] = None
) -> Tuple[List[Dict[str, Any]], Any]:
    """Transcribe audio directly (non-chunked).

    Args:
        model: Object whose ``transcribe(path, beam_size=...)`` returns
            ``(segments, info)``; each segment exposes ``start``, ``end``
            and ``text``.
        audio_path: Audio file to transcribe.
        publisher: Optional progress publisher.

    Returns:
        ``(results, info)`` where results is a list of
        ``{"start", "end", "text"}`` dicts.
    """
    if publisher:
        publisher.info("asr", "Transcribing audio directly...")

    start_time = time.time()
    segments, info = model.transcribe(audio_path, beam_size=5)

    results = []
    for segment in segments:
        results.append(
            {"start": segment.start, "end": segment.end, "text": segment.text.strip()}
        )
        # Report progress every 100 segments
        if len(results) % 100 == 0 and publisher:
            publisher.progress("asr", len(results), 0, f"Segment {len(results)}")

    elapsed = time.time() - start_time
    if publisher:
        publisher.info(
            "asr", f"Direct transcription: {len(results)} segments in {elapsed:.1f}s"
        )

    return results, info


def transcribe_chunk(
    model,
    chunk_path: str,
    chunk_start: float,
    chunk_idx: int,
    total_chunks: int,
    publisher: Optional[RedisPublisher] = None,
) -> Tuple[List[Dict[str, Any]], Any]:
    """Transcribe a single audio chunk.

    Args:
        model: Object whose ``transcribe(path, beam_size=...)`` returns
            ``(segments, info)``.
        chunk_path: WAV file holding the chunk.
        chunk_start: Offset of the chunk in the full audio, in seconds; it is
            added to every segment's start and end.
        chunk_idx: Zero-based chunk index (used for messages only).
        total_chunks: Total number of chunks (used for messages only).
        publisher: Optional progress publisher.

    Returns:
        ``(results, info)`` where results is a list of
        ``{"start", "end", "text"}`` dicts with absolute timestamps.
    """
    if publisher:
        publisher.info("asr", f"Transcribing chunk {chunk_idx + 1}/{total_chunks}")

    chunk_size = os.path.getsize(chunk_path) if os.path.exists(chunk_path) else 0
    sys.stderr.write(
        f"ASR_DEBUG: transcribe_chunk: chunk_idx={chunk_idx}, path={chunk_path}, size={chunk_size}\n"
    )
    sys.stderr.flush()

    start_time = time.time()
    segments, info = model.transcribe(chunk_path, beam_size=5)
    sys.stderr.write(
        "ASR_DEBUG: transcribe_chunk: transcription completed, got segments\n"
    )
    sys.stderr.flush()

    results = [
        {
            "start": segment.start + chunk_start,
            "end": segment.end + chunk_start,
            "text": segment.text.strip(),
        }
        for segment in segments
    ]

    elapsed = time.time() - start_time
    if publisher:
        publisher.info(
            "asr",
            f"Chunk {chunk_idx + 1}/{total_chunks}: {len(results)} segments in {elapsed:.1f}s",
        )

    return results, info


def _fail(publisher: Optional[RedisPublisher], message: str) -> NoReturn:
    """Report a fatal error to the publisher and stderr, then exit with 1."""
    if publisher:
        publisher.error("asr", message)
    sys.stderr.write(f"ASR: {message}\n")
    sys.stderr.flush()
    sys.exit(1)


def _base_output(
    language: Optional[str],
    language_prob: Optional[float],
    segments: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Build the result document shared by every exit path."""
    return {
        "processor_name": "asr",
        "processor_version": "2.0.0",
        "contract_version": "1.0",
        "language": language,
        "language_probability": language_prob,
        "segments": segments,
    }


def _plan_chunks(total_duration: float, chunk_duration: int) -> List[Dict[str, Any]]:
    """Split ``[0, total_duration)`` into consecutive chunk descriptors.

    Raises:
        ValueError: If chunk_duration is not positive (the split would never
            terminate).
    """
    if chunk_duration <= 0:
        raise ValueError(f"chunk_duration must be positive, got {chunk_duration}")

    chunks = []
    start = 0.0
    while start < total_duration:
        chunk_end = min(start + chunk_duration, total_duration)
        chunks.append(
            {
                "start": start,
                "end": chunk_end,
                "duration": chunk_end - start,
                "idx": len(chunks),
            }
        )
        start = chunk_end
    return chunks


def _transcribe_chunks(
    model,
    audio_path: str,
    chunks: List[Dict[str, Any]],
    chunk_temp_dir: str,
    publisher: Optional[RedisPublisher],
    monitor_interval: int,
) -> Tuple[List[Dict[str, Any]], Optional[str], Optional[float]]:
    """Extract and transcribe every chunk, retrying failed transcriptions.

    Chunks that cannot be extracted, or that still fail after all retries,
    are skipped. Returns ``(segments, language, language_probability)``; the
    language is taken from the first successfully transcribed chunk.
    """
    all_segments: List[Dict[str, Any]] = []
    language = None
    language_prob = None
    max_retries = 3

    last_resource_report = time.time()
    sys.stderr.write(f"ASR_DEBUG: Starting loop over {len(chunks)} chunks\n")

    for i, chunk in enumerate(chunks):
        sys.stderr.write(
            f"ASR_DEBUG: Loop iteration {i}, chunk start={chunk['start']:.1f}\n"
        )
        sys.stderr.flush()
        chunk_path = os.path.join(chunk_temp_dir, f"chunk_{i:04d}.wav")

        if publisher and os.environ.get("MOMENTRY_DISABLE_REDIS") != "1":
            sys.stderr.write("ASR_DEBUG: Before publisher.progress\n")
            sys.stderr.flush()
            publisher.progress(
                "asr", i, len(chunks), f"Processing chunk {i + 1}/{len(chunks)}"
            )
            sys.stderr.write("ASR_DEBUG: After publisher.progress\n")
            sys.stderr.flush()
        elif publisher:
            sys.stderr.write(
                "ASR_DEBUG: Redis disabled, skipping publisher.progress\n"
            )
            sys.stderr.flush()

        # Extract chunk
        if not extract_chunk(
            audio_path, chunk["start"], chunk["duration"], chunk_path
        ):
            if publisher:
                publisher.warning("asr", f"Failed to extract chunk {i}, skipping")
            continue

        # Resource monitoring (sample every monitor_interval seconds)
        current_time = time.time()
        if (
            PSUTIL_AVAILABLE
            and publisher
            and (current_time - last_resource_report) >= monitor_interval
        ):
            resources = monitor_resources(os.getpid())
            if resources["available"]:
                publisher.info(
                    "asr",
                    f"Resource usage: CPU {resources['cpu_percent']:.1f}%, "
                    f"Memory {resources['memory_mb']:.1f}MB",
                )
            last_resource_report = current_time

        # Transcribe chunk with retry logic
        sys.stderr.write(
            f"ASR_DEBUG: Starting transcription for chunk {i}, retry loop\n"
        )
        sys.stderr.flush()

        transcribed = False
        for retry in range(max_retries):
            try:
                segments, info = transcribe_chunk(
                    model, chunk_path, chunk["start"], i, len(chunks), publisher
                )
                if language is None:
                    language = info.language
                    language_prob = info.language_probability
                    if publisher:
                        publisher.info(
                            "asr",
                            f"Detected language: {language} (prob {language_prob:.2f})",
                        )
                # Append last: if anything above raises, the retry must not
                # add this chunk's segments a second time.
                all_segments.extend(segments)
                transcribed = True
                break  # Success, exit retry loop
            except Exception as e:
                if publisher:
                    publisher.warning(
                        "asr",
                        f"Error transcribing chunk {i} (attempt {retry + 1}/{max_retries}): {e}",
                    )
                sys.stderr.write(
                    f"ASR: Error transcribing chunk {i} (attempt {retry + 1}/{max_retries}): {e}\n"
                )
                sys.stderr.flush()

                if retry < max_retries - 1:
                    # Wait before retry (exponential backoff: 1s, then 2s)
                    wait_time = 2**retry
                    if publisher:
                        publisher.info("asr", f"Retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    # Final attempt failed; skip this chunk and continue
                    if publisher:
                        publisher.error(
                            "asr",
                            f"Failed to transcribe chunk {i} after {max_retries} attempts: {e}",
                        )
                    sys.stderr.write(
                        f"ASR: Failed to transcribe chunk {i} after {max_retries} attempts: {e}\n"
                    )
                    sys.stderr.flush()

        # Clean up chunk file
        sys.stderr.write(
            f"ASR_DEBUG: Finished processing chunk {i}, transcribed={transcribed}\n"
        )
        sys.stderr.flush()
        try:
            os.unlink(chunk_path)
        except OSError:
            # Best-effort: the whole temp directory is removed afterwards.
            pass

    return all_segments, language, language_prob


def run_asr(
    video_path: str,
    output_path: str,
    uuid: str = "",
    chunk_duration: int = 600,  # 10 minutes default
    max_direct_duration: int = 1200,  # 20 minutes: use direct transcription for shorter files (safe limit)
    model_size: str = "tiny",
    compute_type: str = "int8",
    monitor_interval: int = 60,
) -> None:
    """Transcribe a media file and write the result as JSON.

    Audio no longer than max_direct_duration is transcribed in one pass;
    longer audio, or audio whose direct pass fails, is split into
    chunk_duration-second chunks. This function always ends the process via
    sys.exit: 0 on success (including "no audio stream"), 1 on fatal errors.

    Args:
        video_path: Input media file.
        output_path: Destination JSON file.
        uuid: Identifier passed to RedisPublisher; empty disables publishing.
        chunk_duration: Chunk length in seconds for chunked mode. Must be
            positive when chunked mode is used.
        max_direct_duration: Longest audio, in seconds, transcribed directly.
        model_size: Whisper model size passed to WhisperModel.
        compute_type: Compute type passed to WhisperModel.
        monitor_interval: Minimum seconds between resource usage reports.
    """
    # Set up signal handlers
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    publisher = RedisPublisher(uuid) if uuid else None

    if publisher:
        publisher.info("asr", "ASR_START")

    sys.stderr.write("ASR_DEBUG: Audio stream check...\n")
    # Check for audio stream
    if not has_audio_stream(video_path):
        if publisher:
            publisher.info("asr", "No audio stream detected, skipping transcription")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(_base_output(None, None, []), f, indent=2)
        if publisher:
            publisher.complete("asr", "0 segments (no audio)")
        sys.stderr.write("ASR: No audio stream, skipping transcription\n")
        sys.stderr.flush()
        sys.exit(0)

    # Create temporary directory
    sys.stderr.write("ASR_DEBUG: Creating temporary directory...\n")
    temp_dir = tempfile.mkdtemp(prefix="asr_")
    sys.stderr.write(f"ASR_DEBUG: temp_dir={temp_dir}\n")

    all_segments: List[Dict[str, Any]] = []
    language = None
    language_prob = None
    chunks: List[Dict[str, Any]] = []

    # try/finally guarantees the temp directory is removed on every exit
    # path, including sys.exit from _fail or from the signal handler.
    try:
        audio_path = os.path.join(temp_dir, "audio.wav")

        if publisher:
            publisher.info("asr", "Extracting audio from video...")

        sys.stderr.write("ASR_DEBUG: Extracting audio...\n")
        # Extract audio
        if not extract_audio(video_path, audio_path):
            _fail(publisher, "Failed to extract audio")

        sys.stderr.write(
            "ASR_DEBUG: Audio extraction successful, getting duration...\n"
        )
        # Get audio duration
        try:
            total_duration = get_media_duration(audio_path)
        except Exception as e:
            _fail(publisher, f"Failed to get audio duration: {e}")

        if publisher:
            publisher.info(
                "asr",
                f"Audio duration: {total_duration:.1f}s ({total_duration / 3600:.1f} hrs)",
            )

        sys.stderr.write("ASR_DEBUG: Loading Whisper model...\n")
        # Load Whisper model
        if publisher:
            publisher.info(
                "asr", f"Loading Whisper model ({model_size}, {compute_type})..."
            )

        try:
            from faster_whisper import WhisperModel

            model = WhisperModel(model_size, device="cpu", compute_type=compute_type)
        except Exception as e:
            _fail(publisher, f"Failed to load Whisper model: {e}")

        if publisher:
            publisher.info("asr", "Whisper model loaded successfully")
        sys.stderr.write("ASR_DEBUG: Whisper model loaded.\n")

        # Decide whether to use chunked or direct transcription
        use_chunked = total_duration > max_direct_duration
        sys.stderr.write(
            f"ASR_DEBUG: total_duration={total_duration:.1f}s, max_direct_duration={max_direct_duration}s, use_chunked={use_chunked}\n"
        )

        if not use_chunked:
            sys.stderr.write("ASR_DEBUG: Starting direct transcription...\n")
            # Direct transcription for shorter audio
            if publisher:
                publisher.info(
                    "asr",
                    f"Using direct transcription (duration ≤ {max_direct_duration}s)",
                )
            try:
                segments, info = transcribe_direct(model, audio_path, publisher)
                direct_language = info.language
                direct_prob = info.language_probability
            except Exception as e:
                if publisher:
                    publisher.error("asr", f"Direct transcription failed: {e}")
                sys.stderr.write(f"ASR: Direct transcription failed: {e}\n")
                sys.stderr.flush()
                # Fall back to chunked approach
                use_chunked = True
                if publisher:
                    publisher.info("asr", "Falling back to chunked transcription")
            else:
                # Commit results only after the whole direct pass succeeded,
                # so a fallback never starts with partial direct output.
                all_segments.extend(segments)
                language = direct_language
                language_prob = direct_prob

        if use_chunked:
            # Chunked transcription for long audio
            sys.stderr.write("ASR_DEBUG: Starting chunked transcription...\n")
            if publisher:
                publisher.info(
                    "asr", f"Using chunked transcription ({chunk_duration}s chunks)"
                )

            # A non-positive chunk length would make chunk planning loop forever
            if chunk_duration <= 0:
                _fail(
                    publisher,
                    f"Invalid chunk duration {chunk_duration}s: must be positive",
                )

            # Calculate chunks
            chunks = _plan_chunks(total_duration, chunk_duration)

            if publisher:
                publisher.info("asr", f"Split into {len(chunks)} chunks")

            sys.stderr.write(f"ASR_DEBUG: Calculated {len(chunks)} chunks\n")
            chunk_temp_dir = os.path.join(temp_dir, "chunks")
            os.makedirs(chunk_temp_dir, exist_ok=True)
            sys.stderr.write("ASR_DEBUG: Created chunk directory\n")

            chunk_segments, chunk_language, chunk_prob = _transcribe_chunks(
                model, audio_path, chunks, chunk_temp_dir, publisher, monitor_interval
            )
            all_segments.extend(chunk_segments)
            if language is None:
                language = chunk_language
                language_prob = chunk_prob
    finally:
        # Clean up temporary directory
        shutil.rmtree(temp_dir, ignore_errors=True)

    # Sort segments by start time
    all_segments.sort(key=lambda x: x["start"])

    # Prepare output (maintain same format as original)
    output = _base_output(language, language_prob, all_segments)

    # Add metadata for chunked processing (optional)
    if use_chunked:
        output["processing_mode"] = "chunked"
        output["chunk_count"] = len(chunks)
        output["chunk_duration"] = chunk_duration
    else:
        output["processing_mode"] = "direct"

    # Write output
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    if publisher:
        publisher.complete(
            "asr",
            f"{len(all_segments)} segments ({'chunked' if use_chunked else 'direct'} mode)",
        )

    sys.stderr.write(
        f"ASR: Transcription complete, {len(all_segments)} segments written to {output_path}\n"
    )
    sys.stderr.flush()
    sys.exit(0)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="ASR Transcription with chunked processing"
    )
    parser.add_argument("video_path", nargs="?", help="Path to video file")
    parser.add_argument("output_path", nargs="?", help="Output JSON path")
    parser.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
    parser.add_argument("--version", action="version", version="2.0.0")
    parser.add_argument(
        "--check-health", action="store_true", help="Check dependencies and exit"
    )
    # Hidden arguments for configuration (some can also be set via
    # environment variables, which take precedence; see below)
    parser.add_argument(
        "--chunk-duration", type=int, default=600, help=argparse.SUPPRESS
    )  # 10 minutes default
    parser.add_argument(
        "--max-direct-duration", type=int, default=1200, help=argparse.SUPPRESS
    )  # 20 minutes (safe limit based on testing)
    parser.add_argument("--model-size", default="tiny", help=argparse.SUPPRESS)
    parser.add_argument("--compute-type", default="int8", help=argparse.SUPPRESS)
    parser.add_argument(
        "--monitor-interval", type=int, default=60, help=argparse.SUPPRESS
    )

    args = parser.parse_args()

    # Handle health check
    if args.check_health:
        health = check_health()
        print(json.dumps(health, indent=2))
        sys.exit(0 if health["status"] == "healthy" else 1)

    # Validate required arguments when not doing health check
    if args.video_path is None or args.output_path is None:
        parser.error(
            "video_path and output_path are required when not using --check-health"
        )

    # Allow environment variable overrides (environment wins over the flag)
    chunk_duration = int(
        os.environ.get("MOMENTRY_ASR_CHUNK_DURATION", args.chunk_duration)
    )
    max_direct_duration = int(
        os.environ.get("MOMENTRY_ASR_MAX_DIRECT_DURATION", args.max_direct_duration)
    )
    model_size = os.environ.get("MOMENTRY_ASR_MODEL_SIZE", args.model_size)
    compute_type = os.environ.get("MOMENTRY_ASR_COMPUTE_TYPE", args.compute_type)

    run_asr(
        args.video_path,
        args.output_path,
        args.uuid,
        chunk_duration,
        max_direct_duration,
        model_size,
        compute_type,
        # Previously parsed but never forwarded, so the flag had no effect
        args.monitor_interval,
    )