# momentry_core/scripts/test_segment_count.py

#!/opt/homebrew/bin/python3.11
"""
Minimal test: run faster-whisper on full video, output each segment's text.
No VAD tuning, no speaker detection, no splitting. Just raw ASR output.
"""
import json, os, sys, time, subprocess, tempfile, shutil
import torchaudio
from faster_whisper import WhisperModel

def extract_audio(video_path, tmp_dir, sr=16000):
    """Decode the audio track of a video into a WAV file and load it.

    Runs ffmpeg to write ``audio.wav`` inside *tmp_dir*, requesting *sr* Hz,
    one channel and signed 16-bit samples, then reads the file back with
    torchaudio.

    Args:
        video_path: Path of the input media file handed to ffmpeg.
        tmp_dir: Directory that receives ``audio.wav``.
        sr: Sample rate in Hz requested from ffmpeg.

    Returns:
        A ``(wav_path, wav_data, sr_actual)`` tuple: the WAV file path, the
        tensor returned by ``torchaudio.load`` (averaged over its first
        dimension when that dimension is larger than one), and the sample
        rate reported by torchaudio.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        subprocess.TimeoutExpired: ffmpeg ran longer than 300 seconds.
    """
    out_wav = os.path.join(tmp_dir, "audio.wav")
    ffmpeg_cmd = [
        "ffmpeg", "-y", "-v", "quiet",
        "-i", video_path,
        "-ar", str(sr),
        "-ac", "1",
        "-sample_fmt", "s16",
        out_wav,
    ]
    subprocess.run(ffmpeg_cmd, check=True, capture_output=True, timeout=300)

    waveform, loaded_sr = torchaudio.load(out_wav)
    # ffmpeg was asked for one channel already; this is a safety net in case
    # the loaded tensor still has several rows.
    n_channels = waveform.shape[0]
    if n_channels > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    return out_wav, waveform, loaded_sr

def main(video=None, output=None):
    """Transcribe one video with faster-whisper and dump its raw segments.

    Loads the ``small`` model (int8, CPU), extracts the audio with
    ``extract_audio``, transcribes it with ``vad_filter=False`` and
    ``word_timestamps=True``, and writes one tab-separated line per segment
    to *output*: start, end, duration, word count, text. A summary is
    printed to stdout at the end.

    Args:
        video: Path of the video to transcribe. Defaults to the demo movie
            path used by this script.
        output: Path of the text file to write. Defaults to the script's
            ``segment_texts.txt`` location.

    The temporary directory holding the extracted WAV is removed even when
    extraction, transcription or writing fails.
    """
    if video is None:
        video = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn ｜ Comedy Mystery Romance Thriller ｜ Full Movie.mp4"
    if output is None:
        output = "/Users/accusys/momentry/output_dev/segment_texts.txt"

    t0 = time.time()

    # Load model
    print("Loading faster-whisper small int8 CPU...")
    model = WhisperModel("small", device="cpu", compute_type="int8")
    print(f"Model loaded ({time.time()-t0:.0f}s)")

    # Extract audio
    print("Extracting audio...")
    tmp_dir = tempfile.mkdtemp(prefix="asr_test_")
    try:
        wav_path, wav_data, sr = extract_audio(video, tmp_dir)
        total_audio_s = wav_data.shape[1] / sr
        # The waveform is only needed for the duration above; drop it so it
        # is not held in memory for the whole transcription.
        del wav_data
        print(f"Audio: {total_audio_s:.0f}s, {sr}Hz ({time.time()-t0:.0f}s)")

        # Transcribe - NO VAD filter, let the model segment naturally
        print("Transcribing (vad_filter=False)...")
        segments, info = model.transcribe(wav_path, beam_size=5,
            vad_filter=False, word_timestamps=True)
        print(f"  Detected language: {info.language} (prob: {info.language_probability:.2f})")
        print(f"  Duration after VAD: {info.duration_after_vad:.1f}s")

        # Write each segment to file. Explicit UTF-8 so non-ASCII transcript
        # text does not depend on the platform's default encoding.
        count = 0
        total_words = 0
        total_dur = 0
        with open(output, "w", encoding="utf-8") as f:
            for seg in segments:
                text = seg.text.strip()
                dur = seg.end - seg.start
                words = len(seg.words) if seg.words else 0
                f.write(f"{seg.start:.2f}\t{seg.end:.2f}\t{dur:.2f}\t{words}\t{text}\n")
                count += 1
                total_words += words
                total_dur += dur

        elapsed = time.time() - t0
        print("\n=== Results ===")
        print(f"Segments: {count}")
        print(f"Words: {total_words}")
        print(f"Speech duration: {total_dur:.0f}s")
        if count:
            print(f"Avg segment: {total_dur/count:.1f}s, {total_words/count:.1f} words")
        else:
            # No segments were produced: avoid dividing by zero.
            print("Avg segment: n/a (no segments)")
        print(f"Elapsed: {elapsed:.0f}s ({elapsed/60:.1f}min)")
        print(f"Output: {output}")
    finally:
        # Always remove the extracted WAV, including on failure.
        shutil.rmtree(tmp_dir, ignore_errors=True)

# Run the transcription test only when executed as a script, not on import.
if __name__ == "__main__":
    main()