#!/opt/homebrew/bin/python3.11
"""
Minimal test: run faster-whisper on full video, output each segment's text.
No VAD tuning, no speaker detection, no splitting. Just raw ASR output.
"""
import json
import os
import shutil
import subprocess
import sys
import tempfile
import time

import torchaudio
from faster_whisper import WhisperModel


def extract_audio(video_path, tmp_dir, sr=16000):
    """Extract the audio track of a video to a mono 16-bit WAV and load it.

    Runs ``ffmpeg`` to write ``<tmp_dir>/audio.wav`` resampled to ``sr`` Hz
    with a single channel, then loads that file with ``torchaudio.load``.

    Args:
        video_path: Path of the input media file handed to ffmpeg.
        tmp_dir: Existing directory in which ``audio.wav`` is written.
        sr: Target sample rate in Hz requested from ffmpeg.

    Returns:
        A tuple ``(wav_path, wav_data, sr_actual)``: the path of the written
        WAV file, the loaded waveform tensor (channels first, downmixed to a
        single channel), and the sample rate reported by ``torchaudio.load``.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        subprocess.TimeoutExpired: ffmpeg ran for longer than 300 seconds.
    """
    wav_path = os.path.join(tmp_dir, "audio.wav")
    subprocess.run(
        [
            "ffmpeg", "-y", "-v", "quiet",
            "-i", video_path,
            "-ar", str(sr),
            "-ac", "1",
            "-sample_fmt", "s16",
            wav_path,
        ],
        check=True,
        capture_output=True,
        timeout=300,
    )
    wav_data, sr_actual = torchaudio.load(wav_path)
    # ffmpeg was asked for one channel already; this is only a safety net in
    # case the loaded file still carries several channels.
    if wav_data.shape[0] > 1:
        wav_data = wav_data.mean(dim=0, keepdim=True)
    return wav_path, wav_data, sr_actual


def main():
    """Transcribe the demo video and write one TSV line per ASR segment.

    Each output line holds: start, end, duration (all in seconds, two
    decimals), word count, and the stripped segment text, separated by tabs.
    Progress and summary statistics are printed to stdout.
    """
    video = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn | Comedy Mystery Romance Thriller | Full Movie.mp4"
    output = "/Users/accusys/momentry/output_dev/segment_texts.txt"

    t0 = time.time()

    # Load model
    print("Loading faster-whisper small int8 CPU...")
    model = WhisperModel("small", device="cpu", compute_type="int8")
    print(f"Model loaded ({time.time()-t0:.0f}s)")

    count = 0
    total_words = 0
    total_dur = 0

    # Extract audio
    print("Extracting audio...")
    tmp_dir = tempfile.mkdtemp(prefix="asr_test_")
    try:
        wav_path, wav_data, sr = extract_audio(video, tmp_dir)
        total_audio_s = wav_data.shape[1] / sr
        # Only the length was needed; drop the waveform so it is not kept
        # in memory for the whole transcription.
        del wav_data
        print(f"Audio: {total_audio_s:.0f}s, {sr}Hz ({time.time()-t0:.0f}s)")

        # Transcribe - NO VAD filter, let the model segment naturally
        print("Transcribing (vad_filter=False)...")
        segments, info = model.transcribe(
            wav_path,
            beam_size=5,
            vad_filter=False,
            word_timestamps=True,
        )
        print(f"  Detected language: {info.language} (prob: {info.language_probability:.2f})")
        print(f"  Duration after VAD: {info.duration_after_vad:.1f}s")

        # Write each segment to file. The temp directory is kept alive until
        # the loop ends because `segments` is consumed here, not above.
        os.makedirs(os.path.dirname(output), exist_ok=True)
        with open(output, "w", encoding="utf-8") as f:
            for seg in segments:
                text = seg.text.strip()
                dur = seg.end - seg.start
                words = len(seg.words) if seg.words else 0
                f.write(f"{seg.start:.2f}\t{seg.end:.2f}\t{dur:.2f}\t{words}\t{text}\n")
                count += 1
                total_words += words
                total_dur += dur

        elapsed = time.time() - t0
    finally:
        # Always remove the extracted WAV, even when extraction,
        # transcription, or writing the output fails.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    print("\n=== Results ===")
    print(f"Segments: {count}")
    print(f"Words: {total_words}")
    print(f"Speech duration: {total_dur:.0f}s")
    if count:
        print(f"Avg segment: {total_dur/count:.1f}s, {total_words/count:.1f} words")
    else:
        # No segments were produced; avoid dividing by zero.
        print("Avg segment: n/a (no segments)")
    print(f"Elapsed: {elapsed:.0f}s ({elapsed/60:.1f}min)")
    print(f"Output: {output}")


if __name__ == "__main__":
    main()