#!/opt/homebrew/bin/python3.11
"""
Step 3: Re-run ASR with word_timestamps on full audio.
Map words to 4188 fine segments for accurate text.

Pipeline:
  1. Load the fine-grained segment file (``<UUID>.asrx_fine.json``).
  2. Extract a 16 kHz mono 16-bit WAV from the source video with ffmpeg.
  3. Transcribe it with faster-whisper ("small", CPU, int8) with word
     timestamps enabled.
  4. Assign every word to the first fine segment it overlaps and store the
     joined words in that segment's ``"text"`` field.
  5. Write the updated structure back to the same JSON file.

Run this file directly; importing it only defines the helpers below.
"""
import json
import os
import shutil
import subprocess
import sys
import tempfile
import time

UUID = "aeed71342a899fe4b4c57b7d41bcb692"
BASE = "/Users/accusys/momentry/output_dev"
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"

# The fine-segment file is both the input and the output of this step.
FINE_PATH = f"{BASE}/{UUID}.asrx_fine.json"


def load_json(path):
    """Return the parsed JSON document stored at *path*."""
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def save_json_atomic(obj, path):
    """Serialize *obj* as indented JSON to *path* without risking a torn file.

    The data is written to a sibling ``.tmp`` file first and then moved over
    the destination with ``os.replace``. Because this script overwrites its
    own input, a crash in the middle of a plain in-place write would destroy
    the only copy of the segment data.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as fh:
        json.dump(obj, fh, indent=2)
    os.replace(tmp_path, path)


def extract_audio(video, wav_path):
    """Decode *video* into a 16 kHz mono s16 WAV at *wav_path* using ffmpeg.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        subprocess.TimeoutExpired: ffmpeg ran longer than 300 seconds.
    """
    subprocess.run(
        [
            "ffmpeg", "-y", "-v", "quiet",
            "-i", video,
            "-ar", "16000",
            "-ac", "1",
            "-sample_fmt", "s16",
            wav_path,
        ],
        check=True,
        capture_output=True,
        timeout=300,
    )


def collect_words(segments):
    """Flatten ASR segments into a list of ``{"word", "start", "end"}`` dicts.

    Each item of *segments* must expose ``words``, ``text``, ``start`` and
    ``end``; each word must expose ``word``, ``start`` and ``end``.

    When a segment carries no word-level data, the whole segment text is used
    as a single pseudo-word spanning the segment. Entries that are empty after
    stripping whitespace are skipped in both cases, so the joined segment text
    never contains stray spaces.
    """
    words = []
    for seg in segments:
        if seg.words:
            for w in seg.words:
                token = w.word.strip()
                if token:
                    words.append({"word": token, "start": w.start, "end": w.end})
        else:
            token = seg.text.strip()
            if token:
                words.append({"word": token, "start": seg.start, "end": seg.end})
    return words


def map_words_to_segments(words, fine_segs):
    """Write a ``"text"`` field into every fine segment and count non-empty ones.

    A single cursor walks *words* once while the segments are visited in
    order. For each segment:

    * words ending at or before the segment start are skipped (and thereby
      dropped, since the cursor never moves backwards);
    * a word starting at or after the segment end stops the scan and is left
      for the following segments;
    * every other word overlaps the segment and is assigned to it, so a word
      straddling a boundary lands in the earlier segment only.

    NOTE(review): this presumes both *words* and *fine_segs* are ordered by
    time; nothing here sorts or verifies that.

    Args:
        words: dicts with ``"word"``, ``"start"`` and ``"end"`` keys.
        fine_segs: dicts with ``"start_time"`` and ``"end_time"`` keys;
            mutated in place.

    Returns:
        The number of segments that received non-empty text.
    """
    cursor = 0
    total = len(words)
    assigned = 0
    for fs in fine_segs:
        seg_start = fs["start_time"]
        seg_end = fs["end_time"]
        picked = []
        while cursor < total:
            w = words[cursor]
            if w["end"] <= seg_start:
                cursor += 1
                continue
            if w["start"] >= seg_end:
                break
            picked.append(w["word"])
            cursor += 1
        text = " ".join(picked)
        fs["text"] = text
        if text:
            assigned += 1
    return assigned


def text_stats(fine_segs):
    """Return ``(average_text_length, empty_count)`` over *fine_segs*.

    A missing ``"text"`` key counts as empty. An empty segment list yields
    ``(0.0, 0)`` instead of dividing by zero.
    """
    lengths = [len(fs.get("text", "")) for fs in fine_segs]
    average = sum(lengths) / len(lengths) if lengths else 0.0
    empty = sum(1 for n in lengths if n == 0)
    return average, empty


def print_example(fine_segs):
    """Print the first segment whose text is longer than 10 characters."""
    print("\nSplit segment examples:")
    sample = next((fs for fs in fine_segs if len(fs.get("text", "")) > 10), None)
    if sample is None:
        return
    # Diagnostics run before the result is saved, so a missing speaker name
    # must not abort the script and throw away the transcription.
    speaker = str(sample.get("speaker_name", ""))
    snippet = sample["text"][:60]
    print(f" [{sample['start_time']:.1f}-{sample['end_time']:.1f}] {speaker:15s} \"{snippet}\"")


def transcribe_words(wav_path):
    """Transcribe *wav_path* and return ``(words, elapsed_seconds)``.

    ``elapsed_seconds`` covers the ``transcribe`` call plus the consumption of
    its segments; model loading is timed and reported separately.
    """
    # Deferred import: keeps the pure helpers importable without the ASR
    # dependency and lets cheap failures (missing input, ffmpeg error) surface
    # before the library is loaded.
    from faster_whisper import WhisperModel

    print("Loading model with word_timestamps...")
    t0 = time.time()
    model = WhisperModel("small", device="cpu", compute_type="int8")
    print(f" Model loaded in {time.time()-t0:.1f}s")

    print("Transcribing with word_timestamps=True...")
    t0 = time.time()
    segments, _info = model.transcribe(
        wav_path,
        beam_size=5,
        vad_filter=True,
        vad_parameters={"min_silence_duration_ms": 500},
        word_timestamps=True,
    )
    # The faster-whisper README describes the returned segments as a lazy
    # generator, so the timer is stopped only after they have been consumed.
    words = collect_words(segments)
    elapsed = time.time() - t0
    print(f" Done in {elapsed:.1f}s, {len(words)} words")
    return words, elapsed


def main():
    """Run the full step: load, extract audio, transcribe, map, report, save."""
    print("=== Load fine ASRX ===")
    fine = load_json(FINE_PATH)
    fine_segs = fine["segments"]
    print(f"Fine segments: {len(fine_segs)}")

    print("\n=== Extract audio WAV ===")
    tmp_dir = tempfile.mkdtemp(prefix="asr_step3_")
    try:
        wav_path = os.path.join(tmp_dir, "audio.wav")
        extract_audio(VIDEO, wav_path)
        words, elapsed = transcribe_words(wav_path)
    finally:
        # The WAV is only needed for transcription; remove it even when
        # ffmpeg or the ASR run fails.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    print("\n=== Map words to fine segments ===")
    assigned = map_words_to_segments(words, fine_segs)
    print(f" Segments with text: {assigned}/{len(fine_segs)}")

    print_example(fine_segs)

    average, empty = text_stats(fine_segs)
    print(f"\n Avg text length: {average:.0f} chars")
    print(f" Empty texts: {empty}")

    fine["_asr_meta"] = {"word_timestamps": True, "asr_runtime_secs": round(elapsed, 1)}
    save_json_atomic(fine, FINE_PATH)
    print("\nSaved")


if __name__ == "__main__":
    main()