#!/opt/homebrew/bin/python3.11
"""
Redo ASR word-timestamp mapping correctly.
Save words first, then map to fine segments with independent scanning.

Steps performed by ``main()``:

1. Load the fine segments from ``<BASE>/<UUID>.asrx_fine.json``.
2. Load word timestamps from ``<BASE>/<UUID>.words.json`` if that file
   exists; otherwise extract the audio with ffmpeg, transcribe it with
   faster-whisper (``word_timestamps=True``) and save the result there.
3. For every fine segment, collect the words whose time span falls inside the
   segment (falling back to segment-level transcript entries).
4. For segments that are still empty, fall back to the text of an original
   ASR segment with identical bounds (``<BASE>/<UUID>.asr.json``).
5. Write the updated fine segments back to the file they were loaded from.
"""
import json
import os
import shutil
import subprocess
import sys
import tempfile
import time

UUID = "aeed71342a899fe4b4c57b7d41bcb692"
BASE = "/Users/accusys/momentry/output_dev"
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"

# Slack (in the same unit as the timestamps) allowed past a segment's end
# when deciding whether a word still belongs to that segment.
END_TOLERANCE = 0.05


def read_json(path: str):
    """Return the parsed JSON content of *path*, closing the file afterwards."""
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def write_json(path: str, data, **dump_kwargs) -> None:
    """Serialize *data* as JSON to *path* without risking a truncated file.

    The data is first written to ``<path>.tmp`` in the same directory and then
    moved over *path* with ``os.replace``, so a failure during serialization
    leaves any existing *path* untouched. Extra keyword arguments are passed
    to ``json.dump``.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as fh:
        json.dump(data, fh, **dump_kwargs)
    os.replace(tmp_path, path)


def transcribe_words(video_path: str) -> list[dict]:
    """Transcribe *video_path* and return word- and segment-level entries.

    Each word entry is ``{"word", "start", "end"}``. After the words of each
    transcript segment, one segment-level entry carrying the whole segment
    text is appended and flagged with ``"_seg": True`` so it can be used as a
    fallback later.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with an error.
        subprocess.TimeoutExpired: if ffmpeg runs longer than 300 seconds.
    """
    # Imported lazily so that a run served from the cached words file does not
    # need to import the model library at all.
    from faster_whisper import WhisperModel

    tmp_dir = tempfile.mkdtemp(prefix="asr_fix_")
    try:
        # Extract full audio as 16 kHz mono 16-bit WAV.
        wav_path = os.path.join(tmp_dir, "audio.wav")
        subprocess.run(
            ["ffmpeg", "-y", "-v", "quiet", "-i", video_path,
             "-ar", "16000", "-ac", "1", "-sample_fmt", "s16", wav_path],
            check=True, capture_output=True, timeout=300,
        )

        print("Loading model...")
        model = WhisperModel("small", device="cpu", compute_type="int8")

        print("Transcribing with word_timestamps...")
        t0 = time.perf_counter()
        segments, _info = model.transcribe(
            wav_path,
            beam_size=5,
            vad_filter=True,
            vad_parameters={"min_silence_duration_ms": 500},
            word_timestamps=True,
        )

        words = []
        for seg in segments:
            if seg.words:
                for w in seg.words:
                    token = w.word.strip()
                    if token:
                        words.append({"word": token, "start": w.start, "end": w.end})
            # Also save segment-level as fallback
            words.append({"word": seg.text.strip(), "start": seg.start,
                          "end": seg.end, "_seg": True})

        elapsed = time.perf_counter() - t0
        print(f" {len(words)} entries in {elapsed:.1f}s")
        return words
    finally:
        # Remove the extracted audio even when ffmpeg or transcription fails.
        shutil.rmtree(tmp_dir, ignore_errors=True)


def load_or_transcribe_words(words_file: str, video_path: str) -> list[dict]:
    """Return the saved entries from *words_file*, transcribing if it is missing.

    Audio extraction and model loading only happen when there is no saved
    words file; a fresh transcription is written to *words_file* before
    returning.
    """
    if os.path.exists(words_file):
        print("Loading saved words...")
        return read_json(words_file)

    words = transcribe_words(video_path)
    write_json(words_file, words)
    return words


def words_in_range(entries, start, end, tolerance=END_TOLERANCE) -> list[str]:
    """Return the ``"word"`` of every entry lying inside ``[start, end + tolerance]``.

    An entry qualifies when its ``"start"`` is at or after *start* and its
    ``"end"`` is at or before ``end + tolerance``. *entries* is scanned from
    the beginning on every call; scanning stops at the first non-matching
    entry that starts after *end*, which relies on the entries being ordered
    by start time.
    """
    picked = []
    for entry in entries:
        if entry["start"] >= start and entry["end"] <= end + tolerance:
            picked.append(entry["word"])
        elif entry["start"] > end:
            break  # words are sorted by time
    return picked


def map_words_to_segments(fine_segs, word_entries, seg_entries) -> int:
    """Set ``"text"`` on every fine segment from the timestamped entries.

    Word-level entries are preferred (more precise); when none fall inside a
    segment, the segment-level entries are tried instead. Segments matching
    nothing get an empty string.

    Returns:
        The number of segments that received non-empty text.
    """
    assigned = 0
    total = len(fine_segs)
    for index, seg in enumerate(fine_segs, start=1):
        start, end = seg["start_time"], seg["end_time"]
        seg_words = (words_in_range(word_entries, start, end)
                     or words_in_range(seg_entries, start, end))
        text = " ".join(seg_words)
        seg["text"] = text
        if text:
            assigned += 1
        if index % 500 == 0:
            print(f" {index}/{total}")
    return assigned


def apply_asr_fallback(fine_segs, asr_segs) -> int:
    """Fill blank fine segments with the text of the matching original ASR segment.

    A fine segment matches an ASR segment only when its
    ``(start_time, end_time)`` pair equals the ASR ``(start, end)`` pair
    exactly; blank segments without a match are set to ``""``.
    NOTE(review): the match is exact equality on the timestamps, so fine
    segments that subdivide an ASR segment get no fallback text - confirm
    this is intended.

    Returns:
        The number of fine segments with non-blank text afterwards.
    """
    asr_bounds = {(s["start"], s["end"]): s["text"] for s in asr_segs}
    for seg in fine_segs:
        if not seg.get("text", "").strip():
            seg["text"] = asr_bounds.get((seg["start_time"], seg["end_time"]), "")
    return sum(1 for seg in fine_segs if seg.get("text", "").strip())


def main() -> None:
    """Run the full remapping for the configured UUID and save the result."""
    fine_path = f"{BASE}/{UUID}.asrx_fine.json"
    words_file = f"{BASE}/{UUID}.words.json"

    print("Load fine segments...")
    fine = read_json(fine_path)
    fine_segs = fine["segments"]
    print(f"{len(fine_segs)} segments")

    words = load_or_transcribe_words(words_file, VIDEO)

    # Separate word-level and segment-level
    word_entries = [w for w in words if not w.get("_seg")]
    seg_entries = [w for w in words if w.get("_seg")]
    print(f"Word-level: {len(word_entries)}, Segment-level: {len(seg_entries)}")

    # Map: for each fine segment, find ALL word entries within its time range
    print("Mapping words to segments...")
    assigned = map_words_to_segments(fine_segs, word_entries, seg_entries)
    print(f"Segments with text: {assigned}/{len(fine_segs)}")

    # Fix empty segments: use original ASR text
    asr = read_json(f"{BASE}/{UUID}.asr.json")
    with_text = apply_asr_fallback(fine_segs, asr["segments"])
    print(f"After fallback: {with_text}/{len(fine_segs)} with text")

    # Save. setdefault keeps a missing "_asr_meta" from raising KeyError after
    # all the mapping work has already been done.
    fine.setdefault("_asr_meta", {})["word_file"] = words_file
    write_json(fine_path, fine, indent=2)
    print("Saved")


if __name__ == "__main__":
    main()