Files
momentry_core/scripts/step3_asr_fine.py
Accusys 39ba5ddf76 feat: Phase 1 handover - schema migration, correction mechanism, API fixes
Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index
Correction: asr-1.json format, generate/apply scripts
API: 37/37 endpoints fixed and tested
Docs: HANDOVER_V2.0.md for M4
2026-05-11 07:03:22 +08:00

99 lines
3.2 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
Step 3: Re-run ASR with word_timestamps on full audio.
Map words to 4188 fine segments for accurate text.
"""
import atexit
import json
import os
import shutil
import subprocess
import sys
import tempfile
import time

from faster_whisper import WhisperModel
# File-name stem of the ASRX document under BASE (presumably the video id).
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
# Directory that holds the "<UUID>.asrx_fine.json" file read and rewritten below.
BASE = "/Users/accusys/momentry/output_dev"
# Input video handed to ffmpeg for audio extraction.
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"

print("=== Load fine ASRX ===")
# Use a context manager so the handle is closed deterministically instead of
# being left for the garbage collector.
with open(f"{BASE}/{UUID}.asrx_fine.json", encoding="utf-8") as fine_file:
    fine = json.load(fine_file)
# Each entry is later read via "start_time"/"end_time" and gets a "text" key.
fine_segs = fine["segments"]
print(f"Fine segments: {len(fine_segs)}")
print("\n=== Extract audio WAV ===")
tmp_dir = tempfile.mkdtemp(prefix="asr_step3_")
# Register cleanup immediately: the scratch directory must also disappear when
# ffmpeg, the transcription, or any later step raises. A second rmtree on an
# already-removed directory is harmless because errors are ignored.
atexit.register(shutil.rmtree, tmp_dir, ignore_errors=True)
wav_path = os.path.join(tmp_dir, "audio.wav")
# Decode the video's audio to 16 kHz, mono, signed 16-bit samples.
# check=True turns a non-zero ffmpeg exit status into CalledProcessError.
subprocess.run(
    [
        "ffmpeg", "-y", "-v", "quiet", "-i", VIDEO,
        "-ar", "16000", "-ac", "1", "-sample_fmt", "s16", wav_path,
    ],
    check=True,
    capture_output=True,
    timeout=300,
)
print("Loading model with word_timestamps...")
load_started = time.time()
model = WhisperModel("small", device="cpu", compute_type="int8")
load_secs = time.time() - load_started
print(f" Model loaded in {load_secs:.1f}s")

print("Transcribing with word_timestamps=True...")
vad_options = {"min_silence_duration_ms": 500}
# t0 is read again after the segments have been consumed, so the reported
# runtime spans both this call and the iteration over its result.
# NOTE(review): faster-whisper is understood to return a lazy generator here,
# with decoding happening during iteration; confirm against its docs.
t0 = time.time()
segments, info = model.transcribe(
    wav_path,
    beam_size=5,
    vad_filter=True,
    vad_parameters=vad_options,
    word_timestamps=True,
)
# Flatten the transcription into one list of {"word", "start", "end"} dicts.
words = []
for segment in segments:
    if not segment.words:
        # No per-word data for this segment: keep its whole text as a single
        # entry that carries the segment's own start/end.
        words.append(
            {"word": segment.text.strip(), "start": segment.start, "end": segment.end}
        )
        continue
    # Keep only words that still have content after stripping whitespace.
    words.extend(
        {"word": token, "start": timed.start, "end": timed.end}
        for timed in segment.words
        if (token := timed.word.strip())
    )
elapsed = time.time() - t0
print(f" Done in {elapsed:.1f}s, {len(words)} words")
# Map words to fine segments with a single forward pass over both lists.
# Assumes fine_segs and words are each in ascending time order - TODO confirm;
# the cursor never moves backwards, so out-of-order segments would lose words.
print("\n=== Map words to fine segments ===")
wi = 0  # cursor into words, shared across all segments
assigned = 0  # segments that ended up with non-empty text
unplaced = 0  # words that did not land in any segment
for fs in fine_segs:
    fstart = fs["start_time"]
    fend = fs["end_time"]
    seg_words = []
    while wi < len(words):
        w = words[wi]
        if w["end"] <= fstart:
            # Word finished before this segment starts: it is skipped for good,
            # so count it instead of dropping it silently.
            unplaced += 1
            wi += 1
            continue
        if w["start"] >= fend:
            # Word starts at or after this segment's end; leave the cursor
            # on it for the next segment.
            break
        # Word overlaps [fstart, fend). A word straddling the end boundary is
        # consumed here and therefore not seen by the following segment.
        seg_words.append(w["word"])
        wi += 1
    text = " ".join(seg_words)
    fs["text"] = text
    if text:
        assigned += 1
# Anything the cursor never reached lies after the last segment.
unplaced += len(words) - wi
print(f" Segments with text: {assigned}/{len(fine_segs)}")
print(f" Words outside any segment: {unplaced}")
# Show one example line.
print("\nSplit segment examples:")
# Whether a segment came from a split is not tracked here, so simply pick the
# first segment whose text is longer than 10 characters.
sample = next((fs for fs in fine_segs if len(fs.get("text", "")) > 10), None)
if sample is not None:
    # This is only a diagnostic print: a missing or None speaker name must not
    # abort the script before the results are saved.
    speaker = sample.get("speaker_name")
    speaker = "?" if speaker is None else str(speaker)
    print(f" [{sample['start_time']:.1f}-{sample['end_time']:.1f}] {speaker:15s} \"{sample['text'][:60]}\"")
# Summarize text lengths (in characters) over all fine segments.
text_lens = [len(fs.get("text", "")) for fs in fine_segs]
# Guard the mean: with no segments the division would raise ZeroDivisionError
# and the script would stop before saving.
avg_len = sum(text_lens) / len(text_lens) if text_lens else 0
print(f"\n Avg text length: {avg_len:.0f} chars")
print(f" Empty texts: {text_lens.count(0)}")
# Save the updated document back over the file it was loaded from.
try:
    fine["_asr_meta"] = {"word_timestamps": True, "asr_runtime_secs": round(elapsed, 1)}
    out_path = f"{BASE}/{UUID}.asrx_fine.json"
    # Write to a sibling file first and swap it in afterwards, so a failure
    # during serialization cannot leave a truncated copy of the input file.
    tmp_out = out_path + ".tmp"
    with open(tmp_out, "w", encoding="utf-8") as out_file:
        json.dump(fine, out_file, indent=2)
    os.replace(tmp_out, out_path)
    print("\nSaved")
finally:
    # Remove the scratch directory holding the extracted WAV even when saving fails.
    shutil.rmtree(tmp_dir, ignore_errors=True)