feat: Phase 1 handover - schema migration, correction mechanism, API fixes
Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index. Correction: asr-1.json format, generate/apply scripts. API: 37/37 endpoints fixed and tested. Docs: HANDOVER_V2.0.md for M4.
This commit is contained in:
114
scripts/fix_asr_text.py
Normal file
114
scripts/fix_asr_text.py
Normal file
@@ -0,0 +1,114 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Redo ASR word-timestamp mapping correctly.
|
||||
Save words first, then map to fine segments with independent scanning.
|
||||
"""
|
||||
import json, sys, os, time, subprocess, tempfile, shutil
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
BASE = "/Users/accusys/momentry/output_dev"
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"

# Slack (in seconds) allowed past a fine segment's end when matching entries.
END_TOLERANCE = 0.05


def _read_json(path):
    """Load and return the JSON document stored at *path*."""
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def _write_json_atomic(path, payload, **dump_kwargs):
    """Serialize *payload* to *path* without ever leaving a partial file.

    The data is written to a sibling ``.tmp`` file first and then moved over
    the destination with ``os.replace``, so an interrupted run cannot
    truncate the file being overwritten (the fine-segment file is both the
    input and the output of this script).
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, **dump_kwargs)
    os.replace(tmp_path, path)


def transcribe_words(video_path):
    """Extract mono 16 kHz audio from *video_path* and transcribe it.

    Returns a list of dicts with ``word``, ``start`` and ``end`` keys. After
    the words of each ASR segment, one extra entry flagged with
    ``"_seg": True`` carries the whole segment text as a coarser fallback.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        subprocess.TimeoutExpired: ffmpeg ran longer than 300 seconds.
    """
    tmp_dir = tempfile.mkdtemp(prefix="asr_fix_")
    try:
        wav_path = os.path.join(tmp_dir, "audio.wav")
        subprocess.run(
            ["ffmpeg", "-y", "-v", "quiet", "-i", video_path,
             "-ar", "16000", "-ac", "1", "-sample_fmt", "s16", wav_path],
            check=True, capture_output=True, timeout=300,
        )

        print("Loading model...")
        model = WhisperModel("small", device="cpu", compute_type="int8")

        print("Transcribing with word_timestamps...")
        t0 = time.time()
        segments, _info = model.transcribe(
            wav_path, beam_size=5, vad_filter=True,
            vad_parameters={"min_silence_duration_ms": 500},
            word_timestamps=True,
        )

        # NOTE: faster-whisper documents `segments` as a lazy generator, so it
        # has to be fully consumed here, before the temp WAV is deleted.
        words = []
        for seg in segments:
            if seg.words:
                for w in seg.words:
                    token = w.word.strip()
                    if token:
                        words.append(
                            {"word": token, "start": w.start, "end": w.end}
                        )
            # Also save segment-level text as a fallback.
            words.append({"word": seg.text.strip(), "start": seg.start,
                          "end": seg.end, "_seg": True})

        elapsed = time.time() - t0
        print(f"  {len(words)} entries in {elapsed:.1f}s")
        return words
    finally:
        # Clean up even when ffmpeg or the model fails.
        shutil.rmtree(tmp_dir, ignore_errors=True)


def split_entries(words):
    """Split *words* into ``(word_level, segment_level)`` lists.

    Segment-level entries are the ones carrying a truthy ``_seg`` flag.
    """
    word_entries = [w for w in words if not w.get("_seg")]
    seg_entries = [w for w in words if w.get("_seg")]
    return word_entries, seg_entries


def words_in_range(entries, start, end, tolerance=END_TOLERANCE):
    """Return the ``word`` of every entry lying inside ``[start, end]``.

    An entry matches when it starts at or after *start* and ends no later
    than ``end + tolerance``. *entries* must be sorted by start time: the
    scan stops at the first non-matching entry that starts after *end*.
    Each call scans from the beginning, independently of earlier calls.
    """
    picked = []
    limit = end + tolerance
    for entry in entries:
        if entry["start"] >= start and entry["end"] <= limit:
            picked.append(entry["word"])
        elif entry["start"] > end:
            break  # entries are sorted by time
    return picked


def assign_text(fine_segs, word_entries, seg_entries):
    """Set ``text`` on every fine segment from the timed entries.

    Word-level entries are preferred because they are more precise; the
    segment-level entries are consulted only when no word matched. Segments
    with no match at all get an empty string.

    Returns the number of segments that received non-empty text.
    """
    assigned = 0
    total = len(fine_segs)
    for index, fs in enumerate(fine_segs, start=1):
        fstart = fs["start_time"]
        fend = fs["end_time"]

        seg_words = words_in_range(word_entries, fstart, fend)
        if not seg_words:
            seg_words = words_in_range(seg_entries, fstart, fend)

        fs["text"] = " ".join(seg_words)
        if fs["text"]:
            assigned += 1

        if index % 500 == 0:
            print(f"  {index}/{total}")
    return assigned


def fill_from_asr(fine_segs, asr_segs):
    """Fill still-empty fine segments with the original ASR text.

    A fine segment takes the text of the ASR segment whose ``(start, end)``
    pair equals its ``(start_time, end_time)`` exactly; otherwise its text
    becomes ``""``.

    Returns the number of fine segments whose text is non-blank afterwards.
    """
    asr_bounds = {(s["start"], s["end"]): s["text"] for s in asr_segs}
    for fs in fine_segs:
        if not fs.get("text", "").strip():
            key = (fs["start_time"], fs["end_time"])
            fs["text"] = asr_bounds.get(key, "")
    return sum(1 for fs in fine_segs if fs.get("text", "").strip())


def main():
    """Rebuild the ``text`` of every fine segment and save the result."""
    fine_path = f"{BASE}/{UUID}.asrx_fine.json"
    words_file = f"{BASE}/{UUID}.words.json"

    print("Load fine segments...")
    fine = _read_json(fine_path)
    fine_segs = fine["segments"]
    print(f"{len(fine_segs)} segments")

    # Audio extraction and the model are needed only when nothing is cached.
    if os.path.exists(words_file):
        print("Loading saved words...")
        words = _read_json(words_file)
    else:
        words = transcribe_words(VIDEO)
        _write_json_atomic(words_file, words)

    # Separate word-level and segment-level entries.
    word_entries, seg_entries = split_entries(words)
    print(f"Word-level: {len(word_entries)}, Segment-level: {len(seg_entries)}")

    # Map: for each fine segment, find ALL entries within its time range.
    print("Mapping words to segments...")
    assigned = assign_text(fine_segs, word_entries, seg_entries)
    print(f"Segments with text: {assigned}/{len(fine_segs)}")

    # Fix empty segments: use the original ASR text.
    asr = _read_json(f"{BASE}/{UUID}.asr.json")
    with_text = fill_from_asr(fine_segs, asr["segments"])
    print(f"After fallback: {with_text}/{len(fine_segs)} with text")

    # Save; create the metadata dict if the input file did not have one.
    fine.setdefault("_asr_meta", {})["word_file"] = words_file
    _write_json_atomic(fine_path, fine, indent=2)
    print("Saved")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user