Files
momentry_core/scripts/test_asr_large_model.py
Accusys 39ba5ddf76 feat: Phase 1 handover - schema migration, correction mechanism, API fixes
Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index
Correction: asr-1.json format, generate/apply scripts
API: 37/37 endpoints fixed and tested
Docs: HANDOVER_V2.0.md for M4
2026-05-11 07:03:22 +08:00

75 lines
2.4 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
Compare ASR small vs large-v3 on a short test clip.
"""
import json, time, sys, os
from faster_whisper import WhisperModel
# Media file that every model below transcribes.
CLIP = "/tmp/charade_test_clip.mp4"

# Model configurations under comparison, keyed by label. Both entries use the
# label as the model size and run on CPU with the int8 compute type.
models = {
    label: {"size": label, "device": "cpu", "compute": "int8"}
    for label in ("small", "large-v3")
}
# Seconds added to every segment timestamp before it is saved.
# NOTE(review): presumably the clip's start position inside the full source
# video, so saved times line up with the original timeline - confirm.
CLIP_OFFSET_SECS = 1540

# Transcribe CLIP once per configured model and save each result as JSON.
for name, cfg in models.items():
    outfile = f"/tmp/asr_{name}_result.json"

    # A result file left by an earlier run is reused instead of re-transcribing.
    if os.path.exists(outfile):
        print(f"{name}: already done, skip")
        continue

    print(f"\n=== Loading {name} model ===")
    t0 = time.time()
    model = WhisperModel(cfg["size"], device=cfg["device"], compute_type=cfg["compute"])
    print(f" Loaded in {time.time()-t0:.1f}s")

    print(" Transcribing...")
    t0 = time.time()
    segments, info = model.transcribe(
        CLIP,
        beam_size=5,
        vad_filter=True,
        vad_parameters={"min_silence_duration_ms": 500},
    )
    # The timer is stopped only after `segments` has been fully consumed, so
    # the reported runtime covers iterating the segments as well as the
    # transcribe() call itself.
    segs = [
        {
            "start": round(seg.start + CLIP_OFFSET_SECS, 2),
            "end": round(seg.end + CLIP_OFFSET_SECS, 2),
            "text": seg.text.strip(),
        }
        for seg in segments
    ]
    elapsed = time.time() - t0

    result = {
        "model": name,
        "language": info.language,
        "segments": segs,
        "segment_count": len(segs),
        "duration_secs": round(elapsed, 1),
    }
    # Write through a context manager so the handle is flushed and closed
    # before the comparison step reads the file back.
    with open(outfile, "w") as fh:
        json.dump(result, fh, indent=2, ensure_ascii=False)
    print(f" Done: {len(segs)} segs in {elapsed:.1f}s")

    del model  # free memory before the next model is loaded
def _load_result(name):
    """Return the saved result dict for model label *name*.

    The path uses the same ``/tmp/asr_{name}_result.json`` pattern the
    transcription step writes to, so labels containing a hyphen (such as
    ``large-v3``) resolve to the file that was actually written.
    """
    with open(f"/tmp/asr_{name}_result.json") as fh:
        return json.load(fh)


print("\n=== Comparison ===")
for name in models:
    r = _load_result(name)
    print(f"{name}: {r['segment_count']} segs, {r['duration_secs']}s runtime")

# Show differences: segment texts that appear in one model's output only.
small = _load_result("small")["segments"]
# The label is "large-v3" with a hyphen; the previous hard-coded path used an
# underscore ("large_v3") and never matched the file that was written.
large = _load_result("large-v3")["segments"]

small_texts = {s["text"] for s in small}
large_texts = {s["text"] for s in large}
only_small = small_texts - large_texts
only_large = large_texts - small_texts

# At most the first ten texts (in sorted order) are listed for each side.
print(f"\nTexts only in small: {len(only_small)}")
for t in sorted(only_small)[:10]:
    print(f" SMALL: \"{t}\"")
print(f"\nTexts only in large: {len(only_large)}")
for t in sorted(only_large)[:10]:
    print(f" LARGE: \"{t}\"")

# Compare segment boundaries (only the segment counts are compared here).
print("\nSegment time differences (large has more/fewer):")
print(f" Small: {len(small)} segments")
print(f" Large: {len(large)} segments")
print(f" Diff: {len(large) - len(small)}")