Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index Correction: asr-1.json format, generate/apply scripts API: 37/37 endpoints fixed and tested Docs: HANDOVER_V2.0.md for M4
84 lines
3.1 KiB
Python
84 lines
3.1 KiB
Python
#!/opt/homebrew/bin/python3.11
|
||
"""
|
||
Comprehensive ASR Model Selection Benchmark
|
||
Tests 5 models × 2 VAD settings across 3 test clips.
|
||
Output: JSON results + markdown report
|
||
"""
|
||
import json, time, os, gc, sys
|
||
from faster_whisper import WhisperModel
|
||
|
||
# Clips to benchmark, keyed by a short label that is also used as the
# top-level key in the results file. "path" is the media file passed to the
# transcriber. "offset" is not read anywhere in the visible code
# (presumably the clip's position in the source recording -- TODO confirm).
CLIPS = {
    "A_rapid": {
        "path": "/tmp/asr_clip_A.mp4",
        "offset": 1540,
    },
    "B_normal": {
        "path": "/tmp/asr_clip_B.mp4",
        "offset": 600,
    },
    "C_complex": {
        "path": "/tmp/asr_clip_C.mp4",
        "offset": 4400,
    },
}

# Model size names handed to WhisperModel, in the order they are run.
MODELS = [
    "tiny",
    "base",
    "small",
    "medium",
    "large-v3",
]

# Values used for the VAD "min_silence_duration_ms" parameter.
VAD_SETTINGS = [200, 500]

# Results are written here after every test and reloaded on start-up so
# that already-completed tests are skipped.
RESULTS_FILE = "/tmp/asr_benchmark_results.json"
|
||
|
||
def run_transcribe(model, clip_path, clip_name, vad_ms):
    """Transcribe one clip with ``model`` and time the full decode.

    Args:
        model: Object exposing ``transcribe(path, beam_size=..., vad_filter=...,
            vad_parameters=...)`` that returns ``(segments, info)``. The
            caller in this script passes a ``faster_whisper.WhisperModel``.
        clip_path: Path of the media file to transcribe.
        clip_name: Label of the clip. Not used inside this function; kept in
            the signature so existing callers keep working.
        vad_ms: Value passed to the VAD as ``min_silence_duration_ms``.

    Returns:
        A ``(segs, info, elapsed)`` tuple. ``segs`` is a list of dicts with
        ``"start"`` / ``"end"`` (rounded to 2 decimals) and ``"text"``
        (whitespace-stripped). ``info`` is the second value returned by
        ``model.transcribe``. ``elapsed`` is the duration in seconds of the
        ``transcribe`` call plus the iteration over all segments.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    segments, info = model.transcribe(
        clip_path,
        beam_size=5,
        vad_filter=True,
        vad_parameters={"min_silence_duration_ms": vad_ms},
    )
    # The segments are consumed inside the timed region on purpose:
    # faster-whisper documents ``segments`` as a lazy generator, so the
    # decoding work happens while iterating, not in transcribe() itself.
    segs = [
        {
            "start": round(seg.start, 2),
            "end": round(seg.end, 2),
            "text": seg.text.strip(),
        }
        for seg in segments
    ]
    elapsed = time.perf_counter() - started
    return segs, info, elapsed
|
||
|
||
# Resume support: reload results saved by a previous run so that tests which
# already have an entry are skipped by the main loop.
all_results = {}
if os.path.exists(RESULTS_FILE):
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(RESULTS_FILE, encoding="utf-8") as results_fh:
        all_results = json.load(results_fh)
    print(f"Loaded {sum(len(v) for v in all_results.values())} existing results")

# Size of the full test matrix versus what is already recorded on disk.
# ``done`` counts every stored entry, including any whose clip/model/VAD key
# is no longer part of the current matrix.
total = len(CLIPS) * len(MODELS) * len(VAD_SETTINGS)
done = sum(len(v) for v in all_results.values())
print(f"Total: {total} tests, {done} already done, {total-done} remaining\n")
|
||
|
||
# Run every (clip, model, VAD) combination that has no stored result yet.
for clip_name, clip_cfg in CLIPS.items():
    if clip_name not in all_results:
        all_results[clip_name] = {}

    for model_size in MODELS:
        for vad_ms in VAD_SETTINGS:
            key = f"{model_size}_vad{vad_ms}"
            if key in all_results[clip_name]:
                # Already benchmarked in a previous (possibly interrupted) run.
                continue

            print(f"[{clip_name}] {model_size} VAD={vad_ms}ms ...", end=" ", flush=True)

            # The model is loaded fresh for every test so that each result
            # carries its own load time; perf_counter() is monotonic and
            # therefore safe for measuring durations.
            t_load = time.perf_counter()
            model = WhisperModel(model_size, device="cpu", compute_type="int8")
            load_time = time.perf_counter() - t_load

            segs, info, trans_time = run_transcribe(model, clip_cfg["path"], clip_name, vad_ms)

            # Total number of transcribed characters across all segments.
            total_chars = sum(len(s["text"]) for s in segs)

            all_results[clip_name][key] = {
                "model": model_size,
                "vad_ms": vad_ms,
                "segments": segs,
                "segment_count": len(segs),
                "total_chars": total_chars,
                "runtime_secs": round(trans_time, 1),
                "load_time_secs": round(load_time, 1),
                "language": info.language,
            }
            print(f"{len(segs)} segs, {total_chars} chars, {trans_time:.1f}s")

            # Drop the model before the next one is loaded to keep peak
            # memory down.
            del model
            gc.collect()

            # Save after every test so an interrupted run can resume. The
            # data is written to a temporary file and then moved into place:
            # a crash mid-write can no longer leave RESULTS_FILE truncated,
            # and the handle is closed (and flushed) before the rename.
            tmp_results_file = RESULTS_FILE + ".tmp"
            with open(tmp_results_file, "w", encoding="utf-8") as results_fh:
                json.dump(all_results, results_fh)
            os.replace(tmp_results_file, RESULTS_FILE)
|
||
|
||
print("\n=== All tests complete ===")

# Echo every stored result as indented JSON, leaving out the "segments"
# list of each entry so that only the summary fields are printed.
summary = {}
for clip_label, runs in all_results.items():
    summary[clip_label] = {
        run_key: {
            field: value
            for field, value in run.items()
            if field != "segments"
        }
        for run_key, run in runs.items()
    }
print(json.dumps(summary, indent=2))
|