feat: Phase 1 handover - schema migration, correction mechanism, API fixes

Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index
Correction: asr-1.json format, generate/apply scripts
API: 37/37 endpoints fixed and tested
Docs: HANDOVER_V2.0.md for M4
This commit is contained in:
Accusys
2026-05-11 07:03:22 +08:00
parent ef894a44ad
commit 39ba5ddf76
147 changed files with 19843 additions and 3053 deletions

View File

@@ -0,0 +1,83 @@
#!/opt/homebrew/bin/python3.11
"""
Comprehensive ASR Model Selection Benchmark
Tests 5 models × 2 VAD settings across 3 test clips.
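Output: JSON results written incrementally to RESULTS_FILE, plus a JSON summary
printed to stdout (this script does not itself generate a markdown report)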
"""
import json, time, os, gc, sys
from faster_whisper import WhisperModel
# Test clips keyed by label. Only "path" is read by the visible code;
# "offset" is carried along as metadata (presumably the clip's start position
# in the source recording -- TODO confirm unit and meaning).
CLIPS = {
    "A_rapid": {
        "path": "/tmp/asr_clip_A.mp4",
        "offset": 1540,
    },
    "B_normal": {
        "path": "/tmp/asr_clip_B.mp4",
        "offset": 600,
    },
    "C_complex": {
        "path": "/tmp/asr_clip_C.mp4",
        "offset": 4400,
    },
}

# Model size names handed to WhisperModel, in the order they are benchmarked.
MODELS = [
    "tiny",
    "base",
    "small",
    "medium",
    "large-v3",
]

# Values used for the VAD parameter min_silence_duration_ms.
VAD_SETTINGS = [
    200,
    500,
]

# Results are persisted here so an interrupted run can be resumed.
RESULTS_FILE = "/tmp/asr_benchmark_results.json"
def run_transcribe(model, clip_path, clip_name, vad_ms):
    """Transcribe one clip with VAD filtering enabled and time the decode.

    Args:
        model: Object exposing a faster-whisper style ``transcribe`` method.
        clip_path: Path of the media file to transcribe.
        clip_name: Clip label. Unused here; kept so existing callers that
            pass it positionally keep working.
        vad_ms: Value forwarded as ``min_silence_duration_ms`` in the VAD
            parameters.

    Returns:
        A tuple ``(segs, info, elapsed)`` where ``segs`` is a list of
        ``{"start", "end", "text"}`` dicts (times rounded to 2 decimals,
        text stripped), ``info`` is the second value returned by
        ``model.transcribe`` and ``elapsed`` is the duration in seconds.
    """
    vad_params = {"min_silence_duration_ms": vad_ms}

    # perf_counter() is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments (NTP, manual changes) the way time.time() can.
    t0 = time.perf_counter()
    segments, info = model.transcribe(
        clip_path,
        beam_size=5,
        vad_filter=True,
        vad_parameters=vad_params,
    )
    # faster-whisper yields segments lazily, so the actual decoding happens
    # while iterating; the timer must only stop after the loop is exhausted.
    segs = [
        {
            "start": round(seg.start, 2),
            "end": round(seg.end, 2),
            "text": seg.text.strip(),
        }
        for seg in segments
    ]
    elapsed = time.perf_counter() - t0
    return segs, info, elapsed
# Resume support: load results saved by a previous (possibly interrupted) run
# so that combinations already benchmarked can be skipped below.
all_results = {}
if os.path.exists(RESULTS_FILE):
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(RESULTS_FILE) as results_fh:
        all_results = json.load(results_fh)
    print(f"Loaded {sum(len(v) for v in all_results.values())} existing results")

# Progress overview: one test per clip x model x VAD setting.
total = len(CLIPS) * len(MODELS) * len(VAD_SETTINGS)
done = sum(len(v) for v in all_results.values())
print(f"Total: {total} tests, {done} already done, {total-done} remaining\n")
# Run every clip x model x VAD combination that has no stored result yet.
for clip_name, clip_cfg in CLIPS.items():
    # setdefault keeps any results for this clip loaded from a previous run.
    clip_results = all_results.setdefault(clip_name, {})
    for model_size in MODELS:
        for vad_ms in VAD_SETTINGS:
            key = f"{model_size}_vad{vad_ms}"
            if key in clip_results:
                continue  # already benchmarked in an earlier run

            print(f"[{clip_name}] {model_size} VAD={vad_ms}ms ...", end=" ", flush=True)

            # The model is (re)loaded for every test so each result carries
            # its own load_time_secs. perf_counter() is monotonic and thus
            # immune to wall-clock adjustments during long runs.
            t_load = time.perf_counter()
            model = WhisperModel(model_size, device="cpu", compute_type="int8")
            load_time = time.perf_counter() - t_load

            segs, info, trans_time = run_transcribe(
                model, clip_cfg["path"], clip_name, vad_ms
            )

            # Total chars
            total_chars = sum(len(s["text"]) for s in segs)
            clip_results[key] = {
                "model": model_size,
                "vad_ms": vad_ms,
                "segments": segs,
                "segment_count": len(segs),
                "total_chars": total_chars,
                "runtime_secs": round(trans_time, 1),
                "load_time_secs": round(load_time, 1),
                "language": info.language,
            }
            print(f"{len(segs)} segs, {total_chars} chars, {trans_time:.1f}s")

            # Free memory between models
            del model
            gc.collect()

            # Save incrementally. Write to a temporary file, close it, then
            # atomically swap it in: an interruption mid-write can no longer
            # leave a truncated RESULTS_FILE that would break resuming.
            tmp_path = RESULTS_FILE + ".tmp"
            with open(tmp_path, "w") as results_fh:
                json.dump(all_results, results_fh)
            os.replace(tmp_path, RESULTS_FILE)

print("\n=== All tests complete ===")

# Print a compact summary: every stored field except the bulky segment lists.
summary = {
    clip: {
        run_key: {
            field: value
            for field, value in result.items()
            if field != "segments"
        }
        for run_key, result in runs.items()
    }
    for clip, runs in all_results.items()
}
print(json.dumps(summary, indent=2))