feat: Phase 1 handover - schema migration, correction mechanism, API fixes
Schema changes: dev.chunks -> dev.chunk, remove old_chunk_id/chunk_index. Correction: asr-1.json format, generate/apply scripts. API: 37/37 endpoints fixed and tested. Docs: HANDOVER_V2.0.md for M4.
This commit is contained in:
74
scripts/test_asr_large_model.py
Normal file
74
scripts/test_asr_large_model.py
Normal file
@@ -0,0 +1,74 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Compare ASR small vs large-v3 on a short test clip.
|
||||
"""
|
||||
import json, time, sys, os
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
CLIP = "/tmp/charade_test_clip.mp4"

# Added to every segment's start/end before it is stored.
# NOTE(review): presumably the clip's start position (in seconds) inside the
# full-length source video -- confirm against how the clip was cut.
CLIP_OFFSET_SECS = 1540

# Model label -> WhisperModel settings. The label is also used verbatim in the
# name of the cached result file (see result_path).
models = {
    "small": {"size": "small", "device": "cpu", "compute": "int8"},
    "large-v3": {"size": "large-v3", "device": "cpu", "compute": "int8"},
}


def result_path(name):
    """Return the path of the cached JSON result for model label *name*.

    Single source of truth for the file name: the writer and every reader go
    through this helper so they cannot drift apart (the original comparison
    step looked for ``asr_large_v3_result.json`` while the writer produced
    ``asr_large-v3_result.json``).
    """
    return f"/tmp/asr_{name}_result.json"


def segment_record(seg, offset=CLIP_OFFSET_SECS):
    """Convert one transcription segment into a JSON-serialisable dict.

    *seg* only needs ``start``, ``end`` and ``text`` attributes. Both
    timestamps are shifted by *offset* and rounded to two decimals; the text
    is stripped of surrounding whitespace.
    """
    return {
        "start": round(seg.start + offset, 2),
        "end": round(seg.end + offset, 2),
        "text": seg.text.strip(),
    }


def text_differences(small_segs, large_segs):
    """Return ``(only_small, only_large)``: texts unique to each segment list.

    Each argument is a list of dicts with a ``"text"`` key. Comparison is by
    exact text equality, independent of timestamps.
    """
    small_texts = {s["text"] for s in small_segs}
    large_texts = {s["text"] for s in large_segs}
    return small_texts - large_texts, large_texts - small_texts


def load_result(name):
    """Load the cached result dict for model label *name*."""
    with open(result_path(name), encoding="utf-8") as fh:
        return json.load(fh)


def transcribe_to_file(name, cfg, outfile):
    """Load the model described by *cfg*, transcribe CLIP, write *outfile*.

    Prints load and transcription timings. The model is a local, so it is
    released when this function returns (the original used ``del model``).
    """
    print(f"\n=== Loading {name} model ===")
    t0 = time.time()
    model = WhisperModel(cfg["size"], device=cfg["device"], compute_type=cfg["compute"])
    print(f" Loaded in {time.time()-t0:.1f}s")

    print(" Transcribing...")
    t0 = time.time()
    segments, info = model.transcribe(
        CLIP,
        beam_size=5,
        vad_filter=True,
        vad_parameters={"min_silence_duration_ms": 500},
    )
    # The elapsed time is taken after the segments iterable has been fully
    # consumed, exactly as in the original loop.
    segs = [segment_record(seg) for seg in segments]
    elapsed = time.time() - t0

    result = {
        "model": name,
        "language": info.language,
        "segments": segs,
        "segment_count": len(segs),
        "duration_secs": round(elapsed, 1),
    }
    # ensure_ascii=False emits raw non-ASCII characters, so pin the encoding
    # instead of relying on the locale default.
    with open(outfile, "w", encoding="utf-8") as fh:
        json.dump(result, fh, indent=2, ensure_ascii=False)
    print(f" Done: {len(segs)} segs in {elapsed:.1f}s")


def main():
    """Transcribe CLIP with every configured model, then print a comparison."""
    for name, cfg in models.items():
        outfile = result_path(name)
        # Results are cached on disk; delete the file to force a re-run.
        if os.path.exists(outfile):
            print(f"{name}: already done, skip")
            continue
        transcribe_to_file(name, cfg, outfile)

    print("\n=== Comparison ===")
    results = {name: load_result(name) for name in models}
    for name, r in results.items():
        print(f"{name}: {r['segment_count']} segs, {r['duration_secs']}s runtime")

    # Show differences
    small = results["small"]["segments"]
    large = results["large-v3"]["segments"]
    only_small, only_large = text_differences(small, large)

    print(f"\nTexts only in small: {len(only_small)}")
    for t in sorted(only_small)[:10]:
        print(f' SMALL: "{t}"')

    print(f"\nTexts only in large: {len(only_large)}")
    for t in sorted(only_large)[:10]:
        print(f' LARGE: "{t}"')

    # Compare segment boundaries
    print("\nSegment time differences (large has more/fewer):")
    print(f" Small: {len(small)} segments")
    print(f" Large: {len(large)} segments")
    print(f" Diff: {len(large) - len(small)}")


# Guarded so that importing this module (e.g. test discovery picking up the
# test_*.py name) does not start loading models.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user