Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index. Correction: asr-1.json format, generate/apply scripts. API: 37/37 endpoints fixed and tested. Docs: HANDOVER_V2.0.md for M4.
75 lines
2.4 KiB
Python
75 lines
2.4 KiB
Python
#!/opt/homebrew/bin/python3.11
|
|
"""
|
|
Compare ASR small vs large-v3 on a short test clip.
|
|
"""
|
|
import json, time, sys, os
|
|
from faster_whisper import WhisperModel
|
|
|
|
# Path of the short test clip that every model transcribes.
CLIP = "/tmp/charade_test_clip.mp4"

# One entry per model under comparison, keyed by the label used in output
# file names. Both run on CPU with int8 compute; only the model size differs.
models = {
    label: {"size": label, "device": "cpu", "compute": "int8"}
    for label in ("small", "large-v3")
}
|
|
|
|
# Offset in seconds added to every segment timestamp before it is saved.
# NOTE(review): presumably the position of the test clip inside the full
# recording -- confirm against how the clip was cut.
CLIP_OFFSET_SECS = 1540

# Transcribe the clip once per configured model and cache each result as JSON.
for name, cfg in models.items():
    outfile = f"/tmp/asr_{name}_result.json"
    # A result file on disk means this model already ran; skip the slow work.
    if os.path.exists(outfile):
        print(f"{name}: already done, skip")
        continue

    print(f"\n=== Loading {name} model ===")
    # perf_counter is monotonic, so the measured durations cannot be skewed
    # by wall-clock adjustments.
    t0 = time.perf_counter()
    model = WhisperModel(cfg["size"], device=cfg["device"], compute_type=cfg["compute"])
    print(f" Loaded in {time.perf_counter()-t0:.1f}s")

    print(" Transcribing...")
    t0 = time.perf_counter()
    segments, info = model.transcribe(
        CLIP,
        beam_size=5,
        vad_filter=True,
        vad_parameters={"min_silence_duration_ms": 500},
    )
    # `segments` is consumed before the timer stops so that `elapsed` covers
    # iterating over it as well as the transcribe() call itself.
    segs = [
        {
            "start": round(seg.start + CLIP_OFFSET_SECS, 2),
            "end": round(seg.end + CLIP_OFFSET_SECS, 2),
            "text": seg.text.strip(),
        }
        for seg in segments
    ]
    elapsed = time.perf_counter() - t0

    result = {
        "model": name,
        "language": info.language,
        "segments": segs,
        "segment_count": len(segs),
        "duration_secs": round(elapsed, 1),
    }
    # Write to a temp file and rename it into place: the skip check above
    # trusts any existing outfile, so a half-written file left by an
    # interrupted run must never appear under the final name.
    # UTF-8 is explicit because ensure_ascii=False emits raw non-ASCII text.
    tmpfile = outfile + ".tmp"
    with open(tmpfile, "w", encoding="utf-8") as fh:
        json.dump(result, fh, indent=2, ensure_ascii=False)
    os.replace(tmpfile, outfile)
    print(f" Done: {len(segs)} segs in {elapsed:.1f}s")
    del model  # free memory
|
|
|
|
def _load_result(model_name):
    """Return the cached transcription result dict saved for *model_name*.

    The path is derived from the model key exactly as the transcription loop
    derives it, so the reader and the writer cannot disagree on the file name.
    """
    with open(f"/tmp/asr_{model_name}_result.json", encoding="utf-8") as fh:
        return json.load(fh)


print("\n=== Comparison ===")
# Load each result file once and keep it for the detailed comparison below.
results = {}
for name in models:
    r = results[name] = _load_result(name)
    print(f"{name}: {r['segment_count']} segs, {r['duration_secs']}s runtime")

# Show differences
# The large model's key is "large-v3" (hyphen), which is also how its result
# file is named; a path spelled "large_v3" does not exist.
small = results["small"]["segments"]
large = results["large-v3"]["segments"]

# Compare by exact segment text; timestamps are ignored here.
small_texts = {s["text"] for s in small}
large_texts = {s["text"] for s in large}

only_small = small_texts - large_texts
only_large = large_texts - small_texts

# Print at most the first ten (sorted) texts unique to each model.
print(f"\nTexts only in small: {len(only_small)}")
for t in sorted(only_small)[:10]:
    print(f" SMALL: \"{t}\"")

print(f"\nTexts only in large: {len(only_large)}")
for t in sorted(only_large)[:10]:
    print(f" LARGE: \"{t}\"")

# Compare segment boundaries
print("\nSegment time differences (large has more/fewer):")
print(f" Small: {len(small)} segments")
print(f" Large: {len(large)} segments")
print(f" Diff: {len(large) - len(small)}")