feat: Phase 1 handover - schema migration, correction mechanism, API fixes

Schema changes: dev.chunks -> dev.chunk, remove old_chunk_id/chunk_index. Correction: asr-1.json format, generate/apply scripts. API: 37/37 endpoints fixed and tested. Docs: HANDOVER_V2.0.md for M4.
2026-05-11 07:03:22 +08:00
parent ef894a44ad
commit 39ba5ddf76
147 changed files with 19843 additions and 3053 deletions
--- a/scripts/test_asr_large_model.py
+++ b/scripts/test_asr_large_model.py
@@ -0,0 +1,118 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Compare ASR small vs large-v3 on a short test clip.
+
+Each model's result is cached as JSON under /tmp; a model whose result file
+already exists is skipped, so delete that file to force a re-run.
+"""
+import json
+import os
+import sys
+import time
+
+from faster_whisper import WhisperModel
+
+CLIP = "/tmp/charade_test_clip.mp4"
+
+# Seconds added to every segment timestamp before it is saved.
+# NOTE(review): presumably the clip's start position inside the full source
+# video -- confirm before pointing CLIP at a different file.
+CLIP_OFFSET_SECS = 1540
+
+models = {
+    "small": {"size": "small", "device": "cpu", "compute": "int8"},
+    "large-v3": {"size": "large-v3", "device": "cpu", "compute": "int8"},
+}
+
+
+def result_path(name):
+    """Return the JSON result path for the model key *name*."""
+    return f"/tmp/asr_{name}_result.json"
+
+
+def load_result(name):
+    """Load the saved result dict for the model key *name*."""
+    with open(result_path(name), encoding="utf-8") as fh:
+        return json.load(fh)
+
+
+def save_result(name, result):
+    """Write *result* as JSON to the result path for the model key *name*."""
+    # ensure_ascii=False emits raw non-ASCII text, so pin UTF-8 rather than
+    # depending on the locale's default encoding.
+    with open(result_path(name), "w", encoding="utf-8") as fh:
+        json.dump(result, fh, indent=2, ensure_ascii=False)
+
+
+for name, cfg in models.items():
+    if os.path.exists(result_path(name)):
+        print(f"{name}: already done, skip")
+        continue
+
+    print(f"\n=== Loading {name} model ===")
+    t0 = time.perf_counter()
+    model = WhisperModel(cfg["size"], device=cfg["device"], compute_type=cfg["compute"])
+    print(f"  Loaded in {time.perf_counter() - t0:.1f}s")
+
+    print("  Transcribing...")
+    t0 = time.perf_counter()
+    segments, info = model.transcribe(
+        CLIP,
+        beam_size=5,
+        vad_filter=True,
+        vad_parameters={"min_silence_duration_ms": 500},
+    )
+    # faster-whisper documents `segments` as a lazy generator, so decoding
+    # happens while it is consumed; keep this inside the timed region.
+    segs = [
+        {
+            "start": round(seg.start + CLIP_OFFSET_SECS, 2),
+            "end": round(seg.end + CLIP_OFFSET_SECS, 2),
+            "text": seg.text.strip(),
+        }
+        for seg in segments
+    ]
+    elapsed = time.perf_counter() - t0
+
+    result = {
+        "model": name,
+        "language": info.language,
+        "segments": segs,
+        "segment_count": len(segs),
+        "duration_secs": round(elapsed, 1),
+    }
+    save_result(name, result)
+    print(f"  Done: {len(segs)} segs in {elapsed:.1f}s")
+    del model  # free memory
+
+print("\n=== Comparison ===")
+results = {}
+for name in models:
+    r = results[name] = load_result(name)
+    print(f"{name}: {r['segment_count']} segs, {r['duration_secs']}s runtime")
+
+# Show differences
+# Index by the same keys used in `models` so the paths match what the loop
+# above wrote (the large model's file is asr_large-v3_result.json).
+small = results["small"]["segments"]
+large = results["large-v3"]["segments"]
+
+small_texts = {s["text"] for s in small}
+large_texts = {s["text"] for s in large}
+
+only_small = small_texts - large_texts
+only_large = large_texts - small_texts
+
+print(f"\nTexts only in small: {len(only_small)}")
+for t in sorted(only_small)[:10]:
+    print(f"  SMALL: \"{t}\"")
+
+print(f"\nTexts only in large: {len(only_large)}")
+for t in sorted(only_large)[:10]:
+    print(f"  LARGE: \"{t}\"")
+
+# Compare segment counts (individual boundaries are not compared here)
+print("\nSegment time differences (large has more/fewer):")
+print(f"  Small: {len(small)} segments")
+print(f"  Large: {len(large)} segments")
+print(f"  Diff: {len(large) - len(small)}")