feat: Phase 1 handover - schema migration, correction mechanism, API fixes

Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index. Correction: asr-1.json format, generate/apply scripts. API: 37/37 endpoints fixed and tested. Docs: HANDOVER_V2.0.md for M4.
2026-05-11 07:03:22 +08:00
parent ef894a44ad
commit 39ba5ddf76
147 changed files with 19843 additions and 3053 deletions
--- a/scripts/zero_shot_combined_test.py
+++ b/scripts/zero_shot_combined_test.py
@@ -0,0 +1,120 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Test Grounding DINO Large with COMBINED prompts — one inference per frame.
+
+For every (second, expected label) pair in TIMEPOINTS the script seeks the
+video to that second, runs a single detection pass with COMBINED_PROMPT,
+prints which expected object names occur in the returned labels, and saves
+the frame with the detections drawn on it to OUTPUT_DIR.
+"""
+import json, os, time, cv2, torch
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
+
+MODEL_PATH = "/Users/accusys/momentry_core_0.1/models/gun/grounding-dino-large-hf"
+VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
+OUTPUT_DIR = "/Users/accusys/momentry/output_dev/zero_shot_objects"
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+# (timestamp in seconds, label printed next to the result for that timestamp)
+TIMEPOINTS = [
+    (429, "stamp"), (691, "stamp_letter"), (762, "passport"),
+    (3491, "passport"), (5054, "passport"),
+    (5434, "letter"), (5443, "stamp_envelope"),
+    (5467, "envelope"), (5500, "stamp"), (5506, "stamp"),
+    (5783, "letter"), (5786, "envelope"),
+]
+
+# Every candidate phrase goes into one prompt so each frame needs one inference.
+COMBINED_PROMPT = "stamp. postage stamp. envelope. passport. identification. letter."
+
+# Object names searched for (as substrings) in the detection labels.
+EXPECTED_OBJECTS = ("stamp", "envelope", "passport", "letter")
+
+print("Loading Large model...")
+t0 = time.time()
+processor = AutoProcessor.from_pretrained(MODEL_PATH)
+model = AutoModelForZeroShotObjectDetection.from_pretrained(MODEL_PATH)
+device = "mps" if torch.backends.mps.is_available() else "cpu"
+model.to(device)
+print(f"Loaded in {time.time()-t0:.1f}s")
+
+cap = cv2.VideoCapture(VIDEO)
+if not cap.isOpened():
+    # An unopened capture makes every read fail, so the loop would skip all
+    # timepoints and still report "Done"; stop with a clear message instead.
+    raise SystemExit(f"Cannot open video: {VIDEO}")
+# Fall back to 25 fps when OpenCV reports no frame rate (0).
+fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
+
+print(f"\nTesting {len(TIMEPOINTS)} timepoints with combined prompt...")
+t_infer = time.time()
+
+try:
+    for t_sec, label in TIMEPOINTS:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * fps))
+        ret, frame = cap.read()
+        if not ret or frame is None:
+            # Report the skipped timepoint instead of dropping it silently.
+            print(f"  {t_sec//60}:{t_sec%60:02d} {label:20s} | frame not readable, skipped")
+            continue
+
+        # OpenCV decodes to BGR; the processor is given an RGB PIL image.
+        img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+
+        # ONE inference with ALL prompts
+        inputs = processor(images=img, text=COMBINED_PROMPT, return_tensors="pt").to(device)
+        with torch.no_grad():
+            outputs = model(**inputs)
+        # PIL reports (width, height); reverse it to (height, width).
+        target = torch.tensor([img.size[::-1]])
+        dets = processor.post_process_grounded_object_detection(
+            outputs, threshold=0.1, target_sizes=target
+        )[0]
+
+        # Prefer "text_labels" when the installed transformers provides it;
+        # otherwise use "labels", and fall back to a generic name if neither exists.
+        if "text_labels" in dets:
+            names = dets["text_labels"]
+        elif "labels" in dets:
+            names = dets["labels"]
+        else:
+            names = ["object"] * len(dets["boxes"])
+
+        det_list = [
+            {
+                "bbox": [round(v, 1) for v in box.tolist()],
+                "score": round(score.item(), 3),
+                "label": str(name),
+            }
+            for box, score, name in zip(dets["boxes"], dets["scores"], names)
+        ]
+
+        # Classify which expected objects were found
+        found = {
+            obj
+            for obj in EXPECTED_OBJECTS
+            if any(obj in d["label"].lower() for d in det_list)
+        }
+
+        found_str = ", ".join(sorted(found)) if found else "none"
+        print(f"  {t_sec//60}:{t_sec%60:02d} {label:20s} | {len(det_list)} dets | found: [{found_str}]")
+
+        # Save annotated frame
+        for d in det_list:
+            x1, y1, x2, y2 = [int(v) for v in d["bbox"]]
+            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
+            # Keep the caption inside the image when the box touches the top edge.
+            cv2.putText(frame, f"{d['label']} {d['score']:.2f}", (x1, max(y1 - 5, 12)),
+                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
+
+        out_path = os.path.join(OUTPUT_DIR, f"combined_{t_sec}s.jpg")
+        # imwrite returns False when the file could not be written.
+        if not cv2.imwrite(out_path, frame, [cv2.IMWRITE_JPEG_QUALITY, 85]):
+            print(f"  WARNING: could not write {out_path}")
+finally:
+    # Release the capture even if inference raises part-way through.
+    cap.release()
+
+print(f"\nDone in {time.time()-t_infer:.0f}s")
+print(f"Screenshots: {OUTPUT_DIR}/")