# momentry_core/scripts/gdino_comparison_test.py

#!/opt/homebrew/bin/python3.11
"""
Grounding DINO Base vs Large comparison test.
Both use Swin-B backbone; Large trained on 7 datasets vs Base's 3.
"""
import json, os, sys, time, cv2, torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

# Machine-specific absolute paths: source video, output directory, and the
# local checkpoint directory of the "large" model.
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
OUTPUT_DIR = "/Users/accusys/momentry/output_dev/gdino_comparison"
LARGE_PATH = "/Users/accusys/momentry_core_0.1/models/gun/grounding-dino-large-hf"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# (offset in seconds, label used as the prefix of result keys) per sampled frame.
TIMEPOINTS = [
    (2646, "2646s"), (3188, "3188s"), (3697, "3697s"), (5341, "5341s"),
    (5461, "5461s"), (6309, "6309s"), (6377, "6377s"), (6479, "6479s"),
]
# Text prompts; each one is run as a separate query against every frame.
PROMPTS = ["gun", "pistol", "rifle", "weapon"]

cap = cv2.VideoCapture(VIDEO)
if not cap.isOpened():
    # Fail fast: without a readable video every frame lookup returns None, and
    # the script would load both models only to report "no detections".
    sys.exit(f"Cannot open video: {VIDEO}")
# Fall back to 25 fps when the reported frame rate is falsy (e.g. 0).
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0

def get_frame(t_sec):
    """Return the frame at ``t_sec`` seconds of the module-level capture.

    The offset is converted to a frame index using the module-level ``fps``.
    Returns ``None`` when the capture fails to deliver a frame.
    """
    frame_index = int(t_sec * fps)
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
    ok, image = cap.read()
    if not ok:
        return None
    return image

# Checkpoints under comparison: "path" is handed to from_pretrained(), "label"
# is only used in console output.
models = dict(
    base=dict(path="IDEA-Research/grounding-dino-base", label="Base (3 datasets)"),
    large=dict(path=LARGE_PATH, label="Large (7 datasets)"),
)

# Filled per model name with timing and detections.
all_results = dict()

# Use Apple's MPS backend when present, otherwise run on the CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Device: {device}")

# Run every prompt against every sampled frame, one model at a time, so that
# only one model is loaded at any moment.
for model_name, model_info in models.items():
    print(f"\n{'='*60}")
    print(f"Loading {model_info['label']} ({model_name})...")
    print(f"{'='*60}")

    t_load = time.time()
    processor = AutoProcessor.from_pretrained(model_info["path"])
    model = AutoModelForZeroShotObjectDetection.from_pretrained(model_info["path"]).to(device)
    load_time = time.time() - t_load
    print(f"  Loaded in {load_time:.1f}s")

    # Maps "<timepoint label>_prompt-<prompt>" to a list of detection dicts.
    model_dets = {}
    t0 = time.time()

    for t_sec, label in TIMEPOINTS:
        frame = get_frame(t_sec)
        if frame is None:
            # Keep going, but make the gap visible instead of skipping silently.
            print(f"  WARNING: could not read frame at {t_sec}s; skipping")
            continue
        img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        # PIL reports (width, height); reversed to (height, width) for
        # post-processing. Identical for every prompt, so built once per frame.
        target = torch.tensor([img.size[::-1]])

        for prompt in PROMPTS:
            # The text query is the prompt followed by a period.
            inputs = processor(images=img, text=f"{prompt}.", return_tensors="pt").to(device)
            with torch.no_grad():
                outputs = model(**inputs)
            dets = processor.post_process_grounded_object_detection(
                outputs, threshold=0.05, target_sizes=target
            )[0]

            model_dets[f"{label}_prompt-{prompt}"] = [
                {
                    "bbox": [round(v, 1) for v in box.tolist()],
                    "score": round(score.item(), 3),
                    "label": prompt,
                }
                for box, score in zip(dets["boxes"], dets["scores"])
            ]

    elapsed = time.time() - t0
    all_results[model_name] = {"elapsed": round(elapsed, 1), "detections": model_dets}
    print(f"  Inference: {elapsed:.1f}s")

    # Drop the model before loading the next one. The MPS cache call is only
    # valid on the MPS backend, so skip it on the CPU fallback.
    del model
    if device == "mps":
        torch.mps.empty_cache()

cap.release()

# ========== Summary ==========
print(f"\n{'='*60}")
print("COMPARISON SUMMARY")
print(f"{'='*60}")

# Number of (timepoint, prompt) pairs attempted per model; derived from the
# configuration so the denominator stays correct if either list changes.
n_pairs = len(TIMEPOINTS) * len(PROMPTS)

for model_name, result in all_results.items():
    dets = result["detections"]
    # A "hit" is a (timepoint, prompt) pair with at least one detection.
    hits = sum(1 for v in dets.values() if v)
    total = sum(len(v) for v in dets.values())
    print(f"\n{model_name.upper()} ({result['elapsed']}s): {hits}/{n_pairs} prompt-timepoint hits, {total} total detections")

    for t_sec, label in TIMEPOINTS:
        # Collect (prompt, score) for every detection at this timepoint.
        # Missing keys (e.g. an unreadable frame) simply contribute nothing.
        candidates = [
            (p, dd["score"])
            for p in PROMPTS
            for dd in dets.get(f"{label}_prompt-{p}", [])
        ]
        if candidates:
            best = max(candidates, key=lambda x: x[1])
            print(f"  {t_sec}s ({(t_sec//60)}:{t_sec%60:02d}): best={best[1]:.3f} (prompt='{best[0]}')")
        else:
            print(f"  {t_sec}s: no detections")

# Per-timepoint comparison
print(f"\n{'='*60}")
print("PER-TIMEPOINT COMPARISON")
print(f"{'='*60}")
for t_sec, label in TIMEPOINTS:
    # Best (prompt, score) per model at this timepoint, or None when the model
    # produced no detection for any prompt.
    best_by_model = {}
    for mn in ["base", "large"]:
        dets = all_results[mn]["detections"]
        candidates = [
            (p, dd["score"])
            for p in PROMPTS
            for dd in dets.get(f"{label}_prompt-{p}", [])
        ]
        # max() keeps the first of equal scores, i.e. the earliest prompt wins ties.
        best_by_model[mn] = max(candidates, key=lambda c: c[1]) if candidates else None
    base_best = best_by_model["base"]
    large_best = best_by_model["large"]

    b_str = f"base={base_best[1]:.3f} ({base_best[0]})" if base_best else "base=no det"
    l_str = f"large={large_best[1]:.3f} ({large_best[0]})" if large_best else "large=no det"

    # Score difference (large minus base), shown only when both models detected something.
    delta = ""
    if base_best and large_best:
        diff = large_best[1] - base_best[1]
        delta = f" ({'+'if diff>0 else ''}{diff:.3f})"

    print(f"  {t_sec}s: {b_str:30s} | {l_str:30s}{delta}")

# Save
results_path = os.path.join(OUTPUT_DIR, "comparison_results.json")
# Use a context manager so the file is flushed and closed deterministically.
with open(results_path, "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=2)
print(f"\nSaved to {OUTPUT_DIR}/")