# momentry_core/scripts/compare_models_gun_test.py

#!/opt/homebrew/bin/python3.11
"""
Comparison test: Grounding DINO Base vs Florence-2 Base vs Florence-2 Large
Tests on 8 known timepoints with gun prompts.
"""
import json, os, sys, time, cv2, torch
from PIL import Image

# Source video and output directory for this comparison run (machine-specific paths).
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
OUTPUT_DIR = "/Users/accusys/momentry/output_dev/model_comparison"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# (offset into the video in seconds, label used as the key prefix in the results)
TIMEPOINTS = [
    (2646, "2646s"), (3188, "3188s"), (3697, "3697s"),
    (5341, "5341s"), (5461, "5461s"), (6309, "6309s"),
    (6377, "6377s"), (6479, "6479s"),
]
# Prompt name (used in result keys) -> prompt text passed to each model.
PROMPTS = {"gun": "gun.", "pistol": "pistol."}
# Use the MPS backend when PyTorch reports it as available, otherwise the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Grab one frame per timepoint. Frames are kept as OpenCV arrays keyed by the
# timepoint label; the model sections convert them to RGB PIL images.
cap = cv2.VideoCapture(VIDEO)
if not cap.isOpened():
    # Without a readable video every read below fails and the script would go
    # on to load three models just to report nothing, so stop right away.
    sys.exit(f"Cannot open video: {VIDEO}")
try:
    # Fall back to 25 fps when the container does not report a frame rate.
    fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
    frames = {}
    for t_sec, label in TIMEPOINTS:
        # Seek by frame index derived from the timestamp.
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * fps))
        ret, frame = cap.read()
        if ret:
            frames[label] = frame
        else:
            # Keep going (best effort) but make the gap visible.
            print(f"  WARNING: could not read frame at {label}", file=sys.stderr)
finally:
    cap.release()
print(f"Loaded {len(frames)} frames")

# Per-model results:
#   {model name: {"elapsed": seconds, "detections": {"<label>_<prompt name>": [scores]}}}
all_results = {}

# ========== Grounding DINO Base ==========
print("\n" + "="*60)
print("Grounding DINO Base")
print("="*60)
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
t0 = time.time()  # the reported time includes model loading as well as inference
gd_proc = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-base")
gd_model = AutoModelForZeroShotObjectDetection.from_pretrained("IDEA-Research/grounding-dino-base").to(device)
gd_dets = {}
for label, frame in frames.items():
    # OpenCV frames are BGR; PIL expects RGB.
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    # PIL's size is (width, height); reversed here to (height, width).
    # Computed once per frame because it does not depend on the prompt.
    target = torch.tensor([img.size[::-1]])
    for pname, prompt in PROMPTS.items():
        inputs = gd_proc(images=img, text=prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = gd_model(**inputs)
        dets = gd_proc.post_process_grounded_object_detection(outputs, threshold=0.1, target_sizes=target)[0]
        # One rounded confidence per detection; an empty result yields [].
        gd_dets[f"{label}_{pname}"] = [round(s.item(), 3) for s in dets["scores"]]
all_results["grounding-dino-base"] = {"elapsed": round(time.time()-t0, 1), "detections": gd_dets}
print(f"  Done in {all_results['grounding-dino-base']['elapsed']}s")
# Drop the model before the next one is loaded. The MPS cache is only flushed
# when the script is actually running on MPS, so the CPU fallback selected by
# `device` does not call into an unavailable backend.
del gd_model
if device == "mps":
    torch.mps.empty_cache()

# ========== Florence-2 Base ==========
print("\n" + "="*60)
print("Florence-2 Base")
print("="*60)
import re
from transformers import AutoProcessor, AutoModelForCausalLM
t0 = time.time()  # the reported time includes model loading as well as inference
f2b_proc = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
f2b_model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True).to(device)
f2b_dets = {}
# NOTE(review): "<OD>" is a fixed-prompt task in Florence-2 and its processor
# is understood to reject any text appended to it, so a phrase such as "gun."
# has to go through the text-conditioned open-vocabulary task instead.
# Confirm against the model card / processing code of the pinned revision.
task = "<OPEN_VOCABULARY_DETECTION>"
# Location tokens in the decoded output; compiled once, outside the loops.
loc_token = re.compile(r'<loc_\d+>')
for label, frame in frames.items():
    # OpenCV frames are BGR; PIL expects RGB.
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    for pname, prompt_text in PROMPTS.items():
        text = f"{task}{prompt_text}"
        inputs = f2b_proc(text=text, images=img, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = f2b_model.generate(**inputs, max_new_tokens=100, num_beams=3)
        # Keep special tokens so the <loc_N> markers survive decoding.
        result = f2b_proc.decode(outputs[0], skip_special_tokens=False)
        # Four location tokens are counted as one box. This is the same rule
        # the Florence-2 Large section applies, which keeps the two Florence
        # results comparable. Florence-2 does not emit a confidence here, so
        # every counted box is recorded with a placeholder score of 1.0.
        n_dets = len(loc_token.findall(result)) // 4
        f2b_dets[f"{label}_{pname}"] = [1.0] * n_dets
all_results["florence2-base"] = {"elapsed": round(time.time()-t0, 1), "detections": f2b_dets}
print(f"  Done in {all_results['florence2-base']['elapsed']}s")
# Drop the model before the next one is loaded; only flush the MPS cache when
# the script is actually running on MPS.
del f2b_model
if device == "mps":
    torch.mps.empty_cache()

# ========== Florence-2 Large ==========
print("\n" + "="*60)
print("Florence-2 Large")
print("="*60)
import re
t0 = time.time()  # the reported time includes model loading as well as inference
f2l_proc = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)
f2l_model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True).to(device)
f2l_dets = {}
# NOTE(review): "<OD>" is a fixed-prompt task in Florence-2 and its processor
# is understood to reject any text appended to it, so a phrase such as "gun."
# has to go through the text-conditioned open-vocabulary task instead.
# Confirm against the model card / processing code of the pinned revision.
task = "<OPEN_VOCABULARY_DETECTION>"
# Location tokens in the decoded output; compiled once, outside the loops.
loc_token = re.compile(r'<loc_\d+>')
for label, frame in frames.items():
    # OpenCV frames are BGR; PIL expects RGB.
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    for pname, prompt_text in PROMPTS.items():
        text = f"{task}{prompt_text}"
        inputs = f2l_proc(text=text, images=img, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = f2l_model.generate(**inputs, max_new_tokens=100, num_beams=3)
        # Keep special tokens so the <loc_N> markers survive decoding.
        result = f2l_proc.decode(outputs[0], skip_special_tokens=False)
        # Four location tokens are counted as one box. Florence-2 does not
        # emit a confidence here, so every counted box is recorded with a
        # placeholder score of 1.0.
        n_dets = len(loc_token.findall(result)) // 4
        f2l_dets[f"{label}_{pname}"] = [1.0] * n_dets
all_results["florence2-large"] = {"elapsed": round(time.time()-t0, 1), "detections": f2l_dets}
print(f"  Done in {all_results['florence2-large']['elapsed']}s")
# Drop the model once finished; only flush the MPS cache when the script is
# actually running on MPS.
del f2l_model
if device == "mps":
    torch.mps.empty_cache()

# ========== Summary ==========
# One row per model: elapsed time, then for each prompt the number of
# timepoints with at least one detection and the best score seen.
print("\n" + "="*60)
print(f"{'Model':<25} {'Time':>8} {'Gun hits':>10} {'Gun best':>10} {'Pistol hits':>12} {'Pistol best':>10}")
print("-"*75)
n_points = len(TIMEPOINTS)
for model_name in ["grounding-dino-base", "florence2-base", "florence2-large"]:
    d = all_results[model_name]
    dets = d["detections"]
    gun_scores = []
    pistol_scores = []
    gun_hits = 0
    pistol_hits = 0
    # TIMEPOINTS holds (seconds, label) pairs and the detection keys were
    # stored as "<label>_<prompt name>", so the label is used as-is.
    for _, label in TIMEPOINTS:
        g = dets.get(f"{label}_gun", [])
        p = dets.get(f"{label}_pistol", [])
        gun_scores.extend(g)
        pistol_scores.extend(p)
        # A "hit" is a timepoint with at least one detection, so the count
        # can never exceed the number of timepoints shown after the slash.
        gun_hits += int(any(s > 0 for s in g))
        pistol_hits += int(any(s > 0 for s in p))
    gun_best = max(gun_scores) if gun_scores else 0
    pistol_best = max(pistol_scores) if pistol_scores else 0
    print(f"{model_name:<25} {d['elapsed']:>7.1f}s {gun_hits:>6d}/{n_points} {gun_best:>8.3f} {pistol_hits:>6d}/{n_points} {pistol_best:>8.3f}")

# Persist the raw per-model results; the context manager closes the file.
with open(os.path.join(OUTPUT_DIR, "model_comparison.json"), "w") as fh:
    json.dump(all_results, fh, indent=2)
print(f"\nSaved to {OUTPUT_DIR}/")