#!/opt/homebrew/bin/python3.11
"""Test Grounding DINO Large with COMBINED prompts — one inference per frame.

For every entry in TIMEPOINTS the script seeks the video to that second, runs
a single zero-shot detection pass with COMBINED_PROMPT, prints which of the
EXPECTED_OBJECTS occur in the detected labels, and writes an annotated JPEG
into OUTPUT_DIR.
"""
import json
import os
import time

import cv2
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

MODEL_PATH = "/Users/accusys/momentry_core_0.1/models/gun/grounding-dino-large-hf"
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
OUTPUT_DIR = "/Users/accusys/momentry/output_dev/zero_shot_objects"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# (second offset into the video, label printed next to the result line)
TIMEPOINTS = [
    (429, "stamp"),
    (691, "stamp_letter"),
    (762, "passport"),
    (3491, "passport"),
    (5054, "passport"),
    (5434, "letter"),
    (5443, "stamp_envelope"),
    (5467, "envelope"),
    (5500, "stamp"),
    (5506, "stamp"),
    (5783, "letter"),
    (5786, "envelope"),
]

# All phrases are sent in one text query so each frame needs one forward pass.
COMBINED_PROMPT = "stamp. postage stamp. envelope. passport. identification. letter."

# Substrings searched for in the (lower-cased) detection labels.
EXPECTED_OBJECTS = ("stamp", "envelope", "passport", "letter")

BOX_COLOR = (0, 255, 0)  # BGR, used for both rectangles and captions
MIN_TEXT_Y = 12  # keeps the caption baseline inside the frame for boxes at the top edge


def _label_names(dets):
    """Return one label string per detection in *dets*.

    "text_labels" is preferred when present: recent transformers releases
    deprecate the string content of "labels" in favour of it (see the
    Grounding DINO processor documentation). Falls back to "labels", then to
    the placeholder "object" when neither key exists.
    """
    for key in ("text_labels", "labels"):
        if key in dets:
            return [str(name) for name in dets[key]]
    return ["object"] * len(dets["boxes"])


def _build_detections(dets):
    """Convert post-processed model output into a list of plain dicts.

    Each dict has "bbox" (four coordinates rounded to 0.1), "score" (rounded
    to 3 decimals) and "label" (string).
    """
    boxes = dets["boxes"].tolist()
    scores = dets["scores"].tolist()
    return [
        {
            "bbox": [round(v, 1) for v in box],
            "score": round(score, 3),
            "label": name,
        }
        for box, score, name in zip(boxes, scores, _label_names(dets))
    ]


def _found_objects(det_list):
    """Return the set of EXPECTED_OBJECTS that occur in any detection label."""
    found = set()
    for det in det_list:
        lowered = det["label"].lower()
        found.update(obj for obj in EXPECTED_OBJECTS if obj in lowered)
    return found


def _annotate(frame, det_list):
    """Draw every detection's box and "label score" caption onto *frame* in place."""
    for det in det_list:
        x1, y1, x2, y2 = (int(v) for v in det["bbox"])
        cv2.rectangle(frame, (x1, y1), (x2, y2), BOX_COLOR, 2)
        cv2.putText(
            frame,
            f"{det['label']} {det['score']:.2f}",
            # Clamp so the caption is not drawn above the top of the image.
            (x1, max(y1 - 5, MIN_TEXT_Y)),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            BOX_COLOR,
            2,
        )


print("Loading Large model...")
t0 = time.time()
processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = AutoModelForZeroShotObjectDetection.from_pretrained(MODEL_PATH)
device = "mps" if torch.backends.mps.is_available() else "cpu"
model.to(device)
print(f"Loaded in {time.time()-t0:.1f}s")

cap = cv2.VideoCapture(VIDEO)
if not cap.isOpened():
    # Without this check every read fails and the run "succeeds" with no output.
    cap.release()
    raise SystemExit(f"Cannot open video: {VIDEO}")

try:
    # Some containers report 0 fps; fall back to 25 so seeking still works.
    fps = cap.get(cv2.CAP_PROP_FPS) or 25.0

    print(f"\nTesting {len(TIMEPOINTS)} timepoints with combined prompt...")
    t_infer = time.time()

    for t_sec, label in TIMEPOINTS:
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * fps))
        ret, frame = cap.read()
        if not ret or frame is None:
            print(f"  {t_sec//60}:{t_sec%60:02d} {label:20s} | frame not readable, skipped")
            continue

        # OpenCV decodes to BGR; the processor is given an RGB PIL image.
        img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # ONE inference with ALL prompts
        inputs = processor(images=img, text=COMBINED_PROMPT, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)

        # PIL size is (width, height); target_sizes is given as (height, width).
        target = torch.tensor([img.size[::-1]])
        dets = processor.post_process_grounded_object_detection(
            outputs, threshold=0.1, target_sizes=target
        )[0]

        det_list = _build_detections(dets)

        # Classify which expected objects were found
        found = _found_objects(det_list)
        found_str = ", ".join(sorted(found)) if found else "none"
        print(f"  {t_sec//60}:{t_sec%60:02d} {label:20s} | {len(det_list)} dets | found: [{found_str}]")

        # Save annotated frame
        _annotate(frame, det_list)
        out_path = os.path.join(OUTPUT_DIR, f"combined_{t_sec}s.jpg")
        if not cv2.imwrite(out_path, frame, [cv2.IMWRITE_JPEG_QUALITY, 85]):
            print(f"  warning: could not write {out_path}")
finally:
    # Release the capture even when inference or drawing raises.
    cap.release()

print(f"\nDone in {time.time()-t_infer:.0f}s")
print(f"Screenshots: {OUTPUT_DIR}/")