#!/opt/homebrew/bin/python3.11
"""
Test Grounding DINO Large on stamps, envelopes, passports, letters.

For every entry in TESTS the script seeks to the given second of VIDEO, runs
zero-shot object detection once per text prompt, writes an annotated JPEG for
each prompt that produced at least one detection, and finally dumps all
detections to ``results.json`` inside OUTPUT_DIR.
"""
import json
import os
import sys
import time

import cv2
import torch
from PIL import Image
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor

MODEL_PATH = "/Users/accusys/momentry_core_0.1/models/gun/grounding-dino-large-hf"
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
OUTPUT_DIR = "/Users/accusys/momentry/output_dev/zero_shot_objects"

# Score threshold handed to the processor's post-processing step.
SCORE_THRESHOLD = 0.1

# Timepoints per object type
TESTS = [
    # (label, time_sec, prompts)
    ("stamp_001", 429, ["stamp", "postage stamp"]),
    ("stamp_002", 691, ["stamp", "envelope", "letter"]),
    ("stamp_003", 5443, ["stamp", "envelope"]),
    ("stamp_004", 5500, ["stamp"]),
    ("stamp_005", 5506, ["stamp"]),
    ("envelope_001", 5443, ["envelope"]),
    ("envelope_002", 5467, ["envelope"]),
    ("envelope_003", 5786, ["envelope"]),
    ("passport_001", 762, ["passport", "identification"]),
    ("passport_002", 3491, ["passport", "identification"]),
    ("passport_003", 5054, ["passport"]),
    ("letter_001", 691, ["letter", "envelope"]),
    ("letter_002", 5434, ["letter", "envelope"]),
    ("letter_003", 5783, ["letter", "stamp"]),
]


def _detect(processor, model, device, img, prompt, target_sizes):
    """Run one text prompt against one frame.

    Returns a list of ``{"bbox": [x1, y1, x2, y2], "score": float}`` dicts,
    with box coordinates rounded to 1 decimal and scores to 3 decimals.
    """
    # The prompt is sent to the processor with a trailing period appended.
    inputs = processor(images=img, text=f"{prompt}.", return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    dets = processor.post_process_grounded_object_detection(
        outputs, threshold=SCORE_THRESHOLD, target_sizes=target_sizes
    )[0]
    return [
        {
            "bbox": [round(v, 1) for v in box.tolist()],
            "score": round(score.item(), 3),
        }
        for box, score in zip(dets["boxes"], dets["scores"])
    ]


def _save_annotated(frame, det_list, label, t_sec, prompt):
    """Draw every detection on a copy of *frame* and write it as a JPEG."""
    annotated = frame.copy()
    for det in det_list:
        x1, y1, x2, y2 = (int(v) for v in det["bbox"])
        cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(
            annotated,
            f"{prompt} {det['score']:.2f}",
            (x1, y1 - 5),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 255, 0),
            2,
        )
    cv2.imwrite(
        os.path.join(OUTPUT_DIR, f"{label}_{t_sec}s_{prompt}.jpg"),
        annotated,
        [cv2.IMWRITE_JPEG_QUALITY, 85],
    )


def _print_summary(results, elapsed):
    """Print one line per tested frame with its best-scoring prompt, if any."""
    print(f"\n{'='*60}")
    print(f"Results ({elapsed:.0f}s)")
    print(f"{'='*60}")
    for key, data in sorted(results.items()):
        scored = [
            (prompt, det["score"])
            for prompt, dets in data["prompts"].items()
            for det in dets
        ]
        if scored:
            best = max(scored, key=lambda x: x[1])
            print(f" {data['time_str']} {key:20s} ✅ {best[1]:.3f} ({best[0]})")
        else:
            print(f" {data['time_str']} {key:20s} ❌ none")


def main():
    """Load the model, run every test in TESTS and write all outputs."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("Loading Large model...")
    t0 = time.time()
    processor = AutoProcessor.from_pretrained(MODEL_PATH)
    model = AutoModelForZeroShotObjectDetection.from_pretrained(MODEL_PATH)
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    model.to(device)
    print(f"Loaded in {time.time()-t0:.1f}s, device={device}")

    cap = cv2.VideoCapture(VIDEO)
    if not cap.isOpened():
        # Without this check every read fails and the run silently produces
        # an empty results file.
        sys.exit(f"ERROR: cannot open video: {VIDEO}")

    results = {}
    t_infer = time.time()
    try:
        # Fall back to 25 fps when the capture reports 0 for the frame rate.
        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        for label, t_sec, prompts in TESTS:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * fps))
            ret, frame = cap.read()
            if not ret or frame is None:
                print(
                    f"WARNING: could not read frame at {t_sec}s for {label}; skipping",
                    file=sys.stderr,
                )
                continue

            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            # PIL reports (width, height); reversed here to (height, width).
            # Identical for every prompt on this frame, so built once.
            target_sizes = torch.tensor([img.size[::-1]])

            key = f"{label}_{t_sec}s"
            results[key] = {
                "time": t_sec,
                "time_str": f"{t_sec//60}:{t_sec%60:02d}",  # minutes:seconds
                "prompts": {},
            }

            for prompt in prompts:
                det_list = _detect(processor, model, device, img, prompt, target_sizes)
                results[key]["prompts"][prompt] = det_list

                # Save annotated frame
                if det_list:
                    _save_annotated(frame, det_list, label, t_sec, prompt)
    finally:
        # Release the capture even if inference raises part-way through.
        cap.release()
    elapsed = time.time() - t_infer

    # Summary
    _print_summary(results, elapsed)

    # Use a context manager so the file is flushed and closed deterministically.
    with open(os.path.join(OUTPUT_DIR, "results.json"), "w", encoding="utf-8") as fh:
        json.dump(results, fh, indent=2)
    print(f"\nScreenshots saved to {OUTPUT_DIR}/")


if __name__ == "__main__":
    main()