#!/opt/homebrew/bin/python3.11
"""
Search for "vase" in the video using OWL-ViT on a subset of frames.

Every ``frame_*.jpg`` under ``BASE_DIR`` is queried once per entry in
``SEARCH_TERMS``.  For each detection scoring above ``SAVE_THRESHOLD`` the
script writes a crop of the detected region to ``RESULTS_DIR``; each frame
with at least one such detection also gets a single annotated copy showing
all of its detections.
"""

import glob
import os

import cv2
import torch
from PIL import Image
from transformers import OwlViTForObjectDetection, OwlViTProcessor

BASE_DIR = "output/384b0ff44aaaa1f1/full_video_scans"
RESULTS_DIR = "output/384b0ff44aaaa1f1/vase_search_results"

MODEL_NAME = "google/owlvit-base-patch32"

# Search terms
SEARCH_TERMS = ["vase", "flower vase", "urn", "pottery", "glass jar"]

# Score cut-off handed to the post-processor.
MODEL_THRESHOLD = 0.05
# Stricter cut-off a detection must exceed to be saved and counted.
SAVE_THRESHOLD = 0.08

BOX_COLOR = (0, 0, 255)  # red in OpenCV's BGR channel order


def _frame_label(frame_path):
    """Return the label embedded in a frame file name.

    ``frame_12s.jpg`` yields ``"12"``.  A name without the trailing ``s``
    (``frame_12.jpg``) yields ``"12"`` as well, rather than leaking the
    extension into the output file names.
    """
    stem = os.path.splitext(os.path.basename(frame_path))[0]
    return stem.removeprefix("frame_").removesuffix("s")


def _clamp_box(box, width, height):
    """Convert a float box to integer pixel coordinates inside the image.

    Predicted boxes may extend past the image border.  A negative value used
    as a NumPy slice index would wrap around and select the wrong region, so
    every coordinate is clamped to ``[0, width]`` / ``[0, height]``.
    """
    x1, y1, x2, y2 = (int(v) for v in box)
    x1 = min(max(x1, 0), width)
    x2 = min(max(x2, 0), width)
    y1 = min(max(y1, 0), height)
    y2 = min(max(y2, 0), height)
    return x1, y1, x2, y2


def _scan_frame(frame_path, processor, model):
    """Run every search term against one frame and save the results.

    Returns the number of detections above ``SAVE_THRESHOLD``.  Frames that
    cannot be decoded are reported and skipped (returning 0).
    """
    sec = _frame_label(frame_path)

    try:
        # Context manager closes the underlying file handle promptly.
        with Image.open(frame_path) as raw:
            image = raw.convert("RGB")
    except OSError as err:
        print(f"  ⚠️ Skipping unreadable frame {frame_path}: {err}")
        return 0

    # Decode once per frame.  Crops are taken from the untouched pixels while
    # boxes accumulate on a separate copy, so earlier annotations neither
    # bleed into later crops nor get lost when the annotated file is written.
    clean = cv2.imread(frame_path)
    if clean is None:
        print(f"  ⚠️ Skipping unreadable frame {frame_path}")
        return 0
    annotated = clean.copy()
    img_h, img_w = clean.shape[:2]

    target_sizes = torch.Tensor([[image.height, image.width]])
    hits = 0

    for term in SEARCH_TERMS:
        inputs = processor(text=[[term]], images=image, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)

        detections = processor.post_process_object_detection(
            outputs=outputs, target_sizes=target_sizes, threshold=MODEL_THRESHOLD
        )[0]

        for score, box in zip(detections["scores"], detections["boxes"]):
            s = float(score)
            if s <= SAVE_THRESHOLD:
                continue

            x1, y1, x2, y2 = _clamp_box(box.tolist(), img_w, img_h)

            crop = clean[y1:y2, x1:x2]
            if crop.size > 0:
                # NOTE: two detections of the same term with the same rounded
                # score in one frame still share a file name.
                crop_name = f"vase_{sec}s_{term.replace(' ', '_')}_{s:.2f}.jpg"
                cv2.imwrite(os.path.join(RESULTS_DIR, crop_name), crop)

            # Annotate
            cv2.rectangle(annotated, (x1, y1), (x2, y2), BOX_COLOR, 3)
            cv2.putText(
                annotated,
                f"{term} {s:.2f}",
                # Keep the label on-image for boxes touching the top edge.
                (x1, max(y1 - 10, 15)),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.7,
                BOX_COLOR,
                2,
            )

            print(f"  📍 {sec}s | {term} | {s:.2f}")
            hits += 1

    if hits:
        # Written once, after all boxes for this frame have been drawn.
        ann_name = f"annotated_{sec}s.jpg"
        cv2.imwrite(os.path.join(RESULTS_DIR, ann_name), annotated)

    return hits


def main():
    """Scan all frames in ``BASE_DIR`` and report how many candidates were found."""
    os.makedirs(RESULTS_DIR, exist_ok=True)

    print("🔍 Searching for vases...")

    # Load model
    processor = OwlViTProcessor.from_pretrained(MODEL_NAME)
    model = OwlViTForObjectDetection.from_pretrained(MODEL_NAME)
    model.eval()

    frames = sorted(glob.glob(os.path.join(BASE_DIR, "frame_*.jpg")))
    print(f"📸 Scanning {len(frames)} frames...")

    found_count = 0
    for frame_path in frames:
        found_count += _scan_frame(frame_path, processor, model)

    print(f"\n🏁 Done. Found {found_count} candidates.")


if __name__ == "__main__":
    main()