#!/opt/homebrew/bin/python3.11
"""
Test Florence-2 for "Stamps" Detection

Florence-2 is superior to OWL-ViT for small objects and detailed description.

The script grabs a single frame from a video at a fixed timestamp, saves it,
and runs two Florence-2 prompts against it: open-vocabulary detection for
"stamp" and a detailed caption. Results are printed to stdout.
"""

import os

import cv2
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

UUID = "384b0ff44aaaa1f1"
VIDEO_PATH = f"output/{UUID}/{UUID}.mp4"
OUTPUT_DIR = f"output/{UUID}/florence2_results"

# Frame where "stamp" is heavily discussed
TIMESTAMP = 6846.0

MODEL_ID = "microsoft/Florence-2-base"

# Florence-2 selects its task from a special token at the start of the prompt,
# and post_process_generation() needs the same token to pick the right parser.
# NOTE(review): these tokens were empty strings before this change (they look
# like they were stripped as markup). The values follow the Florence-2 model
# card; confirm whether "<DETAILED_CAPTION>" or "<MORE_DETAILED_CAPTION>" was
# the intended caption task.
DETECTION_TASK = "<OPEN_VOCABULARY_DETECTION>"
CAPTION_TASK = "<DETAILED_CAPTION>"
DETECTION_QUERY = "stamp"


def _extract_frame(video_path: str, timestamp_s: float):
    """Return the BGR frame at *timestamp_s* seconds, or None on failure."""
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return None
        cap.set(cv2.CAP_PROP_POS_MSEC, timestamp_s * 1000)
        ret, frame = cap.read()
    finally:
        # Always release the capture handle, even if seeking/reading raises.
        cap.release()
    return frame if ret else None


def _generate_text(
    processor,
    model,
    image: Image.Image,
    prompt: str,
    *,
    skip_special_tokens: bool,
    **generate_kwargs,
) -> str:
    """Run one Florence-2 prompt on *image* and return the decoded output.

    Extra keyword arguments (e.g. ``num_beams``) are forwarded to
    ``model.generate``.
    """
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        **generate_kwargs,
    )
    return processor.batch_decode(
        generated_ids, skip_special_tokens=skip_special_tokens
    )[0]


def main() -> None:
    """Extract the frame, load Florence-2, and print both test results.

    Raises:
        SystemExit: with status 1 if the frame cannot be read or the model
            cannot be loaded.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print(f"📽️ Extracting frame at {TIMESTAMP}s...")
    frame = _extract_frame(VIDEO_PATH, TIMESTAMP)
    if frame is None:
        print("❌ Failed to read frame.")
        # Non-zero status so callers/shells can see the failure.
        raise SystemExit(1)

    # Save raw frame
    raw_path = os.path.join(OUTPUT_DIR, f"raw_{int(TIMESTAMP)}.jpg")
    if cv2.imwrite(raw_path, frame):
        print("💾 Raw frame saved.")
    else:
        # imwrite signals failure through its return value, not an exception.
        print(f"⚠️ Could not save raw frame to {raw_path}.")

    # OpenCV decodes to BGR; PIL and the processor expect RGB.
    image_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    print("🧠 Loading Florence-2 model (this may take a moment)...")
    try:
        processor = AutoProcessor.from_pretrained(
            MODEL_ID, trust_remote_code=True
        )
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, trust_remote_code=True
        )
    except Exception as e:  # top-level boundary: report and stop
        print(f"❌ Error loading model: {e}")
        raise SystemExit(1) from e

    # Test 1: Open Vocabulary Detection
    print("🔍 Testing Open Vocabulary Detection for 'stamp'...")
    generated_text = _generate_text(
        processor,
        model,
        image_pil,
        DETECTION_TASK + DETECTION_QUERY,
        # Special tokens are kept because the post-processor parses the
        # location tokens out of the raw generation.
        skip_special_tokens=False,
        num_beams=3,
    )
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=DETECTION_TASK,
        image_size=(image_pil.width, image_pil.height),
    )
    print(f"📝 Florence-2 Result: {parsed_answer}")

    # Test 2: Detailed Caption (To see if it notices the stamp in context)
    print("📝 Testing Detailed Caption...")
    caption = _generate_text(
        processor,
        model,
        image_pil,
        CAPTION_TASK,
        skip_special_tokens=True,
    )
    print(f"📝 Caption: {caption}")

    print("🏁 Done.")


if __name__ == "__main__":
    main()