# momentry_core/scripts/deep_analysis_112_36.py

#!/opt/homebrew/bin/python3.11
"""
Deep Analysis of 112:36 Frame
1. Detailed Captioning
2. Search for "Envelope" and "Hand holding object"
"""

import os
import cv2
import torch
import types
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

# Identifier that names this run's directory under ``output/``.
UUID = "384b0ff44aaaa1f1"
# File name of the frame under analysis.
IMG_NAME = "scan_6756.jpg"  # 112:36
# Directory holding the input frame; crops and the annotated result are
# written here as well.
BASE_DIR = "/".join(("output", UUID, "florence2_results"))
IMG_PATH = os.path.join(BASE_DIR, IMG_NAME)


# Patch for compatibility
def patch_model(model):
    """Replace ``model.language_model.prepare_inputs_for_generation`` in place.

    The replacement inspects ``past_key_values`` before delegating:

    * If it is a non-empty list/tuple whose first entry is not ``None`` and
      is not an empty list/tuple, the call is forwarded unchanged to the
      original ``prepare_inputs_for_generation``.
    * Otherwise (``None``, empty, or any other type) a minimal input dict is
      returned with ``past_key_values`` reset to ``None`` and ``use_cache``
      set to ``True``; ``inputs_embeds`` and extra keyword arguments are not
      included in that dict.

    Args:
        model: Object exposing a ``language_model`` attribute that has a
            ``prepare_inputs_for_generation`` method.

    Returns:
        None. The language model is modified in place.
    """
    language_model = model.language_model
    # Capture the bound original so the wrapper can still delegate to it.
    wrapped_prepare = language_model.prepare_inputs_for_generation

    def _has_usable_cache(cache):
        """Return True when *cache* is a populated list/tuple cache."""
        if cache is None:
            return False
        if not isinstance(cache, (list, tuple)):
            return False
        if len(cache) == 0:
            return False
        head = cache[0]
        if head is None:
            return False
        if isinstance(head, (list, tuple)):
            # A sequence-typed first layer only counts when it has content.
            return len(head) > 0
        return True

    def patched_prepare(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        **kwargs,
    ):
        if _has_usable_cache(past_key_values):
            return wrapped_prepare(
                input_ids,
                past_key_values=past_key_values,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                **kwargs,
            )

        # No usable cache: hand back a fresh, cache-less set of inputs.
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "past_key_values": None,
            "use_cache": True,
        }

    language_model.prepare_inputs_for_generation = types.MethodType(
        patched_prepare, language_model
    )


# Fail fast when the frame is missing: nothing below can run without it.
print(f"📷 Loading image: {IMG_PATH}")
# isfile (not exists) so that a directory at this path is rejected here
# instead of failing later inside Image.open.
if not os.path.isfile(IMG_PATH):
    print("❌ Image not found.")
    # ``exit()`` is a helper injected by the ``site`` module for interactive
    # sessions and would report status 0; raise SystemExit with a non-zero
    # status so callers can detect the failure.
    raise SystemExit(1)

# ``convert`` returns an independent RGB copy, so the source file handle can
# be released as soon as the conversion is done.
with Image.open(IMG_PATH) as _src_image:
    image = _src_image.convert("RGB")

# Load Florence-2, caption the frame, then run open-vocabulary detection for
# each search term, saving per-detection crops and one annotated overview.
print("🧠 Loading Florence-2 model...")
try:
    processor = AutoProcessor.from_pretrained(
        "microsoft/Florence-2-base", trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Florence-2-base", trust_remote_code=True, attn_implementation="eager"
    )
    # Install the generation-input compatibility shim before any generate() call.
    patch_model(model)

    # 1. Detailed Caption
    print("\n📝 Generating Detailed Caption...")
    prompt = "<DETAILED_CAPTION>"
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(f"🗣️ Caption: {generated_text}")

    # 2. Object Detection for specific items
    search_terms = ["envelope", "letter", "hand holding paper", "stamp", "small paper"]
    img_cv = cv2.imread(IMG_PATH)
    if img_cv is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise RuntimeError(f"OpenCV could not decode image: {IMG_PATH}")
    # Crops are cut from an untouched copy so they never contain rectangles
    # or labels drawn for earlier detections; img_cv is the annotated canvas.
    img_clean = img_cv.copy()
    img_h, img_w = img_clean.shape[:2]
    task_ovd = "<OPEN_VOCABULARY_DETECTION>"

    for term in search_terms:
        print(f"\n🔍 Detecting '{term}'...")
        # Florence-2 takes the query text appended to the task token. Sending
        # the bare token (as before) ran the same query-less detection on
        # every iteration and ignored the search term.
        prompt_ovd = task_ovd + term

        inputs = processor(text=prompt_ovd, images=image, return_tensors="pt")
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            num_beams=3,
        )
        # Special tokens are kept: post-processing parses the location tokens.
        generated_text = processor.batch_decode(
            generated_ids, skip_special_tokens=False
        )[0]

        # Best effort per term: a parsing failure for one term must not stop
        # the remaining searches.
        try:
            # Post-processing is keyed by the task token alone, not the full prompt.
            parsed_answer = processor.post_process_generation(
                generated_text, task=task_ovd, image_size=(image.width, image.height)
            )
            results = parsed_answer.get(task_ovd, {})
            bboxes = results.get("bboxes", [])
            labels = results.get("bboxes_labels", [])

            if bboxes:
                print(f"   ✅ Found '{term}': {labels}")
                for i, (box, label) in enumerate(zip(bboxes, labels)):
                    if term.lower() in label.lower() or (
                        term == "envelope" and "paper" in label.lower()
                    ):
                        x1, y1, x2, y2 = map(int, box)
                        print(f"      📍 Box: ({x1},{y1}) -> ({x2},{y2})")

                        # Crop: clamp to the image so negative coordinates do
                        # not wrap around in the NumPy slice, and skip empty
                        # regions, which cv2.imwrite refuses to encode.
                        cx1, cy1 = max(x1, 0), max(y1, 0)
                        cx2, cy2 = min(x2, img_w), min(y2, img_h)
                        if cx2 > cx1 and cy2 > cy1:
                            crop = img_clean[cy1:cy2, cx1:cx2]
                            crop_path = os.path.join(
                                BASE_DIR, f"crop_deep_{term.replace(' ', '_')}_{i}.jpg"
                            )
                            cv2.imwrite(crop_path, crop)
                        else:
                            print("      ⚠️ Empty box, crop skipped.")

                        # Draw
                        cv2.rectangle(img_cv, (x1, y1), (x2, y2), (0, 255, 0), 3)
                        cv2.putText(
                            img_cv,
                            label,
                            (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            1,
                            (0, 255, 0),
                            2,
                        )
            else:
                print("   ❌ Not found.")
        except Exception as e:
            print(f"   ⚠️ Error: {e}")

    res_path = os.path.join(BASE_DIR, "deep_analysis_result.jpg")
    # cv2.imwrite reports failure through its return value; only announce
    # success when the file was actually written.
    if cv2.imwrite(res_path, img_cv):
        print(f"\n🎨 Result saved to {res_path}")
    else:
        print(f"\n❌ Failed to write {res_path}")

except Exception as e:
    # Top-level boundary of the script: report the failure instead of a traceback.
    print(f"❌ Error: {e}")