Files
momentry_core/scripts/deep_analysis_112_36.py
Warren 8f05a7c188 feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00

162 lines
5.4 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
Deep Analysis of 112:36 Frame
1. Detailed Captioning
2. Search for "Envelope" and "Hand holding object"
"""
import os
import cv2
import torch
import types
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
# Identifier used as the per-run sub-directory name under ``output/``.
UUID = "384b0ff44aaaa1f1"
# Directory that holds the Florence-2 result images for that identifier.
BASE_DIR = "output/" + UUID + "/florence2_results"
# File name of the frame being analysed (annotated by the author as 112:36).
IMG_NAME = "scan_6756.jpg"
# Full path of the frame, relative to the current working directory.
IMG_PATH = os.path.join(BASE_DIR, IMG_NAME)
# Compatibility shim for generation with an empty / placeholder KV cache.
def patch_model(model):
    """Wrap ``model.language_model.prepare_inputs_for_generation`` in place.

    The wrapper inspects ``past_key_values`` before delegating:

    * If it is a non-empty list/tuple whose first element is not ``None`` and
      is not an empty list/tuple, the call is forwarded unchanged to the
      method that was installed before patching.
    * Otherwise (``None``, an empty sequence, a placeholder first layer, or
      any object that is not a list/tuple) a minimal input dict is returned
      with ``past_key_values`` reset to ``None`` and ``use_cache`` set to
      ``True``.

    Args:
        model: Object exposing a ``language_model`` attribute that has a
            ``prepare_inputs_for_generation`` method.

    Returns:
        None. The language model is modified in place.
    """
    language_model = model.language_model
    delegate = language_model.prepare_inputs_for_generation

    def _cache_is_usable(cache):
        # Only list/tuple caches with at least one entry are considered.
        if not isinstance(cache, (list, tuple)) or len(cache) == 0:
            return False
        head = cache[0]
        if head is None:
            return False
        # A first layer that is itself a sequence must not be empty.
        if isinstance(head, (list, tuple)):
            return len(head) > 0
        return True

    def patched_prepare(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        **kwargs,
    ):
        if _cache_is_usable(past_key_values):
            # ``delegate`` was captured via attribute access, so it is called
            # without passing ``self`` again.
            return delegate(
                input_ids,
                past_key_values=past_key_values,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                **kwargs,
            )
        # NOTE(review): this branch drops ``inputs_embeds`` and every extra
        # keyword argument -- confirm that is intended for this model.
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "past_key_values": None,
            "use_cache": True,
        }

    language_model.prepare_inputs_for_generation = types.MethodType(
        patched_prepare, language_model
    )
def _run_florence_task(
    processor, model, image, task, text_input="", skip_special_tokens=False
):
    """Run one Florence-2 task on ``image`` and return the decoded text.

    Args:
        processor: Processor used to build the model inputs and decode ids.
        model: Model whose ``generate`` method is called.
        image: PIL image handed to the processor.
        task: Task token, e.g. ``"<DETAILED_CAPTION>"``.
        text_input: Text appended directly after the task token (for
            open-vocabulary detection this is the phrase to look for).
        skip_special_tokens: Forwarded to ``processor.batch_decode``.

    Returns:
        The first decoded sequence.
    """
    inputs = processor(text=task + text_input, images=image, return_tensors="pt")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
    )
    return processor.batch_decode(
        generated_ids, skip_special_tokens=skip_special_tokens
    )[0]


# ---------------------------------------------------------------------------
# Script body: caption the frame, then look for each search term with
# open-vocabulary detection, saving crops and one annotated overview image.
# ---------------------------------------------------------------------------
print(f"📷 Loading image: {IMG_PATH}")
if not os.path.exists(IMG_PATH):
    print("❌ Image not found.")
    # Exit with a non-zero status so a caller can tell nothing was analysed.
    raise SystemExit(1)

image = Image.open(IMG_PATH).convert("RGB")

print("🧠 Loading Florence-2 model...")
try:
    processor = AutoProcessor.from_pretrained(
        "microsoft/Florence-2-base", trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Florence-2-base", trust_remote_code=True, attn_implementation="eager"
    )
    patch_model(model)

    # 1. Detailed Caption
    print("\n📝 Generating Detailed Caption...")
    caption_text = _run_florence_task(
        processor, model, image, "<DETAILED_CAPTION>", skip_special_tokens=True
    )
    print(f"🗣️ Caption: {caption_text}")

    # 2. Object Detection for specific items
    ovd_task = "<OPEN_VOCABULARY_DETECTION>"
    search_terms = ["envelope", "letter", "hand holding paper", "stamp", "small paper"]

    img_cv = cv2.imread(IMG_PATH)
    if img_cv is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise RuntimeError(f"OpenCV could not read image: {IMG_PATH}")
    # Crops are cut from an untouched copy so they never contain rectangles
    # or labels drawn for earlier detections; ``img_cv`` is the canvas.
    pristine = img_cv.copy()
    img_h, img_w = img_cv.shape[:2]

    for term in search_terms:
        print(f"\n🔍 Detecting '{term}'...")
        # The phrase to detect must follow the task token in the prompt;
        # the task token alone gives the model nothing to search for.
        raw_text = _run_florence_task(processor, model, image, ovd_task, term)
        try:
            parsed_answer = processor.post_process_generation(
                raw_text, task=ovd_task, image_size=(image.width, image.height)
            )
            results = parsed_answer.get(ovd_task, {})
            bboxes = results.get("bboxes", [])
            labels = results.get("bboxes_labels", [])
            if not bboxes:
                print(" ❌ Not found.")
                continue

            print(f" ✅ Found '{term}': {labels}")
            for i, (box, label) in enumerate(zip(bboxes, labels)):
                label_lc = label.lower()
                matches = term.lower() in label_lc or (
                    term == "envelope" and "paper" in label_lc
                )
                if not matches:
                    continue

                x1, y1, x2, y2 = map(int, box)
                print(f" 📍 Box: ({x1},{y1}) -> ({x2},{y2})")

                # Clamp to the image: negative indices would wrap around in
                # NumPy slicing, and an empty crop makes cv2.imwrite fail.
                cx1, cx2 = max(0, x1), min(img_w, x2)
                cy1, cy2 = max(0, y1), min(img_h, y2)
                if cx2 <= cx1 or cy2 <= cy1:
                    print(" ⚠️ Skipping empty box.")
                    continue

                # Crop
                crop = pristine[cy1:cy2, cx1:cx2]
                crop_path = os.path.join(
                    BASE_DIR, f"crop_deep_{term.replace(' ', '_')}_{i}.jpg"
                )
                cv2.imwrite(crop_path, crop)

                # Draw
                cv2.rectangle(img_cv, (x1, y1), (x2, y2), (0, 255, 0), 3)
                cv2.putText(
                    img_cv,
                    label,
                    # Keep the label inside the frame for boxes near the top.
                    (x1, max(y1 - 10, 25)),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    1,
                    (0, 255, 0),
                    2,
                )
        except Exception as e:
            # Best effort per term: report and move on to the next one.
            print(f" ⚠️ Error: {e}")

    res_path = os.path.join(BASE_DIR, "deep_analysis_result.jpg")
    cv2.imwrite(res_path, img_cv)
    print(f"\n🎨 Result saved to {res_path}")
except Exception as e:
    # Top-level boundary of the script: report the failure instead of a traceback.
    print(f"❌ Error: {e}")