feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
This commit is contained in:
Warren
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions

View File

@@ -0,0 +1,161 @@
#!/opt/homebrew/bin/python3.11
"""
Deep Analysis of 112:36 Frame
1. Detailed Captioning
2. Search for "Envelope" and "Hand holding object"
"""
import os
import cv2
import torch
import types
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
# Identifier used as the per-run sub-directory name under "output/".
UUID = "384b0ff44aaaa1f1"
# Directory the input frame is read from and all results are written to.
BASE_DIR = "output/{}/florence2_results".format(UUID)
# File name of the frame to analyse (the frame the module docstring calls 112:36).
IMG_NAME = "scan_6756.jpg"
# Full path of the frame inside BASE_DIR.
IMG_PATH = os.path.join(BASE_DIR, IMG_NAME)
# Patch for compatibility
def patch_model(model):
    """Replace ``prepare_inputs_for_generation`` on ``model.language_model``.

    The replacement inspects ``past_key_values`` before every generation
    step:

    * If it is a non-empty list/tuple whose first entry is neither ``None``
      nor an empty list/tuple, the call is forwarded unchanged to the
      original method.
    * Otherwise (``None``, an empty sequence, an empty first layer, or any
      object that is not a list/tuple) a minimal input dict is returned
      with ``past_key_values`` set to ``None`` and ``use_cache`` set to
      ``True``. ``inputs_embeds`` and any extra keyword arguments are not
      included in that dict.

    The model is modified in place; nothing is returned.
    """
    language_model = model.language_model
    # Captured as a bound method, so it is later called without ``self``.
    delegate = language_model.prepare_inputs_for_generation

    def _cache_is_usable(cache):
        """Return True only for a populated list/tuple style cache."""
        if cache is None:
            return False
        if not isinstance(cache, (list, tuple)) or len(cache) == 0:
            return False
        head = cache[0]
        if head is None:
            return False
        # A first layer that is itself a sequence must contain something.
        return not isinstance(head, (list, tuple)) or len(head) > 0

    def patched_prepare(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        **kwargs,
    ):
        if _cache_is_usable(past_key_values):
            return delegate(
                input_ids,
                past_key_values=past_key_values,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                **kwargs,
            )
        # No usable cache: hand back only the basic inputs with the cache
        # explicitly cleared.
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "past_key_values": None,
            "use_cache": True,
        }

    language_model.prepare_inputs_for_generation = types.MethodType(
        patched_prepare, language_model
    )
def _run_florence_task(processor, model, image, prompt, *, skip_special_tokens):
    """Run one Florence-2 prompt against ``image`` and return the decoded text.

    Args:
        processor: The Florence-2 processor used to build inputs and decode.
        model: The (patched) Florence-2 model used for generation.
        image: The RGB PIL image to analyse.
        prompt: Full text prompt, i.e. the task token optionally followed by
            the task's text input.
        skip_special_tokens: Forwarded to ``processor.batch_decode``; must be
            ``False`` when the output is fed to ``post_process_generation``.

    Returns:
        The first decoded sequence as a string.
    """
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
    )
    return processor.batch_decode(
        generated_ids, skip_special_tokens=skip_special_tokens
    )[0]


def _clamp_box(box, width, height):
    """Return ``box`` as integer ``(x1, y1, x2, y2)`` clamped to the image.

    Clamping matters because a negative coordinate used as a NumPy slice
    bound wraps around to the other side of the image instead of failing.
    """
    x1, y1, x2, y2 = map(int, box)
    x1 = min(max(x1, 0), width)
    x2 = min(max(x2, 0), width)
    y1 = min(max(y1, 0), height)
    y2 = min(max(y2, 0), height)
    return x1, y1, x2, y2


print(f"📷 Loading image: {IMG_PATH}")
if not os.path.exists(IMG_PATH):
    print("❌ Image not found.")
    # ``exit()`` is injected by the ``site`` module for interactive use and is
    # not guaranteed to exist; raise SystemExit directly and report failure
    # with a non-zero status.
    raise SystemExit(1)
image = Image.open(IMG_PATH).convert("RGB")
print("🧠 Loading Florence-2 model...")
try:
    processor = AutoProcessor.from_pretrained(
        "microsoft/Florence-2-base", trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Florence-2-base", trust_remote_code=True, attn_implementation="eager"
    )
    patch_model(model)

    # 1. Detailed Caption
    print("\n📝 Generating Detailed Caption...")
    generated_text = _run_florence_task(
        processor, model, image, "<DETAILED_CAPTION>", skip_special_tokens=True
    )
    print(f"🗣️ Caption: {generated_text}")

    # 2. Object Detection for specific items
    search_terms = ["envelope", "letter", "hand holding paper", "stamp", "small paper"]
    img_clean = cv2.imread(IMG_PATH)
    if img_clean is None:
        # cv2.imread signals an unreadable file by returning None, not raising.
        raise RuntimeError(f"OpenCV could not read image: {IMG_PATH}")
    # Draw on a copy so crops are always cut from unannotated pixels.
    img_cv = img_clean.copy()
    img_h, img_w = img_clean.shape[:2]
    prompt_ovd = "<OPEN_VOCABULARY_DETECTION>"
    for term in search_terms:
        print(f"\n🔍 Detecting '{term}'...")
        # Open-vocabulary detection takes the phrase to look for as text input
        # appended to the task token; without it every iteration would run
        # the same query regardless of ``term``.
        generated_text = _run_florence_task(
            processor, model, image, prompt_ovd + term, skip_special_tokens=False
        )
        try:
            # Post-processing is keyed by the bare task token, not the full prompt.
            parsed_answer = processor.post_process_generation(
                generated_text, task=prompt_ovd, image_size=(image.width, image.height)
            )
            results = parsed_answer.get(prompt_ovd, {})
            bboxes = results.get("bboxes", [])
            labels = results.get("bboxes_labels", [])
            if not bboxes:
                print(" ❌ Not found.")
                continue
            print(f" ✅ Found '{term}': {labels}")
            for i, (box, label) in enumerate(zip(bboxes, labels)):
                label_lower = label.lower()
                matches = term.lower() in label_lower or (
                    term == "envelope" and "paper" in label_lower
                )
                if not matches:
                    continue
                x1, y1, x2, y2 = _clamp_box(box, img_w, img_h)
                print(f" 📍 Box: ({x1},{y1}) -> ({x2},{y2})")
                if x2 <= x1 or y2 <= y1:
                    # An empty crop would make cv2.imwrite raise and abort the
                    # remaining boxes for this term.
                    print(" ⚠️ Skipping empty box.")
                    continue
                # Crop
                crop = img_clean[y1:y2, x1:x2]
                crop_path = os.path.join(
                    BASE_DIR, f"crop_deep_{term.replace(' ', '_')}_{i}.jpg"
                )
                cv2.imwrite(crop_path, crop)
                # Draw
                cv2.rectangle(img_cv, (x1, y1), (x2, y2), (0, 255, 0), 3)
                cv2.putText(
                    img_cv,
                    label,
                    (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    1,
                    (0, 255, 0),
                    2,
                )
        except Exception as e:
            # Best effort per term: report and move on to the next one.
            print(f" ⚠️ Error: {e}")
    res_path = os.path.join(BASE_DIR, "deep_analysis_result.jpg")
    if not cv2.imwrite(res_path, img_cv):
        # cv2.imwrite returns False on failure instead of raising.
        raise RuntimeError(f"Could not write result image: {res_path}")
    print(f"\n🎨 Result saved to {res_path}")
except Exception as e:
    # Top-level boundary of the script: report the failure instead of a traceback.
    print(f"❌ Error: {e}")