feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
--- a/scripts/florence2_scan_stamps.py
+++ b/scripts/florence2_scan_stamps.py
@@ -0,0 +1,104 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Use Florence-2 to scan video frames for "stamp" using open vocabulary detection
+"""
+
+import os
+import cv2
+import torch
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForCausalLM
+
+UUID = "384b0ff44aaaa1f1"
+VIDEO_PATH = f"output/{UUID}/{UUID}.mp4"
+OUTPUT_DIR = f"output/{UUID}/florence2_stamp_scan"
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+# Sample one frame every 300 s (5 minutes), from 0 s up to but excluding 6879 s.
+TIMESTAMPS = list(range(0, 6879, 300))
+
+# Florence-2 task token. It is prepended to the phrase to detect when building
+# the prompt, and it is also the key of the dict returned by post-processing.
+TASK = "<OPEN_VOCABULARY_DETECTION>"
+TARGET = "stamp"
+
+print("📽️ Loading Florence-2 model...")
+processor = AutoProcessor.from_pretrained(
+    "microsoft/Florence-2-base", trust_remote_code=True
+)
+model = AutoModelForCausalLM.from_pretrained(
+    "microsoft/Florence-2-base", trust_remote_code=True
+)
+model.eval()
+
+cap = cv2.VideoCapture(VIDEO_PATH)
+if not cap.isOpened():
+    # Without this check every read() fails and the scan silently finds nothing.
+    raise SystemExit(f"❌ Cannot open video: {VIDEO_PATH}")
+print(f"🔍 Scanning {len(TIMESTAMPS)} frames for 'stamp'...")
+
+for ts in TIMESTAMPS:
+    cap.set(cv2.CAP_PROP_POS_MSEC, ts * 1000)
+    ret, frame = cap.read()
+    if not ret:
+        continue  # No frame could be read at this position; skip it.
+
+    # OpenCV frames are BGR; convert to RGB before handing the image to PIL.
+    image_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+
+    # Build the inputs once: the task token immediately followed by the phrase.
+    inputs = processor(text=f"{TASK}{TARGET}", images=image_pil, return_tensors="pt")
+
+    with torch.no_grad():
+        generated_ids = model.generate(
+            input_ids=inputs["input_ids"],
+            pixel_values=inputs["pixel_values"],
+            max_new_tokens=512,
+            num_beams=3,
+        )
+
+    # Decode without stripping special tokens, as in the Florence-2 model-card usage.
+    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+
+    # Best-effort per frame: a failure here is reported and the scan moves on.
+    try:
+        parsed = processor.post_process_generation(
+            generated_text,
+            task=TASK,
+            image_size=(image_pil.width, image_pil.height),
+        )
+
+        # The result for this task is a dict of parallel lists ("bboxes",
+        # "bboxes_labels", "polygons", "polygons_labels"), not a list of
+        # per-detection dicts, so the boxes are read from its "bboxes" entry.
+        result = (parsed or {}).get(TASK)
+        bboxes = (result.get("bboxes") or []) if isinstance(result, dict) else []
+        if bboxes:
+            print(f"  📍 Frame {ts}s: Found {len(bboxes)} stamp(s)")
+            frame_h, frame_w = frame.shape[:2]
+            # Draw on a copy so crops never contain earlier boxes or labels.
+            annotated = frame.copy()
+            for i, bbox in enumerate(bboxes):
+                x1, y1, x2, y2 = map(int, bbox)
+                # Clamp to the frame: a negative index would wrap around in the slice.
+                x1, y1 = max(x1, 0), max(y1, 0)
+                x2, y2 = min(x2, frame_w), min(y2, frame_h)
+                crop = frame[y1:y2, x1:x2]
+                if crop.size == 0:
+                    continue
+                crop_path = os.path.join(OUTPUT_DIR, f"stamp_{ts}s_{i}.jpg")
+                cv2.imwrite(crop_path, crop)
+                cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 3)
+                # Keep the label inside the frame when the box touches the top edge.
+                label_origin = (x1, max(y1 - 10, 20))
+                cv2.putText(
+                    annotated, f"stamp {i}", label_origin,
+                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2,
+                )
+            ann_path = os.path.join(OUTPUT_DIR, f"annotated_{ts}s.jpg")
+            cv2.imwrite(ann_path, annotated)
+    except Exception as e:
+        print(f"  ⚠️  Frame {ts}s: Parse error - {e}")
+
+cap.release()
+print(f"\n🏁 Done. Check {OUTPUT_DIR} for results.")