feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
--- a/scripts/test_owl_vit_debug.py
+++ b/scripts/test_owl_vit_debug.py
@@ -0,0 +1,108 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Debug OWL-ViT with Multiple Prompts
+
+Seeks to a fixed list of timestamps in one video, runs OWL-ViT zero-shot
+object detection on each frame with several sets of text prompts, prints
+every detection, and saves an annotated copy of each frame that has at
+least one detection to OUTPUT_DIR.
+"""
+
+import os
+import cv2
+import torch
+from PIL import Image
+from transformers import OwlViTProcessor, OwlViTForObjectDetection
+
+UUID = "384b0ff44aaaa1f1"
+VIDEO_PATH = f"output/{UUID}/{UUID}.mp4"
+OUTPUT_DIR = f"output/{UUID}/owl_vit_results_debug"
+# Score threshold handed to the detection post-processing step.
+SCORE_THRESHOLD = 0.05
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+print("🧠 Loading OWL-ViT model...")
+processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
+model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
+
+cap = cv2.VideoCapture(VIDEO_PATH)
+if not cap.isOpened():
+    # Without this check every read fails and the script ends silently.
+    raise SystemExit(f"❌ Could not open video: {VIDEO_PATH}")
+
+# Frames we want to check (seconds from the start of the video)
+timestamps = [5851.6, 5860.4, 6756.6, 6846.0]
+# Prompt sets to try; each inner list is one group of text queries
+prompts = [
+    ["a postage stamp", "a stamp"],
+    ["a letter", "an envelope", "a piece of paper"],
+    ["a small square paper"],
+]
+
+try:
+    for t in timestamps:
+        # CAP_PROP_POS_MSEC takes milliseconds, timestamps are in seconds.
+        cap.set(cv2.CAP_PROP_POS_MSEC, t * 1000)
+        ret, frame = cap.read()
+        if not ret:
+            print(f"   ⚠️ Could not read frame at {t:.2f}s, skipping")
+            continue
+
+        image_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+        # PIL size is (width, height); post-processing wants (height, width).
+        # It is the same for every prompt set, so build it once per frame.
+        target_sizes = torch.Tensor([image_pil.size[::-1]])
+
+        # Try different prompt sets
+        found_any = False
+        for text_queries in prompts:
+            inputs = processor(
+                text=text_queries, images=image_pil, return_tensors="pt"
+            )
+            # Inference only: skip autograd bookkeeping.
+            with torch.no_grad():
+                outputs = model(**inputs)
+
+            # Post-processing already filters detections by `threshold`.
+            results = processor.post_process_object_detection(
+                outputs=outputs,
+                target_sizes=target_sizes,
+                threshold=SCORE_THRESHOLD,
+            )
+
+            for box, score, label in zip(
+                results[0]["boxes"], results[0]["scores"], results[0]["labels"]
+            ):
+                found_any = True
+                x_min, y_min, x_max, y_max = box.int().tolist()
+                label_text = text_queries[label.item()]
+                confidence = score.item()
+                print(f"   🟢 Found '{label_text}' ({confidence:.3f}) at {t:.2f}s")
+
+                # Draw the box and its caption on the frame that gets saved.
+                cv2.rectangle(
+                    frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2
+                )
+                cv2.putText(
+                    frame,
+                    f"{label_text} {confidence:.3f}",
+                    # Keep the caption inside the frame for boxes at the top.
+                    (x_min, max(y_min - 10, 12)),
+                    cv2.FONT_HERSHEY_SIMPLEX,
+                    0.5,
+                    (0, 255, 0),
+                    1,
+                )
+
+        if not found_any:
+            # Frames without detections are not saved, so nothing is drawn.
+            print(f"   🔴 Nothing found at {t:.2f}s")
+            continue
+
+        # Save result
+        save_path = os.path.join(OUTPUT_DIR, f"detected_{int(t)}.jpg")
+        cv2.imwrite(save_path, frame)
+        print(f"   💾 Saved to {save_path}")
+finally:
+    # Release the capture even if detection fails part-way through.
+    cap.release()