feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
--- a/scripts/utils/test_mediapipe.py
+++ b/scripts/utils/test_mediapipe.py
@@ -0,0 +1,377 @@
+#!/opt/homebrew/bin/python3.11
+"""
+MediaPipe Test Script - Test all MediaPipe modules
+
+Test modules:
+1. Face Mesh (468 keypoints; 478 with iris refinement enabled)
+2. Pose (33 keypoints)
+3. Hands (21 keypoints per hand)
+4. Holistic (Face + Pose + Hands)
+"""
+
+import sys
+import cv2
+import numpy as np
+import mediapipe as mp
+from pathlib import Path
+
+
+def test_face_mesh():
+    """
+    Test MediaPipe Face Mesh (468 keypoints)
+    """
+    print("=" * 60)
+    print("Testing MediaPipe Face Mesh")
+    print("=" * 60)
+    
+    mp_face_mesh = mp.solutions.face_mesh
+    
+    # Create Face Mesh model
+    face_mesh = mp_face_mesh.FaceMesh(
+        static_image_mode=True,
+        max_num_faces=1,
+        refine_landmarks=True,  # Enable iris detection
+        min_detection_confidence=0.5,
+    )
+    
+    print("✅ Face Mesh model created")
+    
+    # Test on sample image
+    test_image_path = "/Users/accusys/momentry_core_0.1/output/quick_preview/frame_220.jpg"
+    
+    if Path(test_image_path).exists():
+        image = cv2.imread(test_image_path)
+        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        
+        results = face_mesh.process(image_rgb)
+        
+        if results.multi_face_landmarks:
+            face_landmarks = results.multi_face_landmarks[0]
+            num_landmarks = len(face_landmarks.landmark)
+            
+            print(f"✅ Face detected: {num_landmarks} landmarks")
+            
+            # Key landmark indices
+            key_indices = {
+                "nose_tip": 1,
+                "left_eye_center": 33,
+                "right_eye_center": 263,
+                "left_iris_center": 468,
+                "right_iris_center": 473,
+                "mouth_top": 13,
+                "mouth_bottom": 14,
+                "mouth_left": 61,
+                "mouth_right": 291,
+            }
+            
+            print("\nKey landmarks:")
+            for name, idx in key_indices.items():
+                if idx < num_landmarks:
+                    landmark = face_landmarks.landmark[idx]
+                    print(f"  {name} ({idx}): x={landmark.x:.3f}, y={landmark.y:.3f}")
+            
+            # Calculate Eye Aspect Ratio (EAR)
+            # Left eye
+            p1 = face_landmarks.landmark[33]  # Left eye top
+            p2 = face_landmarks.landmark[133]  # Left eye bottom
+            p3 = face_landmarks.landmark[159]  # Left eye left
+            p4 = face_landmarks.landmark[145]  # Left eye right
+            
+            vertical_dist = abs(p2.y - p1.y)
+            horizontal_dist = abs(p4.x - p3.x)
+            ear_left = vertical_dist / horizontal_dist if horizontal_dist > 0 else 0
+            
+            print(f"\nEye Aspect Ratio (EAR):")
+            print(f"  Left eye EAR: {ear_left:.3f}")
+            print(f"  Interpretation: {'wide_open' if ear_left > 0.35 else 'normal' if ear_left > 0.2 else 'closed'}")
+            
+            # Calculate Mouth Aspect Ratio (MAR)
+            mouth_top = face_landmarks.landmark[13]
+            mouth_bottom = face_landmarks.landmark[14]
+            mouth_left = face_landmarks.landmark[61]
+            mouth_right = face_landmarks.landmark[291]
+            
+            mouth_height = abs(mouth_bottom.y - mouth_top.y)
+            mouth_width = abs(mouth_right.x - mouth_left.x)
+            mar = mouth_height / mouth_width if mouth_width > 0 else 0
+            
+            print(f"\nMouth Aspect Ratio (MAR):")
+            print(f"  MAR: {mar:.3f}")
+            print(f"  Interpretation: {'open' if mar > 0.5 else 'closed' if mar < 0.2 else 'slightly_open'}")
+        else:
+            print("❌ No face detected")
+    
+    face_mesh.close()
+    print("\n✅ Face Mesh test completed")
+
+
+def test_pose():
+    """
+    Test MediaPipe Pose (33 keypoints)
+    """
+    print("\n" + "=" * 60)
+    print("Testing MediaPipe Pose")
+    print("=" * 60)
+    
+    mp_pose = mp.solutions.pose
+    
+    pose = mp_pose.Pose(
+        static_image_mode=True,
+        model_complexity=2,  # Full model
+        enable_segmentation=False,
+        min_detection_confidence=0.5,
+    )
+    
+    print("✅ Pose model created")
+    
+    test_image_path = "/Users/accusys/momentry_core_0.1/output/quick_preview/frame_220.jpg"
+    
+    if Path(test_image_path).exists():
+        image = cv2.imread(test_image_path)
+        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        
+        results = pose.process(image_rgb)
+        
+        if results.pose_landmarks:
+            landmarks = results.pose_landmarks.landmark
+            num_landmarks = len(landmarks)
+            
+            print(f"✅ Pose detected: {num_landmarks} keypoints")
+            
+            # Key keypoints
+            key_indices = {
+                "nose": 0,
+                "left_shoulder": 11,
+                "right_shoulder": 12,
+                "left_elbow": 13,
+                "right_elbow": 14,
+                "left_wrist": 15,
+                "right_wrist": 16,
+                "left_hip": 23,
+                "right_hip": 24,
+                "left_knee": 25,
+                "right_knee": 26,
+                "left_ankle": 27,
+                "right_ankle": 28,
+            }
+            
+            print("\nKey keypoints:")
+            for name, idx in key_indices.items():
+                landmark = landmarks[idx]
+                print(f"  {name} ({idx}): x={landmark.x:.3f}, y={landmark.y:.3f}, visibility={landmark.visibility:.2f}")
+            
+            # Calculate elbow angles
+            def calculate_angle(p1, p2, p3):
+                v1 = np.array([p1.x, p1.y]) - np.array([p2.x, p2.y])
+                v2 = np.array([p3.x, p3.y]) - np.array([p2.x, p2.y])
+                angle = np.arccos(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
+                return np.degrees(angle)
+            
+            # Right arm angle
+            right_shoulder = landmarks[12]
+            right_elbow = landmarks[14]
+            right_wrist = landmarks[16]
+            
+            right_elbow_angle = calculate_angle(right_shoulder, right_elbow, right_wrist)
+            
+            print(f"\nRight elbow angle: {right_elbow_angle:.1f}°")
+            print(f"  Interpretation: {'extended' if right_elbow_angle > 150 else 'folded' if right_elbow_angle < 90 else 'neutral'}")
+            
+            # Check if arm is raised
+            if right_wrist.y < right_elbow.y < right_shoulder.y:
+                print(f"  Action: raise_right (arm raised)")
+            
+            # Knee angles
+            left_hip = landmarks[23]
+            left_knee = landmarks[25]
+            left_ankle = landmarks[27]
+            
+            left_knee_angle = calculate_angle(left_hip, left_knee, left_ankle)
+            
+            print(f"\nLeft knee angle: {left_knee_angle:.1f}°")
+            print(f"  Interpretation: {'standing' if left_knee_angle > 160 else 'knee_bend' if left_knee_angle < 120 else 'neutral'}")
+        else:
+            print("❌ No pose detected")
+    
+    pose.close()
+    print("\n✅ Pose test completed")
+
+
+def test_hands():
+    """
+    Test MediaPipe Hands (21 keypoints per hand)
+    """
+    print("\n" + "=" * 60)
+    print("Testing MediaPipe Hands")
+    print("=" * 60)
+    
+    mp_hands = mp.solutions.hands
+    
+    hands = mp_hands.Hands(
+        static_image_mode=True,
+        max_num_hands=2,
+        min_detection_confidence=0.5,
+    )
+    
+    print("✅ Hands model created")
+    
+    test_image_path = "/Users/accusys/momentry_core_0.1/output/quick_preview/frame_220.jpg"
+    
+    if Path(test_image_path).exists():
+        image = cv2.imread(test_image_path)
+        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        
+        results = hands.process(image_rgb)
+        
+        if results.multi_hand_landmarks:
+            for idx, hand_landmarks in enumerate(results.multi_hand_landmarks):
+                hand_label = results.multi_handedness[idx].classification[0].label
+                
+                print(f"\n✅ Hand {idx+1} detected ({hand_label}): 21 keypoints")
+                
+                landmarks = hand_landmarks.landmark
+                
+                # Key landmarks
+                key_indices = {
+                    "wrist": 0,
+                    "thumb_tip": 4,
+                    "index_tip": 8,
+                    "middle_tip": 12,
+                    "ring_tip": 16,
+                    "pinky_tip": 20,
+                }
+                
+                print(f"  Key landmarks:")
+                for name, i in key_indices.items():
+                    lm = landmarks[i]
+                    print(f"    {name} ({i}): x={lm.x:.3f}, y={lm.y:.3f}")
+                
+                # Detect gesture
+                thumb_tip = landmarks[4]
+                index_tip = landmarks[8]
+                middle_tip = landmarks[12]
+                ring_tip = landmarks[16]
+                pinky_tip = landmarks[20]
+                wrist = landmarks[0]
+                
+                # Calculate finger extensions
+                def is_finger_extended(tip, base, wrist):
+                    return tip.y < base.y  # Extended upward
+                
+                thumb_extended = is_finger_extended(landmarks[4], landmarks[2], wrist)
+                index_extended = is_finger_extended(landmarks[8], landmarks[5], wrist)
+                middle_extended = is_finger_extended(landmarks[12], landmarks[9], wrist)
+                ring_extended = is_finger_extended(landmarks[16], landmarks[13], wrist)
+                pinky_extended = is_finger_extended(landmarks[20], landmarks[17], wrist)
+                
+                extensions = [thumb_extended, index_extended, middle_extended, ring_extended, pinky_extended]
+                
+                print(f"\n  Finger extensions: {['thumb', 'index', 'middle', 'ring', 'pinky']}")
+                print(f"    {extensions}")
+                
+                # Detect gesture
+                gesture = "unknown"
+                if all(extensions):
+                    gesture = "open_hand"
+                elif not any(extensions):
+                    gesture = "fist"
+                elif thumb_extended and not any(extensions[1:]):
+                    gesture = "thumbs_up"
+                elif index_extended and middle_extended and not any(extensions[2:]):
+                    gesture = "peace_sign"
+                elif index_extended and not any(extensions[2:]) and not thumb_extended:
+                    gesture = "pointing"
+                
+                print(f"  Detected gesture: {gesture}")
+        else:
+            print("❌ No hands detected")
+    
+    hands.close()
+    print("\n✅ Hands test completed")
+
+
+def test_holistic():
+    """
+    Test MediaPipe Holistic (Face + Pose + Hands combined)
+    """
+    print("\n" + "=" * 60)
+    print("Testing MediaPipe Holistic")
+    print("=" * 60)
+    
+    mp_holistic = mp.solutions.holistic
+    
+    holistic = mp_holistic.Holistic(
+        static_image_mode=True,
+        model_complexity=2,
+        enable_segmentation=False,
+        refine_face_landmarks=True,
+    )
+    
+    print("✅ Holistic model created")
+    
+    test_image_path = "/Users/accusys/momentry_core_0.1/output/quick_preview/frame_220.jpg"
+    
+    if Path(test_image_path).exists():
+        image = cv2.imread(test_image_path)
+        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        
+        results = holistic.process(image_rgb)
+        
+        detected_count = 0
+        
+        if results.face_landmarks:
+            num_face = len(results.face_landmarks.landmark)
+            print(f"✅ Face: {num_face} landmarks")
+            detected_count += 1
+        
+        if results.pose_landmarks:
+            num_pose = len(results.pose_landmarks.landmark)
+            print(f"✅ Pose: {num_pose} keypoints")
+            detected_count += 1
+        
+        if results.left_hand_landmarks:
+            num_left_hand = len(results.left_hand_landmarks.landmark)
+            print(f"✅ Left hand: {num_left_hand} keypoints")
+            detected_count += 1
+        
+        if results.right_hand_landmarks:
+            num_right_hand = len(results.right_hand_landmarks.landmark)
+            print(f"✅ Right hand: {num_right_hand} keypoints")
+            detected_count += 1
+        
+        if detected_count == 0:
+            print("❌ No landmarks detected")
+        else:
+            print(f"\nTotal detections: {detected_count} components")
+    
+    holistic.close()
+    print("\n✅ Holistic test completed")
+
+
+def main():
+    print("=" * 70)
+    print("MediaPipe Installation Test")
+    print("=" * 70)
+    
+    print(f"\nMediaPipe version: {mp.__version__}")
+    print()
+    
+    # Test all modules
+    test_face_mesh()
+    test_pose()
+    test_hands()
+    test_holistic()
+    
+    print("\n" + "=" * 70)
+    print("✅ All MediaPipe tests completed!")
+    print("=" * 70)
+    
+    print("\nNext steps:")
+    print("  1. Face Mesh: Use for eye/mouth action detection")
+    print("  2. Pose: Use for arm/leg/feet action detection")
+    print("  3. Hands: Use for hand gesture detection")
+    print("  4. Holistic: Use for full-body action detection")
+
+
+# Call main() only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()